# Vamos construir nosso modelo de previsão baseando-se em um LSTM AutoEncoder

- Autoencoders are a type of self-supervised learning model that can learn a compressed representation of input data.
- LSTM Autoencoders can learn a compressed representation of sequence data and have been used on video, text, audio, and time series sequence data.

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

from keras.models import Sequential
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from keras.callbacks import EarlyStopping, LearningRateScheduler

In [2]:
df = pd.read_csv('./bases/treino.csv')
df.shape

(4735, 20)

In [56]:
x_cols = ['Past_1_Days_Close',
'Past_2_Days_Close',
'Past_3_Days_Close',
'Past_4_Days_Close',
'Past_5_Days_Close',
'Past_6_Days_Close',
'Past_7_Days_Close',
'Past_8_Days_Close',
'Past_9_Days_Close',
'Past_10_Days_Close',
'Past_11_Days_Close',
'Past_12_Days_Close',
'Past_13_Days_Close',
'Past_14_Days_Close']

y_cols = ['Past_2_Days_Close',
'Past_3_Days_Close',
'Past_4_Days_Close',
'Past_5_Days_Close',
'Past_6_Days_Close',
'Past_7_Days_Close',
'Past_8_Days_Close',
'Past_9_Days_Close',
'Past_10_Days_Close',
'Past_11_Days_Close',
'Past_12_Days_Close',
'Past_13_Days_Close',
'Past_14_Days_Close',
'Past_15_Days_Close']

df[x_cols].values[0]

array([1.08897853, 1.07980466, 1.08635736, 1.09815192, 1.06932187,
       1.0614593 , 1.06276929, 1.0811162 , 1.06670117, 1.0811162 ,
       1.0247668 , 1.04835486, 1.0339402 , 1.02345657])

In [62]:
X_temp, X_test, y_temp, y_test = train_test_split(df[x_cols], df[y_cols], shuffle = False, test_size=0.1)
X_train, X_validation, y_train, y_validation = train_test_split(X_temp, y_temp, shuffle = False, train_size=0.8)

In [63]:
features = X_train.shape[1]

scaler_rate = MinMaxScaler(feature_range=(-1,1))
scaler_rate.fit(X_train)

X_train_scaled = scaler_rate.transform(X_train).reshape(-1, features, 1)
X_validation_scaled = scaler_rate.transform(X_validation).reshape(-1, features, 1)
X_test_scaled = scaler_rate.transform(X_test).reshape(-1, features, 1)

scaler_trend = MinMaxScaler(feature_range=(-1,1))
scaler_trend.fit(y_train)

y_train_scaled = scaler_trend.transform(y_train).reshape(-1, features, 1)
y_validation_scaled = scaler_trend.transform(y_validation).reshape(-1, features, 1)
y_test_scaled = scaler_trend.transform(y_test).reshape(-1, features, 1)

In [64]:
print('Train', X_train_scaled.shape)
print('Validation', X_validation_scaled.shape)
print('Test', X_test_scaled.shape)

Train (3408, 14, 1)
Validation (853, 14, 1)
Test (474, 14, 1)


In [67]:
model = Sequential()
model.add(LSTM(100, activation='relu', input_shape=(features, 1)))
model.add(RepeatVector(features))
model.add(LSTM(100, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(1)))
model.compile(optimizer='adam', loss='mse')

In [68]:
es = EarlyStopping(patience=10, verbose=0, min_delta=0.001, monitor='val_loss', mode='auto', restore_best_weights=True)
red_lr = LearningRateScheduler(lambda x: 1e-3 * 0.90 ** x)

model.fit(
    x=X_train_scaled, 
    y=y_train_scaled, 
    validation_data=(X_validation_scaled, y_validation_scaled),
    batch_size=1,
    shuffle=False,
    epochs=300, 
    verbose=1,
    callbacks=[es, red_lr]
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300


<keras.src.callbacks.History at 0x7fb4cf022650>

In [69]:
yhat = model.predict(X_test_scaled)
yhat.shape



(474, 14, 1)

In [70]:
df_yhat = pd.DataFrame(yhat.reshape(yhat.shape[0], yhat.shape[1]))
df_yhat_unscaled = pd.DataFrame(scaler_trend.inverse_transform(df_yhat))
df_yhat_unscaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,18.059635,16.43409,17.89006,18.077581,18.566059,18.906126,19.262169,19.509714,19.836903,19.766409,19.452061,19.202967,18.440727,17.007618
1,18.138468,16.440966,18.026663,18.238346,18.814768,19.224266,19.681057,20.046896,20.492931,20.461088,19.950544,19.13657,17.527023,15.430189
2,18.222895,16.457323,18.130072,18.357611,19.001923,19.481209,20.036549,20.524389,21.095318,21.068941,20.304949,18.939095,16.705061,14.348816
3,18.174421,16.412159,18.144098,18.376959,19.064091,19.58902,20.219782,20.815521,21.543682,21.649441,20.888117,19.331097,16.721163,14.026813
4,18.171404,16.385445,18.210249,18.46232,19.225727,19.843809,20.624874,21.443295,22.440388,22.700556,21.790695,19.643763,16.283575,13.26434


In [71]:
print('RMSE', mean_squared_error(y_test, df_yhat_unscaled))
print('MSE', mean_absolute_error(y_test, df_yhat_unscaled))
print('MSE Percentage', mean_absolute_percentage_error(y_test, df_yhat_unscaled))

RMSE 116.48848208348022
MSE 6.644754375838026
MSE Percentage 0.24161487295096862


In [86]:
df_yhat_unscaled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0
mean,21.926922,17.437611,20.848406,22.582226,25.43947,28.307318,29.168314,30.377663,29.637938,29.753546,22.283985,22.169481,12.181756,17.798859
std,6.048754,2.017956,4.590934,7.200738,10.703225,13.447351,12.647903,13.198005,10.957108,12.412968,9.666454,10.341459,20.29604,8.848428
min,13.541565,13.772452,14.561196,15.087702,15.603121,16.151459,16.67864,16.857632,17.089212,16.965515,-12.297239,9.457033,-92.449364,-44.677986
25%,17.614208,16.191124,17.565979,17.712727,18.229668,18.569991,18.95393,19.198688,19.56722,19.551832,18.345445,17.12696,11.777218,11.870637
50%,18.774663,16.528139,18.751877,18.956655,19.970723,20.954962,22.277595,23.640052,25.282697,25.360147,20.53816,18.944288,16.706179,16.764525
75%,25.931201,18.329339,23.646703,26.315183,30.408148,37.35649,44.923018,42.605172,41.506823,36.259497,26.083419,22.863872,19.205157,20.806693
max,36.597355,23.326645,32.721523,42.924133,57.333916,57.961208,54.079441,59.044853,50.82325,66.596855,48.254383,65.12056,53.729111,57.624016


In [87]:
y_test.describe()

Unnamed: 0,Past_2_Days_Close,Past_3_Days_Close,Past_4_Days_Close,Past_5_Days_Close,Past_6_Days_Close,Past_7_Days_Close,Past_8_Days_Close,Past_9_Days_Close,Past_10_Days_Close,Past_11_Days_Close,Past_12_Days_Close,Past_13_Days_Close,Past_14_Days_Close,Past_15_Days_Close
count,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0,474.0
mean,25.58064,25.54907,25.515396,25.479232,25.44183,25.4035,25.364597,25.324694,25.287767,25.24992,25.212526,25.180304,25.148545,25.115684
std,5.733458,5.709793,5.682459,5.650912,5.614125,5.575336,5.538773,5.500246,5.463428,5.425122,5.394213,5.367719,5.339752,5.308964
min,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224,17.470224
25%,21.399224,21.399224,21.399224,21.399224,21.399224,21.399224,21.379377,21.375377,21.375377,21.375377,21.373235,21.373235,21.373235,21.373235
50%,23.657326,23.656199,23.643619,23.606599,23.582159,23.578593,23.575027,23.557829,23.52638,23.512129,23.501442,23.462256,23.430195,23.418983
75%,29.04135,28.945153,28.906621,28.833179,28.780056,28.735697,28.689598,28.625674,28.563313,28.549046,28.519201,28.438052,28.400723,28.378229
max,40.156639,40.156639,40.156639,40.156639,40.156639,40.119583,40.119583,40.032852,40.032852,40.032852,40.032852,40.032852,40.032852,40.032852


In [None]:
plt.figure(figsize=(15,4), dpi=100)
#plt.plot(dates, X_test[15].values, label='Real')
plt.plot(dates, y_test[frame_size].values, label='HP Filter')
plt.plot(dates, df_yhat_unscaled[frame_size].values, label='LSTM Predict')
plt.gca().set(title=f'BNB rate from {pd.to_datetime(dates[0]).year}/{pd.to_datetime(dates[0]).month} to {pd.to_datetime(dates[-1]).year}/{pd.to_datetime(dates[-1]).month}', xlabel='Date', ylabel='Rate in dolar')
plt.xlabel("Position")
plt.ylabel("Value ($)")
plt.grid(True)
plt.legend()
plt.show()

# Classificador

In [92]:
X = df[y_cols]
y = df['Label']

In [93]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=42)
model.fit(X, y)

In [96]:
model.predict(df_yhat_unscaled.iloc[0])



ValueError: Expected 2D array, got 1D array instead:
array=[18.059635 16.43409  17.89006  18.077581 18.56606  18.906126 19.262169
 19.509714 19.836903 19.766409 19.45206  19.202967 18.440727 17.007618].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.