In [None]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta
import sklearn.metrics as m
import seaborn as sns
%matplotlib inline

In [None]:
# Load the price history and derive the model columns in a single chained step.
frame = pd.read_parquet('D:\\Backup Daniel\\projetos\\bolsa\\historicos\\dados.parquet')
frame = frame.assign(
    # gap: this row's open relative to the previous row's close, minus 1
    gap_petr=(frame['abertura_petr'] / frame['fechamento_petr'].shift(1)) - 1,
    gap_petroleo=(frame['abertura_petroleo'] / frame['fechamento_petroleo'].shift(1)) - 1,
    # opening values taken from the following row
    valor_abertura_petr_futuro=frame['abertura_petr'].shift(-1),
    valor_abertura_petroleo_futuro=frame['abertura_petroleo'].shift(-1),
    # value to predict: the following row's close
    target=frame['fechamento_petr'].shift(-1),
).dropna()  # drops every row containing NaN, including the edge rows emptied by the shifts
frame.head()

In [None]:
def print_report(y_test, y_pred):
    """Print regression metrics and plot the absolute error against the real value.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predictions, aligned position-by-position with ``y_test``.

    Prints the mean squared error, R2 and median absolute error, then the
    ``describe()`` summary of the per-sample absolute error, and draws a
    scatter plot of that error (y axis) against the real value (x axis).
    Returns nothing.
    """
    print('MSE: ', m.mean_squared_error(y_test, y_pred))
    print('R2: ', m.r2_score(y_test, y_pred))
    # median_absolute_error is the median of |error|, not the mean, so it is
    # labelled MedAE to avoid being read as mean absolute error.
    print('MedAE: ', m.median_absolute_error(y_test, y_pred))
    print('************Descricao Erro:')
    # Name the columns explicitly so 'real' really holds y_test and
    # 'predicao' really holds y_pred.
    frame_final = pd.DataFrame({
        'real': np.asarray(y_test).ravel(),
        'predicao': np.asarray(y_pred).ravel(),
    })
    frame_final['erro'] = (frame_final['real'] - frame_final['predicao']).abs()
    print(frame_final.erro.describe())
    frame_final.plot(kind='scatter', x='real', y='erro')

<b><center>Train Test Split</center></b>

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns: the raw price columns of both series, the next-row
# opening values and the two gap features.
feature_columns = [
    'fechamento_petr', 'abertura_petr', 'maxima_petr', 'minima_petr',
    'fechamento_petroleo', 'abertura_petroleo', 'maxima_petroleo', 'minima_petroleo',
    'valor_abertura_petr_futuro', 'valor_abertura_petroleo_futuro', 'gap_petr', 'gap_petroleo',
]
X = frame[feature_columns]
y = frame['target']

# Hold out 20% of the rows with a fixed seed.
# NOTE(review): train_test_split shuffles by default; if the rows are
# time-ordered this mixes later rows into the training set - confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

<b><center>Standard Scaler</center></b>

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaling statistics on the training rows only, then apply the same
# fitted scaler to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

<b><center>Feature Selection</center></b>

In [None]:
from sklearn.feature_selection import RFECV
# LinearRegression is first used in this cell, so it has to be imported here
# for the notebook to run top to bottom on a fresh kernel.
from sklearn.linear_model import LinearRegression

# Recursive feature elimination with cross-validation, using a plain linear
# regression as the estimator.
model = LinearRegression()
rfecv = RFECV(model, n_jobs=-1)
rfecv.fit(X_train, y_train)

# Show, for every candidate column, whether the selector kept it.
for k in zip(X.columns, rfecv.support_):
    print(k)

In [None]:
# Reduce both partitions to the columns kept by the fitted selector.
X_train, X_test = rfecv.transform(X_train), rfecv.transform(X_test)

<b><center>Linear Regression</center></b>

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear model on the training partition, predict the held-out rows
# and print the evaluation report.
lr = LinearRegression(n_jobs=-1).fit(X_train, y_train)
y_pred = lr.predict(X_test)
print_report(y_test, y_pred)

In [None]:
# print_report keeps its DataFrame local to the function, so build the
# real-vs-predicted table for the linear model here; the plots below read it.
frame_final = pd.DataFrame({
    'real': np.asarray(y_test),
    'predicao': lr.predict(X_test),
})
frame_final.plot(kind='scatter', y='predicao', x='real', figsize=(7,5))

In [None]:
# Joint plot of the real values against the predictions.
g = sns.jointplot(data=frame_final, x="real", y="predicao", height=7)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state is pinned (same seed as the train/test split) so the forest,
# and therefore the printed report, is reproducible across runs.
rfr = RandomForestRegressor(
    n_estimators=1000,
    n_jobs=-1,
    random_state=5,
)

rfr.fit(X_train, y_train)

y_pred = rfr.predict(X_test)

print_report(y_test, y_pred)

<b><center>Teste em 2020</center></b>

In [None]:
frame_2020 = pd.read_parquet('D:\\Backup Daniel\\projetos\\bolsa\\historicos\\dados_2020.parquet')
# Order by date BEFORE deriving the shifted columns: shift() works on row
# position, so sorting afterwards would leave gaps/targets paired with
# whatever rows happened to be adjacent in the file.
frame_2020 = frame_2020.sort_values(by='data', ascending=True)
# gap: this row's open relative to the previous row's close, minus 1
frame_2020['gap_petr'] = (frame_2020['abertura_petr']/frame_2020['fechamento_petr'].shift(1))-1
frame_2020['gap_petroleo'] = (frame_2020['abertura_petroleo']/frame_2020['fechamento_petroleo'].shift(1))-1
# opening values taken from the following row
frame_2020['valor_abertura_petr_futuro'] = frame_2020['abertura_petr'].shift(-1)
frame_2020['valor_abertura_petroleo_futuro'] = frame_2020['abertura_petroleo'].shift(-1)
# value to predict: the following row's close
frame_2020['target'] = frame_2020['fechamento_petr'].shift(-1)
# drops every row containing NaN, including the edge rows emptied by the shifts
frame_2020 = frame_2020.dropna()
frame_2020.head()

In [None]:
# Use exactly the columns (and column order) the scaler was fitted on,
# including the two gap features.
X_2020 = frame_2020[[
    'fechamento_petr', 'abertura_petr', 'maxima_petr', 'minima_petr',
    'fechamento_petroleo', 'abertura_petroleo', 'maxima_petroleo', 'minima_petroleo',
    'valor_abertura_petr_futuro', 'valor_abertura_petroleo_futuro', 'gap_petr', 'gap_petroleo'
]]

y_2020 = frame_2020['target']

# Same preprocessing chain as the training data: scale, then keep only the
# RFECV-selected columns, because `lr` was fitted on that reduced matrix.
X_2020 = rfecv.transform(scaler.transform(X_2020))
y_2020_pred = lr.predict(X_2020)

In [None]:
# One (date, predicted, real) tuple per 2020 row. to_numpy() replaces
# Series.ravel(), which is deprecated in recent pandas releases.
dados = list(zip(frame_2020['data'].values, y_2020_pred, y_2020.to_numpy()))

In [None]:
# Name the columns at construction time instead of renaming 0/1/2 afterwards.
frame_final = pd.DataFrame(dados, columns=['data', 'predito', 'real'])
frame_final['erro'] = (frame_final['real'] - frame_final['predito']).abs()
# Preview goes last so the cell actually displays it, with the final columns.
frame_final.head()

In [None]:
# The 2020 frame names its prediction column 'predito' (not 'predicao').
frame_final.plot(y=['real', 'predito'], kind='line', figsize=(15,5))

In [None]:
# Draw both series on the same axes, labelled so the legend tells them apart.
ax = sns.lineplot(x='data', y="predito", data=frame_final, label='predito')
ax1 = sns.lineplot(x='data', y="real", data=frame_final, label='real', ax=ax)
# lineplot's `sizes` controls the line-width range of the `size` semantic, not
# the figure dimensions, so set the 15x5 size on the figure itself.
ax.figure.set_size_inches(15, 5)