In [None]:
!pip install ta

import ta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Random Forest

## Data Extraction

In [None]:
# Seed NumPy's global RNG so any NumPy-based randomness below is repeatable.
np.random.seed(1111)

# Load the dataset from a CSV in the working directory.
# NOTE(review): later cells read the columns Date, Open, High, Low, Close,
# Volume, Output and Return - confirm the file provides all of them.
df = pd.read_csv('Database.csv')
df

In [None]:
# Parse the Date column; values that cannot be parsed become NaT instead of raising.
dataDates = pd.to_datetime(df['Date'], errors='coerce')

# Calendar year of each row; later cells split train/test on this column at 2022.
df["year"] =  dataDates.dt.year
df

## Feature Engineering

### Time Series Feature



In [None]:
# Column shortcuts reused by the feature-engineering cells below.
# NOTE(review): `open` shadows Python's built-in open() for the rest of the notebook.
open, high, low, close, volume = (
    df[column] for column in ("Open", "High", "Low", "Close", "Volume")
)

In [None]:
# Close Lag Feature: the previous row's close
df['Close_Lag_1'] = close.shift(1)

# Volume difference: row-over-row change in volume
df['Volume_diff'] = volume.diff()

# Volume Weighted Average Price, accumulated from the first row onwards.
# The close is used as the price of each row.
df['vwap'] = (close * volume).cumsum() / volume.cumsum()

# Volume Percentage Change: fractional change versus the previous row
df['volume_percentage_change'] = volume.pct_change()

# Daily Return: close-to-close change versus the previous row, in percent
df['daily_Return'] = close.pct_change() * 100

In [None]:
# Window lengths (in rows) for the lagged / rolling / expanding Close features.
interators = [2, 3, 4, 5, 10]

for interator in interators:

    # Work on the Close series only: every feature below is derived from Close,
    # so aggregating the whole frame (which also holds the non-numeric Date
    # column) is unnecessary.
    rolling_close = close.rolling(interator)
    expanding_close = close.expanding(interator)  # min_periods = interator

    # Lag Feature
    df[f"Close_Lag_{interator}"] = close.shift(interator)

    # Rolling Avg
    rolling_mean = rolling_close.mean()
    df[f"Close_Rolling_Mean_{interator}"] = rolling_mean

    # Close Ratio: close relative to its rolling mean
    df[f"Close_Ratio_{interator}"] = close / rolling_mean

    # Rolling Standard Deviation
    df[f"Close_Rolling_Std_{interator}"] = rolling_close.std()

    # Expanding Avg
    df[f"Close_Expanding_Mean_{interator}"] = expanding_close.mean()

    # Expanding Standard Deviation.
    # Stored under its own "_Std_" name; it previously reused the "_Mean_" name
    # and silently overwrote the expanding mean computed just above.
    df[f"Close_Expanding_Std_{interator}"] = expanding_close.std()

    # Exponential Moving Avg
    df[f"Exponential_Moving_Avg_{interator}"] = close.ewm(span=interator, adjust=False).mean()

    # Simple Moving Average (same values as the rolling mean; kept as a separate column)
    df[f"sma_{interator}"] = rolling_mean

In [None]:
# Treat +/-inf (e.g. produced by a division by zero) as missing, then drop every
# row that has a missing value in any column. This removes the leading rows
# whose lag / rolling features are not defined yet.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()
df.head()

### Technical Indicator Feature

In [None]:
# Moving Average Convergence Divergence (MACD): output of ta's macd_diff with the library's default windows
df['MACD'] = ta.trend.macd_diff(df['Close'])

# Relative Strength Index (RSI) with 5- and 14-row windows
df['RSI_5'] = ta.momentum.rsi(df['Close'], window=5)
df['RSI_14'] = ta.momentum.rsi(df['Close'], window=14)

# NOTE: close / high / low / volume below are the Series captured before rows
# were dropped from df; assigning them to df columns aligns them on the index.

# Money Flow Multiplier: [(Close - Low) - (High - Close)] /(High - Low)
# Rows where High == Low divide by zero; the cleanup at the end of this cell removes them.
money_flow_mult = ((close - low) - (high - close)) /(high - low)
df['money_flow_mult'] = money_flow_mult

# Money Flow Volume: Money Flow Multiplier x Volume for the Period
money_flow_volume = money_flow_mult * volume
df['money_flow_volume'] = money_flow_volume

# ADL(ADI): Previous ADL + Current Period's Money Flow Volume
adi = money_flow_volume.cumsum()
df['adi'] = adi

# Chaikin Money Flow: 20-row sum of money flow volume divided by 20-row sum of volume
cmf = money_flow_volume.rolling(20).sum() / volume.rolling(20).sum()
df['chaikin_money_flow'] = cmf

# Drop the rows for which any indicator is still undefined (NaN) or infinite.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

df.head()

## Split Dataset

In [None]:
# Chronological hold-out: rows dated before 2022 train the model,
# rows from 2022 onwards are held out for testing.
is_train = df['year'] < 2022
is_test = df['year'] >= 2022

# Columns that are targets or identifiers rather than model inputs.
non_feature_columns = ['Output', 'Return', 'Date']

y_test_aux = df.loc[is_test, ['Date', 'Output']].reset_index()

X_train = df.loc[is_train].drop(columns=non_feature_columns)
y_train = df.loc[is_train, 'Output']

X_test = df.loc[is_test].drop(columns=non_feature_columns)
y_test = df.loc[is_test, 'Output']

## Holdout

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random-forest classifier (100 trees, depth capped at 5, fixed seed) on the training set
classifier = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1111)
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)


## Evaluate

In [None]:
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

def evaluate_classification(y_test, y_pred):
  """Plot the confusion matrix and print accuracy, precision, recall and AUC.

  y_test holds the true labels and y_pred the predicted labels. All scores are
  computed from the predicted labels. Nothing is returned.
  """
  ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred)).plot();

  # Compute every score first, then report them in a fixed order.
  scores = {
      "Accuracy:": accuracy_score(y_test, y_pred),
      "Precision:": precision_score(y_test, y_pred),
      "Recall:": recall_score(y_test, y_pred),
      "AUC:": roc_auc_score(y_test, y_pred),
  }

  for label, value in scores.items():
    print(label, value)

In [None]:
# Score the hold-out random-forest predictions against the true test labels.
evaluate_classification(y_test, y_pred)

## Generate Output

In [None]:
def generate_output(y_test, y_pred, filename='output.csv'):
  """Write the test-period predictions to a CSV file and return them.

  The dates are the 'Date' values of the notebook-level ``df`` rows with
  year >= 2022; ``y_pred`` is paired with them by position.

  Parameters:
    y_test: unused; kept so existing calls keep working.
    y_pred: predictions, one per test-period row of ``df``.
    filename: destination CSV path. Defaults to 'output.csv', so calls that
      do not pass it keep overwriting the same file.

  Returns:
    DataFrame with the columns 'Date' (formatted as YYYYMMDD) and 'Prediction'.

  Raises:
    ValueError: if the number of predictions differs from the number of
      test-period rows (this used to produce silently NaN-padded rows).
  """
  date = df.loc[df['year'] >= 2022, 'Date'].reset_index(drop=True)

  # Flatten to a plain array so the pairing is positional even when a
  # pandas object with its own index is passed in.
  values = np.asarray(y_pred).ravel()
  if len(values) != len(date):
    raise ValueError(
        f"got {len(values)} predictions for {len(date)} test-period rows")

  date = pd.DataFrame(pd.to_datetime(date).dt.strftime('%Y%m%d'))
  pred = pd.DataFrame(values, columns=['Prediction'])

  output = pd.concat([date, pred], axis=1)
  output.to_csv(filename, index=False)

  return output

In [None]:
# Export the hold-out random-forest predictions (written to output.csv).
generate_output(y_test,y_pred)

## Time Series Split

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

def expanding_window(model, X_train, y_train, window_size, initial_train_size=None):
    """Walk-forward evaluation with an expanding training window.

    At every fold the model is refit on all rows before the fold's first test
    row and then predicts the next ``window_size`` rows (the last fold may be
    shorter).

    Parameters:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: full feature matrix in chronological order (DataFrame or array).
        y_train: full target vector, aligned with ``X_train`` by position.
        window_size: number of rows predicted per fold; must be >= 1.
        initial_train_size: number of leading rows used to train the first
            fold. When omitted, it is the number of rows of the notebook-level
            ``df`` dated before 2022 (the previous, hard-coded behaviour).

    Returns:
        Tuple ``(predictions, actuals)`` of NumPy arrays covering every row
        from ``initial_train_size`` to the end of the data.

    Raises:
        ValueError: on a non-positive ``window_size`` or ``initial_train_size``,
            or when ``X_train`` and ``y_train`` differ in length.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Fold boundaries are derived from the data actually passed in, not from
    # the global frame.
    n_samples = len(X_train)
    if len(y_train) != n_samples:
        raise ValueError(
            f"X_train has {n_samples} rows but y_train has {len(y_train)}")

    if initial_train_size is None:
        # Backward-compatible default: train the first fold on the pre-2022 rows.
        initial_train_size = int((df['year'] < 2022).sum())
    if initial_train_size < 1:
        raise ValueError("initial_train_size must be a positive integer")

    pred = []
    actuals = []

    test_starts = range(initial_train_size, n_samples, window_size)
    for fold, test_start in enumerate(test_starts):
        test_stop = test_start + window_size

        # Plain slices keep this working for pandas objects and NumPy arrays alike.
        X_train_window, X_test_window = X_train[:test_start], X_train[test_start:test_stop]
        y_train_window, y_test_window = y_train[:test_start], y_train[test_start:test_stop]

        print(f"Fold: {fold}")

        model.fit(X_train_window, y_train_window)
        y_pred_window = model.predict(X_test_window)

        pred.extend(y_pred_window)
        actuals.extend(y_test_window)

    return np.array(pred), np.array(actuals)


In [None]:
# Full-history class target and feature matrix for the expanding-window runs below.
y = df['Output']
X = df.drop(['Output', 'Return', 'Date'], axis="columns")

### Time Series Split (22 Days)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Default-hyperparameter forest with a fixed seed; it is refit at every fold and each fold predicts up to 22 rows.
classifier = RandomForestClassifier(random_state=1111)
y_pred_window, y_test_window = expanding_window(classifier, X, y, window_size=22)

In [None]:
# Score the 22-row expanding-window predictions against the labels collected fold by fold.
evaluate_classification(y_test_window, y_pred_window)

In [None]:
# Export the 22-row expanding-window predictions (written to output.csv, replacing any earlier export).
generate_output(y_test_window, y_pred_window)

### Time Series Split (5 Days)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same setup as the 22-row run, but each fold predicts up to 5 rows before the model is refit.
classifier = RandomForestClassifier(random_state=1111)
y_pred_window, y_test_window = expanding_window(classifier, X, y, window_size=5)

In [None]:
# Score the 5-row expanding-window predictions.
evaluate_classification(y_test_window, y_pred_window)

In [None]:
# Export the 5-row expanding-window predictions (written to output.csv, replacing any earlier export).
generate_output(y_test_window, y_pred_window)

### Time Series Split (1 Day)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# One-row folds: the forest is refit before every single test row, so this is the slowest of the three runs.
classifier = RandomForestClassifier(random_state=1111)
y_pred_window, y_test_window = expanding_window(classifier, X, y, window_size=1)

In [None]:
# Score the 1-row expanding-window predictions.
evaluate_classification(y_test_window, y_pred_window)

In [None]:
# Export the 1-row expanding-window predictions (written to output.csv, replacing any earlier export).
generate_output(y_test_window, y_pred_window)

## Baseline Classification

In [None]:
# Re-create the 2022 hold-out split for the baseline classifier.
# Unlike the model splits, 'Date' is not dropped from the feature frames here.
y_train = df.loc[df['year'] < 2022, 'Output']
X_train = df.loc[df['year'] < 2022].drop(['Output', 'Return'], axis="columns")

y_test = df.loc[df['year'] >= 2022, 'Output']
X_test = df.loc[df['year'] >= 2022].drop(['Output', 'Return'], axis="columns")

In [None]:
# Start the baseline frame from the true test labels and show how often each class occurs.
y_pred_base_classifier = pd.DataFrame(y_test)
y_pred_base_classifier.value_counts()

In [None]:
# Constant baseline: predict class 1 for every test row.
y_pred_base_classifier['Output'] = 1
y_pred_base_classifier

In [None]:
# Score the constant baseline so the model results have a reference point.
evaluate_classification(y_test, y_pred_base_classifier)

# MLP

In [None]:
# Hold-out split for the regression task: same 2022 cut as before, but the target is 'Return'.
y_test_aux = df.loc[df['year'] >= 2022, ['Date', 'Output']].reset_index()

y_train = df.loc[df['year'] < 2022, 'Return']
X_train = df.loc[df['year']  < 2022].drop(['Output', 'Return', 'Date'], axis="columns")

y_test = df.loc[df['year'] >= 2022, 'Return']
X_test = df.loc[df['year'] >= 2022].drop(['Output', 'Return', 'Date'], axis="columns")

## Holdout

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_regressor(y_test, y_pred):
  """Print the RMSE and the MAE of the predictions y_pred against the true values y_test.

  Nothing is returned.
  """
  # RMSE is the square root of the mean squared error.
  squared_error_mean = mean_squared_error(y_test, y_pred)
  print('RMSE = ', np.sqrt(squared_error_mean))

  print("MAE = ", mean_absolute_error(y_test, y_pred))

In [None]:
# Feature Scaling: the scaler is fitted on the training rows only and the same
# transform is then applied to the test rows. Both names are rebound to the
# scaled arrays, so this cell should not be run twice in a row.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.neural_network import MLPRegressor

# Fit an MLP regressor (library defaults, fixed seed) on the training set
regressor = MLPRegressor(random_state=1111)
regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = regressor.predict(X_test)

In [None]:
# Evaluate the hold-out MLP predictions
evaluate_regressor(y_test, y_pred)

In [None]:
# Export the hold-out MLP predictions (written to output.csv, replacing any earlier export).
generate_output(y_test, y_pred)

## Time Series Split

In [None]:
# Full-history regression target and feature matrix for the expanding-window MLP runs below.
y = df['Return']
X = df.drop(['Output', 'Return', 'Date'], axis="columns")

### Time Series Split (22 Days)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the pre-2022 rows only. Fitting it on every row would leak
# the mean/variance of the test period (2022 onwards) into the features the
# walk-forward models are trained on.
train_rows = (df['year'] < 2022).to_numpy()

sc = StandardScaler()
sc.fit(X[train_rows])

# Apply the training-period scaling to the whole history.
X = sc.transform(X)

In [None]:
from sklearn.neural_network import MLPRegressor

# One hidden layer with a single unit, written as a real one-element tuple
# ("(1)" is just the integer 1). Refit at every fold; each fold predicts up to 22 rows.
regressor = MLPRegressor(hidden_layer_sizes=(1,), max_iter=100, random_state=1111)
y_pred_window, y_test_window = expanding_window(regressor, X, y, window_size=22)

In [None]:
# RMSE / MAE of the 22-row expanding-window MLP predictions.
evaluate_regressor(y_test_window, y_pred_window)

In [None]:
# Export the 22-row expanding-window MLP predictions (written to output.csv, replacing any earlier export).
generate_output(y_test_window, y_pred_window)

### Time Series Split (5 Days)

In [None]:
from sklearn.neural_network import MLPRegressor

# NOTE(review): this run uses the library-default MLP, whereas the 22-row run
# uses a single hidden unit and max_iter=100 - confirm the difference is intended.
regressor = MLPRegressor(random_state=1111)
y_pred_window, y_test_window = expanding_window(regressor, X, y, window_size=5)

In [None]:
# RMSE / MAE of the 5-row expanding-window MLP predictions.
evaluate_regressor(y_test_window, y_pred_window)

In [None]:
# Export the 5-row expanding-window MLP predictions (written to output.csv, replacing any earlier export).
generate_output(y_test_window, y_pred_window)

### Time Series Split (1 Day)

In [None]:
from sklearn.neural_network import MLPRegressor

# One-row folds: the library-default MLP is refit before every single test row.
regressor = MLPRegressor(random_state=1111)
y_pred_window, y_test_window = expanding_window(regressor, X, y, window_size=1)

In [None]:
# RMSE / MAE of the 1-row expanding-window MLP predictions.
evaluate_regressor(y_test_window, y_pred_window)

In [None]:
# Export the 1-row expanding-window MLP predictions (written to output.csv, replacing any earlier export).
generate_output(y_test_window, y_pred_window)

## Baseline Regression

In [None]:
# Hold-out split for the regression baseline (same 2022 cut as the models).
y_test_aux = df.loc[df['year'] >= 2022, ['Date', 'Output']].reset_index()

# The regression target is 'Return' on both sides of the split; the training
# target previously read the class label 'Output', which did not match y_test.
y_train = df.loc[df['year'] < 2022, 'Return']
X_train = df.loc[df['year'] < 2022].drop(['Output', 'Return', 'Date'], axis="columns")

y_test = df.loc[df['year'] >= 2022, 'Return']
X_test = df.loc[df['year'] >= 2022].drop(['Output', 'Return', 'Date'], axis="columns")

In [None]:
# Persistence baseline: predict each test row's Return as the previous test row's Return.
# The first test row has no predecessor in this frame and is predicted as 0.
y_pred_base_regressor = pd.DataFrame(y_test).shift(1, fill_value=0)
y_pred_base_regressor

In [None]:
# RMSE / MAE of the persistence baseline, as a reference point for the MLP results.
evaluate_regressor(y_test, y_pred_base_regressor)