# The overfitting problem

## Load the data

In [11]:
import pandas as pd

# Read the processed workbook (parsing the 'Date' column, first column as
# index) and discard rows with missing values in one chained expression.
df = pd.read_excel(
    'data/Microsoft_LinkedIn_Processed.xlsx',
    parse_dates=['Date'],
    index_col=0,
).dropna()

## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [12]:
# Target: the column the model learns to predict.
target = df['change_tomorrow']

# Explanatory variables: the price and volume columns used as features.
explanatory = df[[
    'Open',
    'High',
    'Low',
    'Close',
    'Volume',
]]

## Train test split

### Split the dataset

In [13]:
# Number of rows left after dropping incomplete ones.
n_days = df.shape[0]
n_days

2070

In [14]:
# Row position of the cut: the first 70% of rows go to training,
# the remaining 30% to testing.
n_days_split = int(0.7 * n_days)
n_days_split

1449

In [15]:
# Rows before the split position form the training set ...
X_train = explanatory.iloc[:n_days_split]
y_train = target.iloc[:n_days_split]

# ... and the rows from the split position onwards form the test set.
X_test = explanatory.iloc[n_days_split:]
y_test = target.iloc[n_days_split:]

### Fit the model on train set

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Tree allowed to grow up to 15 levels deep; fixed seed for repeatable fits.
model_dt_split = DecisionTreeRegressor(
    max_depth=15,
    random_state=42,
)

In [18]:
# Fit on the training rows only; the test rows are not used here.
model_dt_split.fit(X_train, y_train)

### Evaluate model

#### On test set

In [19]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the rows that were held out from fitting.
y_pred_test = model_dt_split.predict(X_test)
mean_squared_error(y_test, y_pred_test)

4.551107199774148

#### On train set

In [20]:
# Mean squared error on the same rows the model was fitted on.
y_pred_train = model_dt_split.predict(X_train)
mean_squared_error(y_train, y_pred_train)

1.1003521703683126

## Backtesting

In [21]:
from backtesting import Backtest, Strategy



### Create the `Strategy`

In [22]:
class Regression(Strategy):
    """Trading strategy driven by a decision-tree forecast.

    A ``DecisionTreeRegressor`` is fitted once, in ``init``, on the notebook's
    ``X_train`` / ``y_train`` split. On every bar the most recent row of the
    backtest data is passed to the model, and the forecast is compared with
    two thresholds to decide whether to place an order.
    """

    # Forecast above which a buy order is placed (overridable via ``bt.run``).
    limit_buy = 1
    # Forecast below which a sell order is placed (overridable via ``bt.run``).
    limit_sell = -5

    def init(self):
        """Create and fit the model, and reset the holding flag."""
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False

        # The model is always fitted on the training split, whichever
        # dataset the backtest itself is run on.
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        """Forecast from the latest bar and place an order if a limit is crossed."""
        # Double brackets keep the last row as a one-row DataFrame, which is
        # the 2-D input shape ``predict`` expects.
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]

        if forecast_tomorrow > self.limit_buy and not self.already_bought:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought:
            # NOTE(review): ``sell()`` places a sell order; with
            # ``exclusive_orders=True`` on the Backtest this presumably closes
            # the long AND opens a short. If only an exit is intended,
            # ``self.position.close()`` would be the call to use - confirm.
            self.sell()
            self.already_bought = False

### Run the backtest on `test` data

In [23]:
# Simulate the strategy on the test rows.
bt = Backtest(
    data=X_test,
    strategy=Regression,
    cash=10_000,
    commission=0.002,
    exclusive_orders=True,
)

In [24]:
results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]' and give the
# single column a label that identifies this run.
df_results_test = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'Out of Sample (Test)'})
)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2022-09-13 00:00:00
End,2025-03-05 00:00:00
Duration,904 days 00:00:00
Exposure Time [%],0.0
Equity Final [$],16851.735101
Equity Peak [$],19532.777826
Return [%],68.517351


### Run the backtest on `train` data

In [25]:
# Simulate the same strategy on the rows the model was fitted on.
bt = Backtest(
    data=X_train,
    strategy=Regression,
    cash=10_000,
    commission=0.002,
    exclusive_orders=True,
)

results = bt.run(limit_buy=1, limit_sell=-5)

# Keep the summary rows up to and including 'Return [%]' and give the
# single column a label that identifies this run.
df_results_train = (
    results
    .to_frame(name='Values')
    .loc[:'Return [%]']
    .rename(columns={'Values': 'In Sample (Train)'})
)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2016-12-08 00:00:00
End,2022-09-12 00:00:00
Duration,2104 days 00:00:00
Exposure Time [%],64.665286
Equity Final [$],55262.677035
Equity Peak [$],70598.831607
Commissions [$],2079.254467
Return [%],452.62677


### Compare both backtests

In [26]:
# Put both summaries side by side, aligned on the metric names in the index.
df_results = pd.concat(
    [df_results_test, df_results_train],
    axis='columns',
)
df_results

Unnamed: 0,Out of Sample (Test),In Sample (Train)
Start,2022-09-13 00:00:00,2016-12-08 00:00:00
End,2025-03-05 00:00:00,2022-09-12 00:00:00
Duration,904 days 00:00:00,2104 days 00:00:00
Exposure Time [%],0.0,64.665286
Equity Final [$],16851.735101,55262.677035
Equity Peak [$],19532.777826,70598.831607
Return [%],68.517351,452.62677
Commissions [$],,2079.254467
