---
### Challenge: Walk Forward on Other Datasets
---

#### I. Download data from `yfinance`

In [18]:
import yfinance as yf

In [19]:
ticker = 'AAPL'
df = yf.download(ticker)
df.columns = df.columns.droplevel('Ticker')  # Flatten the columns
df.head(n=5)

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980-12-12,0.098726,0.099155,0.098726,0.098726,469033600
1980-12-15,0.093575,0.094005,0.093575,0.094005,175884800
1980-12-16,0.086707,0.087136,0.086707,0.087136,105728000
1980-12-17,0.088853,0.089282,0.088853,0.088853,86441600
1980-12-18,0.091429,0.091858,0.091429,0.091429,73449600


---
#### II. Preprocess the data

Filter the date range. We take the data from the 01/01/2021.

In [20]:
df = df.loc['2021-01-01':].copy()
df.head(n=5)

Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-04,126.405251,130.507728,123.816779,130.419822,143301900
2021-01-05,127.968079,128.68114,125.447982,125.897308,97664900
2021-01-06,123.660484,128.007165,123.445591,124.754482,155088000
2021-01-07,127.880196,128.573717,124.891247,125.379638,109578200
2021-01-08,128.983932,129.550467,127.206184,129.355099,105158200


We create the target variable: `change_tomorrow`.

In [21]:
df['change_tomorrow'] = df.Close.pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100
df.head(n=5)

Price,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-04,126.405251,130.507728,123.816779,130.419822,143301900,1.221264
2021-01-05,127.968079,128.68114,125.447982,125.897308,97664900,-3.483404
2021-01-06,123.660484,128.007165,123.445591,124.754482,155088000,3.299738
2021-01-07,127.880196,128.573717,124.891247,125.379638,109578200,0.855717
2021-01-08,128.983932,129.550467,127.206184,129.355099,105158200,-2.380208


We remove rows with any missing data.

In [22]:
df = df.dropna().copy()
df.head(n=5)

Price,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-04,126.405251,130.507728,123.816779,130.419822,143301900,1.221264
2021-01-05,127.968079,128.68114,125.447982,125.897308,97664900,-3.483404
2021-01-06,123.660484,128.007165,123.445591,124.754482,155088000,3.299738
2021-01-07,127.880196,128.573717,124.891247,125.379638,109578200,0.855717
2021-01-08,128.983932,129.550467,127.206184,129.355099,105158200,-2.380208


---
#### III. Compute Machine Learning model

Feature selection:

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [23]:
y = df.change_tomorrow
X = df[['Open','High','Low','Close','Volume']]

---
#### IV. Time Series Split

In [24]:
from sklearn.model_selection import TimeSeriesSplit

ts = TimeSeriesSplit(test_size=200)

Compute and evaluate model in a for loop:

1. Separate the data in train and test
2. Compute the model on the train set
3. Evaluate the model (mse) on the test set
4. Append the errors (mse) in an empty list

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model_dt = RandomForestRegressor(max_depth=15, random_state=42)

error_mse_list = []

for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]
    
    model_dt.fit(X_train, y_train)
    
    y_pred = model_dt.predict(X_test)
    error_mse = mean_squared_error(y_test, y_pred)
    
    error_mse_list.append(error_mse)

In [26]:
error_mse_list

[np.float64(3.621984315093625),
 np.float64(5.478278628577397),
 np.float64(2.6061500023638295),
 np.float64(2.1922565790356225),
 np.float64(4.461165377366684)]

---
#### V. Anchored Walk Forward evaluation in backtesting

Create a new strategy.

In [27]:
from backtesting import Backtest, Strategy

In [28]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    n_train = 600
    coef_retrain = 200
    
    def init(self):
        self.model = RandomForestRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        X_train = self.data.df.iloc[:self.n_train, :-1]
        y_train = self.data.df.iloc[:self.n_train, -1]
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

In [29]:
class WalkForwardAnchored(Regression):
    def next(self):
        
        # we don't take any action and move on to the following day
        if len(self.data) < self.n_train:
            return
        
        # we retrain the model each 200 days
        if len(self.data) % self.coef_retrain == 0:
            X_train = self.data.df.iloc[:, :-1]
            y_train = self.data.df.iloc[:, -1]

            self.model.fit(X_train, y_train)

            super().next()
            
        else:
            
            super().next()

Run the backtest with optimization.

In [30]:
from backtesting import Backtest
bt = Backtest(df, WalkForwardAnchored, cash=10000, commission=.002, exclusive_orders=True)

In [31]:
import multiprocessing as mp
mp.set_start_method('fork')

RuntimeError: context has already been set

In [32]:
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

  stats_skopt, heatmap, optimize_result = bt.optimize(


In [33]:
dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff.head(n=5)

Unnamed: 0,limit_buy,limit_sell,Return [%]
4,0,-2,39.388934
5,0,-1,28.434592
0,0,-6,11.108962
2,0,-4,11.108962
3,0,-3,11.108962


---
#### VI. Unanchored Walk Forward

Import the strategy and perform the backtest with optimization.

In [34]:
%load_ext autoreload
%autoreload 2
import iOStrategies
iOStrategies.WalkForwardUnanchored

iOStrategies.WalkForwardUnanchored

In [35]:
bt_unanchored = Backtest(df, iOStrategies.WalkForwardUnanchored, cash=10000, commission=.002, exclusive_orders=True)

stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

  stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(


In [36]:
dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff.head(n=5)

Unnamed: 0,limit_buy,limit_sell,Return [%]
4,0,-2,13.368624
0,0,-6,11.8695
2,0,-4,11.8695
1,0,-5,11.8695
12,2,-6,11.038287


### Interpret the strategies' performance

In [38]:
bt.plot(filename='backtests/AAPL_walk_forward_anchored.html')

<p align="center">
  <img src="screen/backtest_report_AAPL_AWF.png" width="800"/>
</p>

In [39]:
bt_unanchored.plot(filename='backtests/AAPL_walk_forward_unanchored.html')

<p align="center">
  <img src="screen/backtest_report_AAPL_UWF.png" width="800"/>
</p>