In [61]:
import warnings
warnings.filterwarnings('ignore')

In [79]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
from talib import RSI
from datetime import datetime as dt
from scipy.stats import spearmanr

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline

idx = pd.IndexSlice

YEAR = 252

### Get data 

In [63]:
raw = pd.read_csv('http://hilpisch.com/pyalgo_eikon_eod_data.csv', parse_dates=True).dropna()
raw.head()

Unnamed: 0,Date,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,SPY,.SPX,.VIX,EUR=,XAU=,GDX,GLD
1,2010-01-04,30.572827,30.95,20.88,133.9,173.08,113.33,1132.99,20.04,1.4411,1120.0,47.71,109.8
2,2010-01-05,30.625684,30.96,20.87,134.69,176.14,113.63,1136.52,19.35,1.4368,1118.65,48.17,109.7
3,2010-01-06,30.138541,30.77,20.8,132.25,174.26,113.71,1137.14,19.16,1.4412,1138.5,49.34,111.51
4,2010-01-07,30.082827,30.452,20.6,130.0,177.67,114.19,1141.69,19.06,1.4318,1131.9,49.1,110.82
5,2010-01-08,30.282827,30.66,20.83,133.52,174.31,114.57,1144.98,18.13,1.4412,1136.1,49.84,111.37


In [64]:
# reshape table 
prices = pd.melt(raw,  id_vars = ['Date'], value_vars = raw.columns,
                 var_name = 'Ticker', 
                 value_name = 'Close') 
# set index by ticker and Data 
prices.rename(columns={'Ticker': 'ticker', 'Date':'date', 'Close' :'close'}, inplace=True) 
prices.set_index(['ticker', 'date'], inplace=True) 
prices.sort_index(level = ['ticker', 'date'], inplace=True)

# select timeframe
start = '2012-01-01'
end = '2017-12-31'
prices = prices.loc[idx[:, start:end], :]
prices.head()
prices.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,close
ticker,date,Unnamed: 2_level_1
.SPX,2012-01-03,1277.06
.SPX,2012-01-04,1277.3
.SPX,2012-01-05,1281.06
.SPX,2012-01-06,1277.81
.SPX,2012-01-09,1280.7


### Add signals 

In [65]:
rolling_1m = prices.close.groupby('ticker').rolling(window=21).mean().values
prices['rolling_1m'] = rolling_1m

rolling_rank = prices.rolling_1m.groupby('ticker').rank(ascending=False).values
prices['rolling_rank'] = rolling_rank

prices['rsi'] = prices.close.groupby(level='ticker').apply(RSI)

### Calculate lagged returns

In [71]:
lags = [1, 5, 10] 

In [72]:
for lag in lags: 
    prices[f'returns_{lag}d'] = prices.groupby('ticker').close.pct_change(lag) 
    
for t in lags:
    prices[f'target_{t}d'] = prices.groupby('ticker')[f'returns_{t}d'].shift(-t) 

In [73]:
prices.head(20) 

Unnamed: 0_level_0,Unnamed: 1_level_0,close,rolling_1m,rolling_rank,rsi,returns_10d,target_10d,returns_1d,returns_5d,target_1d,target_5d
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
.SPX,2012-01-03,1277.06,,,,,0.024259,,,0.000188,0.011761
.SPX,2012-01-04,1277.3,,,,,0.029124,0.000188,,0.002944,0.011884
.SPX,2012-01-05,1281.06,,,,,0.02679,0.002944,,-0.002537,0.011272
.SPX,2012-01-06,1277.81,,,,,0.029887,-0.002537,,0.002262,0.008828
.SPX,2012-01-09,1280.7,,,,,0.026509,0.002262,,0.008886,0.010127
.SPX,2012-01-10,1292.08,,,,,0.026291,0.008886,0.011761,0.00031,0.012352
.SPX,2012-01-11,1292.48,,,,,0.020078,0.00031,0.011884,0.002337,0.017037
.SPX,2012-01-12,1295.5,,,,,0.016079,0.002337,0.011272,-0.004948,0.015345
.SPX,2012-01-13,1289.09,,,,,0.018556,-0.004948,0.008828,0.003553,0.020875
.SPX,2012-01-17,1293.67,,,,,0.014486,0.003553,0.010127,0.011108,0.016217


### Linear Regression 

In [83]:
prices = prices.dropna().drop('close', axis=1) 
y = prices.filter(like='target')
x = prices.drop(y.columns, axis=1)

In [69]:
class MultipleTimeSeriesCV:
    ''' 
    Generates tuples of train_idx, test_idx pairs
    Assumes the MultiIndex contains levels 'symbol' and 'date'
    '''
    def __init__(self,
                 n_splits=3,
                 train_period_length=126,
                 test_period_length=21,
                 lookahead=None,
                 shuffle=False):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle

    def split(self, X, y=None, groups=None):
        unique_dates = X.index.get_level_values('date').unique()
        days = sorted(unique_dates, reverse=True)

        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append([train_start_idx, train_end_idx,
                              test_start_idx, test_end_idx])

        dates = X.reset_index()[['date']]
        print(dates)
        for train_start, train_end, test_start, test_end in split_idx:
            train_idx = dates[(dates.date > days[train_start])
                              & (dates.date <= days[train_end])].index
            test_idx = dates[(dates.date > days[test_start])
                             & (dates.date <= days[test_end])].index
            if self.shuffle:
                np.random.shuffle(list(train_idx))
            yield train_idx, test_idx

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits

In [76]:
train_period_length = 63
test_period_length = 10
n_splits = int(3 * YEAR/test_period_length)
lookahead =1 

cv = MultipleTimeSeriesCV(n_splits=n_splits,
                          test_period_length=test_period_length,
                          lookahead=lookahead,
                          train_period_length=train_period_length)

In [80]:
target = f'target_{lookahead}d'
lr_predictions, lr_scores = [], []
lr = LinearRegression()
for i, (train_idx, test_idx) in enumerate(cv.split(prices), 1):
    X_train, y_train, =  x.iloc[train_idx], y[target].iloc[train_idx]
    X_test, y_test = x.iloc[test_idx], y[target].iloc[test_idx]
    lr.fit(X=X_train, y=y_train)
    y_pred = lr.predict(X_test)

    preds = y_test.to_frame('actuals').assign(predicted=y_pred)
    preds_by_day = preds.groupby(level='date')
    scores = pd.concat([preds_by_day.apply(lambda x: spearmanr(x.predicted,
                                                               x.actuals)[0] * 100)
                        .to_frame('ic'),
                        preds_by_day.apply(lambda x: np.sqrt(mean_squared_error(y_pred=x.predicted,
                                                                                y_true=x.actuals)))
                        .to_frame('rmse')], axis=1)

    lr_scores.append(scores)
    lr_predictions.append(preds)

lr_scores = pd.concat(lr_scores)
lr_predictions = pd.concat(lr_predictions)

             date
0      2012-01-03
1      2012-01-04
2      2012-01-05
3      2012-01-06
4      2012-01-09
...           ...
18103  2017-12-22
18104  2017-12-26
18105  2017-12-27
18106  2017-12-28
18107  2017-12-29

[18108 rows x 1 columns]


ValueError: Input contains NaN.