In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import os
import talib as tl
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn_pandas import DataFrameMapper
%matplotlib inline

In [2]:
def fills(df):
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)
    return df

In [3]:
aapl = fills(pdr.get_data_google('AAPL', '2000-01-01', '2018-01-01'))
spy = fills(pdr.get_data_google('SPY', '2000-01-01', '2018-01-01'))

In [4]:
oopen = aapl['Open'].values
high = aapl['High'].values
low = aapl['Low'].values
close = aapl['Close'].values

In [5]:
runup252 = aapl['Close'].pct_change(252)
aapl['Run Up 252'] = runup252

In [6]:
beta63 = tl.BETA(close, spy['Close'].values, timeperiod=63)
aapl['Beta 63'] = beta63

In [7]:
ema100 = tl.EMA(close, timeperiod=100)
aapl['EMA 100'] = ema100
aapl['EMA 100'] = aapl['EMA 100'].pct_change()

In [8]:
sma100 = tl.SMA(close, timeperiod=100)
aapl['SMA 100'] = sma100
aapl['SMA 100'] = aapl['SMA 100'].pct_change()

In [9]:
sma_mom100 = aapl['SMA 100'] - aapl['SMA 100'].shift(1)
aapl['SMA MOM 100'] = sma_mom100

In [10]:
sp500_sma100 = tl.SMA(spy['Close'].values, timeperiod=100)
aapl['SP500 SMA 100'] = sp500_sma100
aapl['SP500 SMA 100'] = aapl['SP500 SMA 100'].pct_change()

In [11]:
sp500vola = tl.ATR(spy['High'].values, spy['Low'].values, spy['Close'].values, timeperiod=63)
aapl['SP500 Vola 63'] = sp500vola

In [12]:
sharpe_days = 63
daily_ret = aapl['Close'].pct_change()
sharpe63 = np.sqrt(sharpe_days)*(daily_ret.rolling(sharpe_days).mean()/daily_ret.rolling(sharpe_days).std())
aapl['Sharpe 63'] = sharpe63

In [13]:
vola63 = tl.ATR(high, low, close, timeperiod=63)
aapl['Vola 63'] = vola63

In [14]:
return_days = 5
ret = 100*aapl['Close'].pct_change(return_days).shift(-return_days)
aapl[str(return_days) + ' days future return'] = ret

In [15]:
aapl['Weekday'] = aapl.index.dayofweek.map(lambda x: {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri'}.get(x))

In [16]:
def to_class(val, extreme):
    if val < -extreme:
        return 'Neg'
    elif val > extreme:
        return 'Pos'
    else:
        return 'Neutral'
aapl['Return class'] = aapl[str(return_days) + ' days future return'].apply(lambda val: to_class(val, 3))
aapl['Return class'].head()

Date
2002-02-11        Neg
2002-02-12        Neg
2002-02-13        Neg
2002-02-14        Neg
2002-02-15    Neutral
Name: Return class, dtype: object

In [17]:
aapl.tail(20)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Run Up 252,Beta 63,EMA 100,SMA 100,SMA MOM 100,SP500 SMA 100,SP500 Vola 63,Sharpe 63,Vola 63,5 days future return,Weekday,Return class
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-12-01,169.95,171.67,168.5,171.05,39759288,0.562243,0.139494,0.001289,0.001582,-6.6e-05,0.000813,1.436363,0.497874,2.508512,-0.982169,Fri,Neutral
2017-12-04,172.48,172.62,169.63,169.8,32542385,0.545041,0.130644,0.001108,0.001374,-0.000207,0.000784,1.456739,0.552009,2.516154,1.690224,Mon,Neutral
2017-12-05,169.06,171.52,168.4,169.64,27350154,0.554761,0.132157,0.001066,0.001283,-9.1e-05,0.0007,1.467108,0.553182,2.525739,1.214336,Tue,Neutral
2017-12-06,167.5,170.2,166.46,169.01,28560000,0.537153,0.131991,0.000966,0.00121,-7.3e-05,0.000703,1.460011,0.556553,2.545013,1.92888,Wed,Neutral
2017-12-07,169.03,170.44,168.91,169.32,25673308,0.524993,0.132303,0.000984,0.001196,-1.5e-05,0.00073,1.460487,0.768109,2.528902,1.712733,Thu,Neutral
2017-12-08,170.49,171.0,168.82,169.37,23355231,0.510614,0.115573,0.00097,0.001139,-5.7e-05,0.000734,1.460955,0.581407,2.523364,2.715947,Fri,Neutral
2017-12-11,169.2,172.89,168.79,172.67,35273759,0.515314,0.117088,0.001354,0.001385,0.000245,0.000761,1.452051,0.829562,2.54839,2.171773,Mon,Neutral
2017-12-12,172.15,172.39,171.46,171.7,19409230,0.515446,0.116297,0.001207,0.001327,-5.8e-05,0.000787,1.445035,0.852235,2.527145,1.654048,Tue,Neutral
2017-12-13,172.5,173.54,172.0,172.27,23818447,0.495529,0.115701,0.001251,0.001248,-7.9e-05,0.000788,1.436542,0.990162,2.516238,1.207407,Wed,Neutral
2017-12-14,172.4,173.13,171.65,172.22,20476541,0.495095,0.123054,0.001218,0.001203,-4.5e-05,0.000721,1.439454,0.878214,2.49979,1.620021,Thu,Neutral


In [18]:
aapl.drop(['Open', 'High', 'Low', 'Close', 'Volume', str(return_days) + ' days future return'], axis=1, inplace=True)
aapl.dropna(inplace=True)

In [19]:
trafos = [([col], StandardScaler()) for col in aapl.columns[:-2].tolist()]
trafos = [('Weekday', LabelBinarizer())] + trafos
#trafos += [('Return class', LabelBinarizer())]

In [24]:
trafos

[('Weekday', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),
 (['Run Up 252'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Beta 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['EMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SMA MOM 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SP500 SMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SP500 Vola 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Sharpe 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Vola 63'], StandardScaler(copy=True, with_mean=True, with_std=True))]

In [25]:
mapper = DataFrameMapper(trafos, df_out=True)

In [26]:
aapl_mapped = mapper.fit_transform(aapl)
aapl_mapped.head()

Unnamed: 0_level_0,Weekday_Fri,Weekday_Mon,Weekday_Thu,Weekday_Tue,Weekday_Wed,Run Up 252,Beta 63,EMA 100,SMA 100,SMA MOM 100,SP500 SMA 100,SP500 Vola 63,Sharpe 63,Vola 63
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2003-02-11,0.0,0.0,0.0,1.0,0.0,-1.600356,-0.798219,-1.064351,-0.761711,-0.308494,-0.726244,0.437538,-1.152011,-1.292498
2003-02-12,0.0,0.0,0.0,0.0,1.0,-1.578919,-0.915958,-0.977786,-0.604116,1.227912,-0.691979,0.435567,-0.872184,-1.292987
2003-02-13,0.0,0.0,1.0,0.0,0.0,-1.586341,-0.844042,-0.892677,-0.643545,-0.308234,-0.586438,0.423277,-1.027448,-1.293269
2003-02-14,1.0,0.0,0.0,0.0,0.0,-1.558942,-0.819084,-0.80905,-0.604127,0.306519,-0.51757,0.412245,-0.92458,-1.293547
2003-02-18,0.0,0.0,0.0,1.0,0.0,-1.48715,-0.650059,-0.494465,-0.406947,1.536547,-0.23139,0.419167,-0.965762,-1.293621


# ML model

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [28]:
aapl_mapped['Return class'] = aapl['Return class']

### Resample umbalanced data

In [29]:
df_majority = aapl_mapped[aapl_mapped['Return class']=='Neutral']
df_minority = aapl_mapped[aapl_mapped['Return class']!='Neutral']
 
df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df_majority)*2)

df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
df_upsampled['Return class'].value_counts()

Pos        2455
Neutral    2002
Neg        1549
Name: Return class, dtype: int64

In [30]:
X = df_upsampled.drop('Return class', axis=1)
y = df_upsampled['Return class']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [32]:
def fit_and_print(grid):
    grid.fit(X_train, y_train)
    best_estimator = grid.best_estimator_
    print('Best estimator:\n', str(best_estimator))
    
    y_pred = best_estimator.predict(X_test)
    print('Accuracy score:', str(accuracy_score(y_test, y_pred)))
    print('Classification report\n', classification_report(y_test, y_pred))
    print('Confusion matrix\n', str(confusion_matrix(y_test, y_pred)))

### K nearest

In [63]:
params = {'n_neighbors': np.arange(5,50,3),
          'metric': ['euclidean', 'l2', 'manhattan', 'cityblock'],
          'leaf_size': np.arange(5,40,5), 
          'algorithm' : ['auto']}
grid = GridSearchCV(estimator=KNeighborsClassifier(n_jobs=4), param_grid=params)

In [64]:
fit_and_print(grid)

Best estimator:
 KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='manhattan',
           metric_params=None, n_jobs=4, n_neighbors=17, p=2,
           weights='uniform')
Accuracy score: 0.537336024218
Classification report
              precision    recall  f1-score   support

        Neg       0.47      0.44      0.45       508
    Neutral       0.53      0.46      0.49       686
        Pos       0.58      0.67      0.62       788

avg / total       0.53      0.54      0.53      1982

Confusion matrix
 [[223 137 148]
 [134 316 236]
 [122 140 526]]


### Random Forest

In [35]:
params = {'n_estimators': np.arange(20, 40, 2),
          'max_features': [2, 3],
          'min_samples_split': [2, 3],
          'min_samples_leaf': [2, 3, 4]}
grid = GridSearchCV(estimator=RandomForestClassifier(n_jobs=4), param_grid=params)

In [36]:
fit_and_print(grid)

Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy score: 0.831987891019
Classification report
              precision    recall  f1-score   support

        Neg       0.86      0.86      0.86       508
    Neutral       0.82      0.69      0.75       686
        Pos       0.82      0.94      0.87       788

avg / total       0.83      0.83      0.83      1982

Confusion matrix
 [[437  60  11]
 [ 62 475 149]
 [  7  44 737]]


# Playground

In [47]:
aapl_temp = fills(pdr.get_data_google('AAPL', '2000-01-01', '2018-01-01'))

In [52]:
aapl_temp['Close'][0:5]

Date
2002-02-11    1.78
2002-02-12    1.76
2002-02-13    1.79
2002-02-14    1.76
2002-02-15    1.71
Name: Close, dtype: float64

In [51]:
aapl_temp['Close'].shift(1)[0:5]

Date
2002-02-11     NaN
2002-02-12    1.78
2002-02-13    1.76
2002-02-14    1.79
2002-02-15    1.76
Name: Close, dtype: float64

In [56]:
aapl_temp['Close'].pct_change(2)[0:5]

Date
2002-02-11         NaN
2002-02-12         NaN
2002-02-13    0.005618
2002-02-14    0.000000
2002-02-15   -0.044693
Name: Close, dtype: float64

In [58]:
aapl_temp['Close'].pct_change(2).shift(-2)[0:5]

Date
2002-02-11    0.005618
2002-02-12    0.000000
2002-02-13   -0.044693
2002-02-14   -0.079545
2002-02-15   -0.035088
Name: Close, dtype: float64