In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import os
import talib as tl
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn_pandas import DataFrameMapper
%matplotlib inline

In [2]:
def fills(df):
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)
    return df

In [3]:
aapl = fills(pdr.get_data_google('AAPL', '2000-01-01', '2018-01-01'))
spy = fills(pdr.get_data_google('SPY', '2000-01-01', '2018-01-01'))

In [4]:
oopen = aapl['Open'].values
high = aapl['High'].values
low = aapl['Low'].values
close = aapl['Close'].values

In [5]:
runup252 = aapl['Close'].pct_change(252)
aapl['Run Up 252'] = runup252

In [6]:
beta63 = tl.BETA(close, spy['Close'].values, timeperiod=63)
aapl['Beta 63'] = beta63

In [7]:
ema100 = tl.EMA(close, timeperiod=100)
aapl['EMA 100'] = ema100
aapl['EMA 100'] = aapl['EMA 100'].pct_change()

In [8]:
sma100 = tl.SMA(close, timeperiod=100)
aapl['SMA 100'] = sma100
aapl['SMA 100'] = aapl['SMA 100'].pct_change()

In [9]:
sma_mom100 = aapl['SMA 100'] - aapl['SMA 100'].shift(1)
aapl['SMA MOM 100'] = sma_mom100

In [10]:
sp500_sma100 = tl.SMA(spy['Close'].values, timeperiod=100)
aapl['SP500 SMA 100'] = sp500_sma100
aapl['SP500 SMA 100'] = aapl['SP500 SMA 100'].pct_change()

In [11]:
sp500vola = tl.ATR(spy['High'].values, spy['Low'].values, spy['Close'].values, timeperiod=63)
aapl['SP500 Vola 63'] = sp500vola

In [12]:
sharpe_days = 63
daily_ret = aapl['Close'].pct_change()
sharpe63 = np.sqrt(sharpe_days)*(daily_ret.rolling(sharpe_days).mean()/daily_ret.rolling(sharpe_days).std())
aapl['Sharpe 63'] = sharpe63

In [13]:
vola63 = tl.ATR(high, low, close, timeperiod=63)
aapl['Vola 63'] = vola63

In [15]:
return_days = 10
ret = 100*aapl['Close'].pct_change(return_days).shift(-return_days)
aapl[str(return_days) + ' days future return'] = ret

In [16]:
aapl['Weekday'] = aapl.index.dayofweek.map(lambda x: {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri'}.get(x))

In [17]:
def to_class(val, extreme):
    if val < -extreme:
        return 'Neg'
    elif val > extreme:
        return 'Pos'
    else:
        return 'Neutral'
aapl['Return class'] = aapl[str(return_days) + ' days future return'].apply(lambda val: to_class(val, 5))
aapl['Return class'].head()

Date
2002-02-11        Neg
2002-02-12        Neg
2002-02-13        Neg
2002-02-14    Neutral
2002-02-15    Neutral
Name: Return class, dtype: object

In [18]:
aapl.head(6)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Run Up 252,Beta 63,EMA 100,SMA 100,SMA MOM 100,SP500 SMA 100,SP500 Vola 63,Sharpe 63,Vola 63,2 days future return,10 days future return,Weekday,Return class
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2002-02-11,1.71,1.79,1.7,1.78,98224000,,,,,,,,,,0.561798,-5.05618,Mon,Neg
2002-02-12,1.76,1.79,1.75,1.76,54475400,,,,,,,,,,0.0,-10.795455,Tue,Neg
2002-02-13,1.77,1.8,1.76,1.79,73579800,,,,,,,,,,-4.469274,-13.407821,Wed,Neg
2002-02-14,1.79,1.8,1.74,1.76,62466600,,,,,,,,,,-7.954545,-4.545455,Thu,Neutral
2002-02-15,1.75,1.78,1.7,1.71,63578200,,,,,,,,,,-3.508772,1.754386,Fri,Neutral
2002-02-19,1.7,1.71,1.61,1.62,93646000,,,,,,,,,,-4.938272,3.703704,Tue,Neutral


In [20]:
aapl.drop(['Open', 'High', 'Low', 'Close', 'Volume', str(return_days) + ' days future return'], axis=1, inplace=True)
aapl.dropna(inplace=True)

In [21]:
aapl.drop(aapl[aapl['Return class']=='Neutral'].index, inplace=True)

### Resample umbalanced data and Train-test split

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [30]:
train_index = int(len(aapl) * 0.75)
aapl_train = aapl.iloc[:train_index]
aapl_test = aapl.iloc[train_index:]

In [31]:
df_majority = aapl_train[aapl_train['Return class']=='Pos']
df_minority = aapl_train[aapl_train['Return class']!='Pos']

df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df_majority))

aapl_train = pd.concat([df_majority, df_minority_upsampled])
 
aapl_train['Return class'].value_counts()

Pos    822
Neg    822
Name: Return class, dtype: int64

In [32]:
X_train = aapl_train.drop('Return class', axis=1)
X_test = aapl_test.drop('Return class', axis=1)
y_train = aapl_train['Return class']
y_test = aapl_test['Return class']

In [33]:
trafos = [([col], StandardScaler()) for col in aapl.columns[:-2].tolist()]
trafos = [('Weekday', LabelBinarizer())] + trafos
#trafos += [('Return class', LabelBinarizer())]

In [34]:
trafos

[('Weekday', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),
 (['Run Up 252'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Beta 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['EMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SMA MOM 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SP500 SMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SP500 Vola 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Sharpe 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Vola 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['2 days future return'],
  StandardScaler(copy=True, with_mean=True, with_std=True))]

In [35]:
mapper = DataFrameMapper(trafos, df_out=True, default=None)

In [36]:
X_train = mapper.fit_transform(X_train)
X_test = mapper.transform(X_test)
X_train.head()

Unnamed: 0_level_0,Weekday_Fri,Weekday_Mon,Weekday_Thu,Weekday_Tue,Weekday_Wed,Run Up 252,Beta 63,EMA 100,SMA 100,SMA MOM 100,SP500 SMA 100,SP500 Vola 63,Sharpe 63,Vola 63,2 days future return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2003-04-11,1.0,0.0,0.0,0.0,0.0,-1.652577,-0.975264,-1.32928,-1.191577,-1.808453,-0.257902,0.096285,-1.250695,-1.208929,0.542856
2003-04-15,0.0,0.0,0.0,1.0,0.0,-1.657585,-1.02415,-1.176519,-0.969784,0.472081,-0.108813,0.092637,-1.065336,-1.209526,-0.5226
2003-04-16,0.0,0.0,0.0,0.0,1.0,-1.673764,-1.04189,-1.230015,-1.066389,-0.798548,-0.024879,0.079577,-1.069709,-1.208685,-0.261814
2003-04-17,0.0,0.0,1.0,0.0,0.0,-1.665325,-0.965466,-1.282777,-1.291721,-1.821498,-0.337756,0.085399,-1.182927,-1.208611,0.542856
2003-04-21,0.0,1.0,0.0,0.0,0.0,-1.648233,-1.015903,-1.269884,-1.196903,0.722603,-0.374181,0.075702,-1.026591,-1.209667,0.812024


# ML model

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [38]:
def fit_and_print(grid):
    grid.fit(X_train, y_train)
    best_estimator = grid.best_estimator_
    print('Best estimator:\n', str(best_estimator))
    
    y_pred = best_estimator.predict(X_test)
    print('Accuracy score:', str(accuracy_score(y_test, y_pred)))
    print('Classification report\n', classification_report(y_test, y_pred))
    print('Confusion matrix\n', str(confusion_matrix(y_test, y_pred)))

### K nearest

In [39]:
params = {'n_neighbors': np.arange(5,60,3)}
grid = GridSearchCV(estimator=KNeighborsClassifier(n_jobs=4), param_grid=params, scoring='recall_macro')

In [40]:
# fit_and_print(grid)

### Random Forest

In [None]:
params = {'n_estimators': np.arange(40,80,10),
          'max_features': np.arange(2,8)}
for score in ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']:
    print('==============', score, '==============')
    try:
        grid = GridSearchCV(estimator=RandomForestClassifier(n_jobs=6, class_weight='balanced_subsample'), param_grid=params, scoring=score)
        fit_and_print(grid)
    except:
        pass

Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=6, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy score: 0.60358056266
Classification report
              precision    recall  f1-score   support

        Neg       0.58      0.11      0.19       160
        Pos       0.61      0.94      0.74       231

avg / total       0.60      0.60      0.51       391

Confusion matrix
 [[ 18 142]
 [ 13 218]]
Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_spli

  if pos_label not in present_labels:


Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=7,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=6, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy score: 0.687979539642
Classification report
              precision    recall  f1-score   support

        Neg       0.79      0.33      0.46       160
        Pos       0.67      0.94      0.78       231

avg / total       0.72      0.69      0.65       391

Confusion matrix
 [[ 52 108]
 [ 14 217]]
Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_spl

  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) if tk != 0. else 0.
  return tk / np.sqrt(pk * qk) 

Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=6, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy score: 0.659846547315
Classification report
              precision    recall  f1-score   support

        Neg       0.83      0.21      0.34       160
        Pos       0.64      0.97      0.77       231

avg / total       0.72      0.66      0.59       391

Confusion matrix
 [[ 34 126]
 [  7 224]]
Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=5,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_spl

  if pos_label not in present_labels:


Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=6, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy score: 0.634271099744
Classification report
              precision    recall  f1-score   support

        Neg       0.77      0.15      0.25       160
        Pos       0.62      0.97      0.76       231

avg / total       0.68      0.63      0.55       391

Confusion matrix
 [[ 24 136]
 [  7 224]]
Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_spl

  if pos_label not in present_labels:


Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=70, n_jobs=6, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy score: 0.616368286445
Classification report
              precision    recall  f1-score   support

        Neg       0.75      0.09      0.17       160
        Pos       0.61      0.98      0.75       231

avg / total       0.67      0.62      0.51       391

Confusion matrix
 [[ 15 145]
 [  5 226]]
Best estimator:
 RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features=2,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_spl

### Neural net

In [80]:
from sklearn.neural_network import MLPClassifier

In [81]:
params = {'alpha': [1e-5],
          'hidden_layer_sizes': [(4,2), (5,2), (8,2), (10,8,2), (10, 5, 2)]}

grid = GridSearchCV(estimator=MLPClassifier(max_iter=2000), param_grid=params, scoring='accuracy')

In [82]:
fit_and_print(grid)

Best estimator:
 MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
Accuracy score: 0.566666666667
Classification report
              precision    recall  f1-score   support

        Neg       0.60      0.25      0.36       185
        Pos       0.56      0.85      0.67       205

avg / total       0.58      0.57      0.52       390

Confusion matrix
 [[ 47 138]
 [ 31 174]]
