In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import os
import talib as tl
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn_pandas import DataFrameMapper
%matplotlib inline

In [2]:
def fills(df):
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)
    return df

In [3]:
aapl = fills(pdr.get_data_google('AAPL', '2000-01-01', '2018-01-01'))
spy = fills(pdr.get_data_google('SPY', '2000-01-01', '2018-01-01'))

In [4]:
oopen = aapl['Open'].values
high = aapl['High'].values
low = aapl['Low'].values
close = aapl['Close'].values

In [5]:
runup252 = aapl['Close'].pct_change(252)
aapl['Run Up 252'] = runup252

In [6]:
beta63 = tl.BETA(close, spy['Close'].values, timeperiod=63)
aapl['Beta 63'] = beta63

In [7]:
ema100 = tl.EMA(close, timeperiod=100)
aapl['EMA 100'] = ema100
aapl['EMA 100'] = aapl['EMA 100'].pct_change()

In [8]:
sma100 = tl.SMA(close, timeperiod=100)
aapl['SMA 100'] = sma100
aapl['SMA 100'] = aapl['SMA 100'].pct_change()

In [9]:
sma_mom100 = aapl['SMA 100'] - aapl['SMA 100'].shift(-1)
aapl['SMA MOM 100'] = sma_mom100

In [10]:
sp500_sma100 = tl.SMA(spy['Close'].values, timeperiod=100)
aapl['SP500 SMA 100'] = sp500_sma100
aapl['SP500 SMA 100'] = aapl['SP500 SMA 100'].pct_change()

In [11]:
sp500vola = tl.ATR(spy['High'].values, spy['Low'].values, spy['Close'].values, timeperiod=63)
aapl['SP500 Vola 63'] = sp500vola

In [12]:
sharpe_days = 63
daily_ret = aapl['Close'].pct_change()
sharpe63 = np.sqrt(sharpe_days)*(daily_ret.rolling(sharpe_days).mean()/daily_ret.rolling(sharpe_days).std())
aapl['Sharpe 63'] = sharpe63

In [13]:
vola63 = tl.ATR(high, low, close, timeperiod=63)
aapl['Vola 63'] = vola63

In [14]:
return_days = 3
ret = 100*aapl['Close'].pct_change(return_days).shift(-return_days)
aapl[str(return_days) + ' days future return'] = ret

In [15]:
aapl['Weekday'] = aapl.index.dayofweek.map(lambda x: {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri'}.get(x))

In [16]:
def to_class(val, extreme):
    if val < -extreme:
        return 'Neg'
    elif val > extreme:
        return 'Pos'
    else:
        return 'Neutral'
aapl['Return class'] = aapl['3 days future return'].apply(lambda val: to_class(val, 3))
aapl['Return class'].head()

Date
2002-02-11    Neutral
2002-02-12    Neutral
2002-02-13        Neg
2002-02-14        Neg
2002-02-15        Neg
Name: Return class, dtype: object

In [17]:
aapl.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Run Up 252,Beta 63,EMA 100,SMA 100,SMA MOM 100,SP500 SMA 100,SP500 Vola 63,Sharpe 63,Vola 63,3 days future return,Weekday,Return class
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-12-22,174.68,175.42,174.5,175.01,16349444,0.504945,0.139007,0.001379,0.001094,0.000177,0.000789,1.442816,1.80843,2.460835,-2.245586,Fri,Neutral
2017-12-26,170.8,171.47,169.68,170.57,33185536,0.463869,0.138846,0.000813,0.000917,4.9e-05,0.000795,1.429915,1.263734,2.506377,-0.785601,Tue,Neutral
2017-12-27,170.1,170.78,169.71,170.6,21498213,0.454887,0.13748,0.0008,0.000868,0.000119,0.000782,1.418646,1.187278,2.483577,,Wed,Neutral
2017-12-28,171.0,171.85,170.48,171.08,16480187,0.465228,0.138857,0.000841,0.000749,0.000191,0.000785,1.405652,1.294132,2.465901,,Thu,Neutral
2017-12-29,170.52,170.59,169.22,169.23,25999922,0.449756,0.142824,0.000601,0.000558,,0.000768,1.413657,1.098579,2.456284,,Fri,Neutral


In [18]:
aapl.drop(['Open', 'High', 'Low', 'Close', 'Volume', '3 days future return'], axis=1, inplace=True)
aapl.dropna(inplace=True)

In [19]:
trafos = [([col], StandardScaler()) for col in aapl.columns[:-2].tolist()]
trafos = [('Weekday', LabelBinarizer())] + trafos
#trafos += [('Return class', LabelBinarizer())]

In [20]:
trafos

[('Weekday', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)),
 (['Run Up 252'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Beta 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['EMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SMA MOM 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SP500 SMA 100'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['SP500 Vola 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Sharpe 63'], StandardScaler(copy=True, with_mean=True, with_std=True)),
 (['Vola 63'], StandardScaler(copy=True, with_mean=True, with_std=True))]

In [21]:
mapper = DataFrameMapper(trafos, df_out=True)

In [22]:
aapl_mapped = mapper.fit_transform(aapl)
aapl_mapped.head()

Unnamed: 0_level_0,Weekday_Fri,Weekday_Mon,Weekday_Thu,Weekday_Tue,Weekday_Wed,Run Up 252,Beta 63,EMA 100,SMA 100,SMA MOM 100,SP500 SMA 100,SP500 Vola 63,Sharpe 63,Vola 63
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2003-02-11,0.0,0.0,0.0,1.0,0.0,-1.600159,-0.798062,-1.064306,-0.761709,-1.227681,-0.726039,0.437312,-1.151783,-1.292382
2003-02-12,0.0,0.0,0.0,0.0,1.0,-1.578725,-0.915786,-0.977752,-0.604132,0.308279,-0.691778,0.435341,-0.871988,-1.292872
2003-02-13,0.0,0.0,1.0,0.0,0.0,-1.586146,-0.843879,-0.892654,-0.643557,-0.3064,-0.586248,0.423053,-1.027235,-1.293154
2003-02-14,1.0,0.0,0.0,0.0,0.0,-1.55875,-0.818924,-0.809037,-0.604143,-1.536279,-0.517386,0.41202,-0.924379,-1.293432
2003-02-18,0.0,0.0,0.0,1.0,0.0,-1.486968,-0.649921,-0.494489,-0.406986,1.537985,-0.231234,0.418942,-0.965556,-1.293506


# ML model

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split

from sklearn.metrics import accuracy_score, classification_report



In [24]:
X = aapl_mapped
y = aapl['Return class']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [26]:
def fit_and_print(grid):
    grid.fit(X_train, y_train)
    best_estimator = grid.best_estimator_
    print('Best estimator:\n', str(best_estimator))
    
    y_pred = best_estimator.predict(X_test)
    print('Accuracy score:', str(accuracy_score(y_test, y_pred)))
    print('Classification report\n', classification_report(y_test, y_pred))

### K nearest

In [27]:
params = {'n_neighbors': np.arange(20,100,10),
          'metric': ['euclidean', 'l2', 'manhattan', 'cityblock'],
          'algorithm' : ['auto', 'ball_tree', 'kd_tree']}
grid = GridSearchCV(estimator=KNeighborsClassifier(n_jobs=4), param_grid=params)

In [28]:
fit_and_print(grid)

Best estimator:
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=4, n_neighbors=20, p=2,
           weights='uniform')
Accuracy score: 0.683104284559
Classification report
              precision    recall  f1-score   support

        Neg       0.44      0.13      0.20       154
    Neutral       0.70      0.95      0.80       823
        Pos       0.62      0.16      0.25       260

avg / total       0.65      0.68      0.61      1237



### Random Forest

In [None]:
params = {'n_estimators': np.arange(1, 22, 4),
          'max_features': np.arange(2, 5),
          'min_samples_split': [2, 3, 4],
          'min_samples_leaf': [1, 2, 3]}
grid = GridSearchCV(estimator=RandomForestClassifier(n_jobs=4), param_grid=params)

In [None]:
fit_and_print(grid)