**Logistic Regression**

In [88]:
# Data Manipulation
import numpy as np
import pandas as pd
from datetime import datetime

# Plotting graphs
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from ta import add_all_ta_features #pip install --upgrade ta https://github.com/bukosabino/ta https://medium.datadriveninvestor.com/predicting-the-stock-market-with-python-bba3cf4c56ef
from fastai.tabular.all import add_datepart #pip install fastai https://docs.fast.ai/tabular.core.html https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-pri

1. LR without technical features

In [89]:
# Load the AAPL daily price history (Date, High, Low, Open, Close, Volume, Adj Close).
df = pd.read_csv('data/AAPL_data.csv')
# dropna() returns a new frame instead of modifying df, so the result must be
# assigned back for rows with missing values to actually be removed.
df = df.dropna()
df

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2017-04-28,36.075001,35.817501,36.022499,35.912498,83441600.0,33.907143
1,2017-05-01,36.799999,36.240002,36.275002,36.645000,134411600.0,34.598736
2,2017-05-02,37.022499,36.709999,36.884998,36.877499,181408800.0,34.818253
3,2017-05-03,36.872501,36.067501,36.397499,36.764999,182788000.0,34.712040
4,2017-05-04,36.785000,36.452499,36.630001,36.632500,93487600.0,34.586937
...,...,...,...,...,...,...,...
1255,2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993
1256,2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005
1257,2022-04-26,162.339996,156.720001,162.250000,156.800003,95623200.0,156.800003
1258,2022-04-27,159.789993,155.380005,155.910004,156.570007,88063200.0,156.570007


In [90]:
# Features: the raw price columns only (no technical indicators).
price_columns = ['High', 'Low', 'Open']
X = df.loc[:, price_columns]

# Target: +1 when the next row's close is above this row's close, otherwise -1.
# NOTE(review): the final row has no next close (shift(-1) yields NaN there), so
# the comparison is False and that row is always labelled -1.
next_close = df['Close'].shift(-1)
y = np.where(next_close.gt(df['Close']), 1, -1)

In [91]:
# Positional 80/20 split with no shuffling: the leading rows train, the trailing rows test.
split = int(0.8 * len(df))
X_train, y_train = X.iloc[:split], y[:split]
X_test, y_test = X.iloc[split:], y[split:]

In [92]:
# Sanity check: row counts of the training and test partitions.
n_train, n_test = len(X_train), len(X_test)
print(n_train, n_test)

1008 252


In [93]:
# max_iter must be an integer: float('inf') is rejected by scikit-learn's
# parameter validation. A very large integer cap keeps the original intent of
# letting the solver run until it converges.
model = LogisticRegression(max_iter=1_000_000)
model = model.fit(X_train, y_train)

In [94]:
# Accuracy of the fitted classifier on the held-out rows.
model.score(X=X_test, y=y_test)

0.5317460317460317

2. LR with selected technical features

In [95]:
# Reload the raw AAPL price history so this section starts from a clean frame.
df = pd.read_csv('data/AAPL_data.csv')
# dropna() returns a new frame instead of modifying df, so the result must be
# assigned back for rows with missing values to actually be removed.
df = df.dropna()
df

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2017-04-28,36.075001,35.817501,36.022499,35.912498,83441600.0,33.907143
1,2017-05-01,36.799999,36.240002,36.275002,36.645000,134411600.0,34.598736
2,2017-05-02,37.022499,36.709999,36.884998,36.877499,181408800.0,34.818253
3,2017-05-03,36.872501,36.067501,36.397499,36.764999,182788000.0,34.712040
4,2017-05-04,36.785000,36.452499,36.630001,36.632500,93487600.0,34.586937
...,...,...,...,...,...,...,...
1255,2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993
1256,2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005
1257,2022-04-26,162.339996,156.720001,162.250000,156.800003,95623200.0,156.800003
1258,2022-04-27,159.789993,155.380005,155.910004,156.570007,88063200.0,156.570007


In [96]:
# Parse the date strings and index the frame by date. 'Date' is also kept as a
# column because add_datepart reads it below.
df["Date"] = pd.to_datetime(df.Date, format="%Y-%m-%d")
df.index = df['Date']

# Sort chronologically and continue working on the sorted frame. Previously the
# sorted copy was stored only in `data` and never used, so the order-dependent
# steps that follow (indicator windows, shift-based labels, positional split)
# silently relied on the CSV already being in date order.
data = df.sort_index(ascending=True, axis=0)
df = data.copy()  # `data` stays available as the sorted, pre-feature snapshot

# Expand 'Date' into calendar columns (Year, Month, Week, Day, ...) in place.
add_datepart(df, 'Date', drop=False)
# Discard the 'Elapsed' column that add_datepart produced.
df = df.drop(columns='Elapsed')
df

Unnamed: 0_level_0,Date,High,Low,Open,Close,Volume,Adj Close,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-04-28,2017-04-28,36.075001,35.817501,36.022499,35.912498,83441600.0,33.907143,2017,4,17,28,4,118,False,False,False,False,False,False
2017-05-01,2017-05-01,36.799999,36.240002,36.275002,36.645000,134411600.0,34.598736,2017,5,18,1,0,121,False,True,False,False,False,False
2017-05-02,2017-05-02,37.022499,36.709999,36.884998,36.877499,181408800.0,34.818253,2017,5,18,2,1,122,False,False,False,False,False,False
2017-05-03,2017-05-03,36.872501,36.067501,36.397499,36.764999,182788000.0,34.712040,2017,5,18,3,2,123,False,False,False,False,False,False
2017-05-04,2017-05-04,36.785000,36.452499,36.630001,36.632500,93487600.0,34.586937,2017,5,18,4,3,124,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-22,2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993,2022,4,16,22,4,112,False,False,False,False,False,False
2022-04-25,2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005,2022,4,17,25,0,115,False,False,False,False,False,False
2022-04-26,2022-04-26,162.339996,156.720001,162.250000,156.800003,95623200.0,156.800003,2022,4,17,26,1,116,False,False,False,False,False,False
2022-04-27,2022-04-27,159.789993,155.380005,155.910004,156.570007,88063200.0,156.570007,2022,4,17,27,2,117,False,False,False,False,False,False


In [97]:
# Tell `ta` which of this frame's columns hold each price/volume input, then
# append every indicator column it provides.
ohlcv_columns = {
    "high": "High",
    "low": "Low",
    "open": "Open",
    "close": "Close",
    "volume": "Volume",
}
df = add_all_ta_features(df, **ohlcv_columns)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [98]:
# Hand-picked subset of the `ta` indicator columns.
selected_features = [
    'trend_sma_fast', 'trend_ema_fast',
    'momentum_stoch_rsi_k', 'momentum_stoch_rsi_d', 'momentum_rsi',
    'trend_macd', 'momentum_wr',
    'volume_adi', 'momentum_roc', 'volume_obv',
    'volatility_bbh', 'volatility_bbl',
]
# Raw price/volume columns plus a few calendar columns derived from the date.
basic_features = [
    'High', 'Low', 'Open', 'Volume',
    'Year', 'Month', 'Week', 'Day', 'Dayofweek',
]

**Note: the technical-indicator columns contain many missing values that must be handled before fitting.**
- For now they are filled with 0; the imputation strategy can be revisited later.

In [99]:
# Replace every missing value with 0 in one vectorised call.
# DataFrame.iteritems() was removed in pandas 2.0, so the former per-column
# loop raises AttributeError on current pandas.
df = df.fillna(0)

# Features: selected technical indicators plus the basic price/calendar columns.
X = df[selected_features + basic_features]

# Target: +1 when the next row's close is above this row's close, otherwise -1.
# NOTE(review): the final row has no next close (shift(-1) yields NaN there), so
# the comparison is False and that row is always labelled -1.
y = np.where(df['Close'].shift(-1) > df['Close'], 1, -1)

In [100]:
# Positional 80/20 split with no shuffling: the leading rows train, the trailing rows test.
split = int(0.8 * len(df))
X_train, y_train = X.iloc[:split], y[:split]
X_test, y_test = X.iloc[split:], y[split:]

In [101]:
# Sanity check: row counts of the training and test partitions.
n_train, n_test = len(X_train), len(X_test)
print(n_train, n_test)

1008 252


In [103]:
# max_iter must be an integer: float('inf') is rejected by scikit-learn's
# parameter validation. A very large integer cap keeps the original intent of
# letting the solver run until it converges.
model = LogisticRegression(max_iter=1_000_000)
model = model.fit(X_train, y_train)

In [104]:
# Accuracy of the fitted classifier on the held-out rows.
model.score(X=X_test, y=y_test)

0.5515873015873016

3. LR with full set of features

In [105]:
# Use every remaining column as a feature. `y` is not rebuilt here; the labels
# computed in the previous section are reused.
excluded_columns = ['Close', 'trend_psar_down', 'trend_psar_up', 'Date', 'Adj Close']
X = df.drop(columns=excluded_columns)

In [106]:
# Positional 80/20 split with no shuffling: the leading rows train, the trailing rows test.
split = int(0.8 * len(df))
X_train, y_train = X.iloc[:split], y[:split]
X_test, y_test = X.iloc[split:], y[split:]

In [108]:
# max_iter must be an integer: float('inf') is rejected by scikit-learn's
# parameter validation. A very large integer cap keeps the original intent of
# letting the solver run until it converges.
model = LogisticRegression(max_iter=1_000_000)
model = model.fit(X_train, y_train)

In [109]:
# Accuracy of the fitted classifier on the held-out rows.
model.score(X=X_test, y=y_test)

0.5079365079365079

**Ensemble Methods**  
- Note: cross-validation is not currently used for the ensemble methods because the dataset is larger; this can be revisited later.

1. APPLE

In [110]:
# Reload the raw AAPL price history so this section starts from a clean frame.
df = pd.read_csv('data/AAPL_data.csv')
# dropna() returns a new frame instead of modifying df, so the result must be
# assigned back for rows with missing values to actually be removed.
df = df.dropna()
df

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2017-04-28,36.075001,35.817501,36.022499,35.912498,83441600.0,33.907143
1,2017-05-01,36.799999,36.240002,36.275002,36.645000,134411600.0,34.598736
2,2017-05-02,37.022499,36.709999,36.884998,36.877499,181408800.0,34.818253
3,2017-05-03,36.872501,36.067501,36.397499,36.764999,182788000.0,34.712040
4,2017-05-04,36.785000,36.452499,36.630001,36.632500,93487600.0,34.586937
...,...,...,...,...,...,...,...
1255,2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993
1256,2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005
1257,2022-04-26,162.339996,156.720001,162.250000,156.800003,95623200.0,156.800003
1258,2022-04-27,159.789993,155.380005,155.910004,156.570007,88063200.0,156.570007


In [111]:
# Parse the date strings and index the frame by date. 'Date' is also kept as a
# column because add_datepart reads it below.
df["Date"] = pd.to_datetime(df.Date, format="%Y-%m-%d")
df.index = df['Date']

# Sort chronologically and continue working on the sorted frame. Previously the
# sorted copy was stored only in `data` and never used, so the order-dependent
# steps that follow (indicator windows, shift-based labels, positional split)
# silently relied on the CSV already being in date order.
data = df.sort_index(ascending=True, axis=0)
df = data.copy()  # `data` stays available as the sorted, pre-feature snapshot

# Expand 'Date' into calendar columns (Year, Month, Week, Day, ...) in place.
add_datepart(df, 'Date', drop=False)
# Discard the 'Elapsed' column that add_datepart produced.
df = df.drop(columns='Elapsed')
df

Unnamed: 0_level_0,Date,High,Low,Open,Close,Volume,Adj Close,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-04-28,2017-04-28,36.075001,35.817501,36.022499,35.912498,83441600.0,33.907143,2017,4,17,28,4,118,False,False,False,False,False,False
2017-05-01,2017-05-01,36.799999,36.240002,36.275002,36.645000,134411600.0,34.598736,2017,5,18,1,0,121,False,True,False,False,False,False
2017-05-02,2017-05-02,37.022499,36.709999,36.884998,36.877499,181408800.0,34.818253,2017,5,18,2,1,122,False,False,False,False,False,False
2017-05-03,2017-05-03,36.872501,36.067501,36.397499,36.764999,182788000.0,34.712040,2017,5,18,3,2,123,False,False,False,False,False,False
2017-05-04,2017-05-04,36.785000,36.452499,36.630001,36.632500,93487600.0,34.586937,2017,5,18,4,3,124,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-22,2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993,2022,4,16,22,4,112,False,False,False,False,False,False
2022-04-25,2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005,2022,4,17,25,0,115,False,False,False,False,False,False
2022-04-26,2022-04-26,162.339996,156.720001,162.250000,156.800003,95623200.0,156.800003,2022,4,17,26,1,116,False,False,False,False,False,False
2022-04-27,2022-04-27,159.789993,155.380005,155.910004,156.570007,88063200.0,156.570007,2022,4,17,27,2,117,False,False,False,False,False,False


In [112]:
# Tell `ta` which of this frame's columns hold each price/volume input, then
# append every indicator column it provides.
ohlcv_columns = {
    "high": "High",
    "low": "Low",
    "open": "Open",
    "close": "Close",
    "volume": "Volume",
}
df = add_all_ta_features(df, **ohlcv_columns)

# Hand-picked subset of the `ta` indicator columns.
selected_features = [
    'trend_sma_fast', 'trend_ema_fast',
    'momentum_stoch_rsi_k', 'momentum_stoch_rsi_d', 'momentum_rsi',
    'trend_macd', 'momentum_wr',
    'volume_adi', 'momentum_roc', 'volume_obv',
    'volatility_bbh', 'volatility_bbl',
]
# Raw price/volume columns plus a few calendar columns derived from the date.
basic_features = [
    'High', 'Low', 'Open', 'Volume',
    'Year', 'Month', 'Week', 'Day', 'Dayofweek',
]

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [113]:
# Replace every missing value with 0 in one vectorised call.
# DataFrame.iteritems() was removed in pandas 2.0, so the former per-column
# loop raises AttributeError on current pandas.
df = df.fillna(0)

# Features: selected technical indicators plus the basic price/calendar columns.
X = df[selected_features + basic_features]

# Target: +1 when the next row's close is above this row's close, otherwise -1.
# NOTE(review): the final row has no next close (shift(-1) yields NaN there), so
# the comparison is False and that row is always labelled -1.
y = np.where(df['Close'].shift(-1) > df['Close'], 1, -1)

In [114]:
# Positional 80/20 split with no shuffling: the leading rows train, the trailing rows test.
split = int(0.8 * len(df))
X_train, y_train = X.iloc[:split], y[:split]
X_test, y_test = X.iloc[split:], y[split:]

In [73]:
def random_forest(X_train, X_test, y_train, y_test, n_estimators=100, random_state=42):
    '''
    Fit a random forest classifier and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``fit``.
    X_test, y_test
        Held-out features and labels, passed to ``score``.
    n_estimators : int, default 100
        Forwarded to ``RandomForestClassifier``.
    random_state : int, default 42
        Forwarded to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(model, score, feature_importances)``: the fitted classifier, the
        value of ``model.score(X_test, y_test)``, and
        ``model.feature_importances_``.
    '''
    model = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators)
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return model, test_score, model.feature_importances_

In [60]:
def adaboost(X_train, X_test, y_train, y_test, n_estimators=100, random_state=42):
    '''
    Fit an AdaBoost classifier and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``fit``.
    X_test, y_test
        Held-out features and labels, passed to ``score``.
    n_estimators : int, default 100
        Forwarded to ``AdaBoostClassifier``.
    random_state : int, default 42
        Forwarded to ``AdaBoostClassifier``.

    Returns
    -------
    tuple
        ``(model, score, feature_importances)``: the fitted classifier, the
        value of ``model.score(X_test, y_test)``, and
        ``model.feature_importances_``.
    '''
    model = AdaBoostClassifier(random_state=random_state, n_estimators=n_estimators)
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return model, test_score, model.feature_importances_

In [61]:
def gradient_boost(X_train, X_test, y_train, y_test, n_estimators=100, random_state=42):
    '''
    Fit a gradient boosting classifier and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``fit``.
    X_test, y_test
        Held-out features and labels, passed to ``score``.
    n_estimators : int, default 100
        Forwarded to ``GradientBoostingClassifier``.
    random_state : int, default 42
        Forwarded to ``GradientBoostingClassifier``.

    Returns
    -------
    tuple
        ``(model, score, feature_importances)``: the fitted classifier, the
        value of ``model.score(X_test, y_test)``, and
        ``model.feature_importances_``.
    '''
    model = GradientBoostingClassifier(random_state=random_state, n_estimators=n_estimators)
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return model, test_score, model.feature_importances_

In [121]:
# Fit and evaluate the random forest on the current split; the fitted model
# itself is discarded, only the score and feature importances are kept.
_, score_rf, fi_rf = random_forest(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
print('random forest accuracy is, ', score_rf)

random forest accuracy is,  0.5079365079365079


In [122]:
# Fit and evaluate AdaBoost on the current split; the fitted model itself is
# discarded, only the score and feature importances are kept.
_, score_ada, fi_ada = adaboost(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
print('adaboosting accuracy is, ', score_ada)

adaboosting accuracy is,  0.5


In [124]:
# Fit and evaluate gradient boosting on the current split; the fitted model
# itself is discarded, only the score and feature importances are kept.
_, score_gb, fi_gb = gradient_boost(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)
print('gradient boosting accuracy is, ', score_gb)

gradient boosting accuracy is,  0.5277777777777778
