In [1]:
import sys
sys.path.append("../") # go to parent dir
from util.read_data import DataReader

In [5]:
reader = DataReader()
df = reader.read_all_data()
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'DayOfWeek', 'Month',
       'AdjClose', 'OpenDiff', 'CloseDiff', 'AdjCloseDiff', 'HighLowDiff'],
      dtype='object')

In [20]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import datetime
import pandas as pd

In [21]:
y_true = df.Volume
train, test = df[df.index < datetime.datetime(2017, 1, 1)], df[df.index >= datetime.datetime(2017, 1, 1)]

test_start_date = datetime.datetime(2017, 1, 1)
y_test_true = y_true[y_true.index >= test_start_date]


Baseline model: $\hat{E}[v_{d+1}]=v_d$

In [22]:

def baseline_model(df):
    y_true = df.Volume
    y_pred = df.Volume.shift(1)
    y_test_pred = y_pred[y_pred.index >= test_start_date]
    mse = mean_squared_error(y_test_true, y_test_pred)
    r2 = r2_score(y_test_true, y_test_pred)
    return (mse, r2)

In [23]:
baseline_model_metrics = baseline_model(df)
print("Baseline model metrics: mse={:e}, r2={:.3f}".format(baseline_model_metrics[0], baseline_model_metrics[1]))

Baseline model metrics: mse=3.701181e+17, r2=0.027


The baseline r2 score is **0.027**

Another simple model would use moving average over some time window.
The model is following: $V_{d+1}=\frac 1n \sum_{i=0}^{n-1} V_{d-i}$.
We can try multiple n values, le'ts try following values: {1, 3, 5, 10, 50, 100}



In [24]:
def moving_average_model(df, n):
    y_pred = df.Volume.rolling(n).mean().shift(1)
    y_test_pred = y_pred[y_pred.index >= test_start_date]
    mse = mean_squared_error(y_test_true, y_test_pred)
    r2 = r2_score(y_test_true, y_test_pred)
    return (mse, r2)

In [25]:
window_size = [1,3,5,10,50,100]
moving_average_models = [moving_average_model(df, n) for n in window_size]
for n, model in zip(window_size, moving_average_models):
    print("Moving average model metrics with for windows size {}: mse={:e}, r2={:.3f}".format(n, model[0], model[1]))


Moving average model metrics with for windows size 1: mse=3.701181e+17, r2=0.027
Moving average model metrics with for windows size 3: mse=3.131172e+17, r2=0.176
Moving average model metrics with for windows size 5: mse=3.180588e+17, r2=0.163
Moving average model metrics with for windows size 10: mse=3.310901e+17, r2=0.129
Moving average model metrics with for windows size 50: mse=3.631842e+17, r2=0.045
Moving average model metrics with for windows size 100: mse=3.828377e+17, r2=-0.007


The moving average models for window size <3, 50> already exceed baseline model. The window size 1 confirms that te algorithm has correct implementation.

Let's not try to use first real machine-learning model, random forest classifier. The features will be month of year, day of week and historical parametrized window of following market data: volume data, changes in closing price in time and difference between low and high price. Size of the window will be: {3, 5, 10, 50, 100}. The expected result is that the precision will be better with longer time window, but with deminishing returns.

In [14]:
from scipy.stats import ttest_ind
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [15]:
def scale_basic_features(df):
    df = df.copy()
    scale = StandardScaler()
    df[['CloseDifference', 'HighLowDifference']]= scale.fit_transform(df[['CloseDifference', 'HighLowDifference']].values) 
    return df

def scale_volume(df):
    df = df.copy()
    scale = StandardScaler()
    df[['Volume']]= scale.fit_transform(df[['Volume']].values) 
    return df, scale

def inverse_scale_volume(df, scale):
    df = df.copy()
    df[['Volume']]= scale.inverse_transform(df[['Volume']].values)
    return df

In [16]:
def enrich_features(df, n):
    df = df.copy()
    
    # add past window of volume as features
    for i in range(1, n + 1):
        df["Volume" + str(i)] = df.Volume.shift(i)
        df["CloseDifference" + str(i)] = df.CloseDifference.shift(i)
        df["HighLowDifference" + str(i)] = df.HighLowDifference.shift(i)

    # we cannot include following features, as we can use historical data from prediction only, 
    # not the market data from the same day
    df = df.drop(["CloseDifference", "HighLowDifference"], axis=1)
    # one-hot encoding of categorical data
    df = pd.get_dummies(df, columns=['DayOfWeek','Month'])
    df = df.dropna()
    return df

def random_forest_model(df):
    
    # initial values have nan
    df = df.dropna()
    
    regressor = RandomForestRegressor(n_estimators=100)
    
    x_train = df.drop(['Volume'], axis=1)[df.index < test_start_date]
    y_train = df.Volume[df.index < test_start_date]
    
    regressor.fit(x_train, y_train)
    
    x_test = df.drop(['Volume'], axis=1)[df.index >= test_start_date]
    y_test_pred = regressor.predict(x_test)
    y_test_pred = inverse_scale_volume(y_test_pred, scaler)
    
    y_test_true = df.Volume[df.index >= test_start_date]
    
    mse = mean_squared_error(y_test_true, y_test_pred)
    r2 = r2_score(y_test_true, y_test_pred)
    
    importances = list(zip(x_train.columns, regressor.feature_importances_))
    importances.sort(key=lambda x: x[1], reverse = True)
    
    print("Feature importances:")
    for importance in importances:
        print('Feature: {:30} \t Importance: {}'.format(importance[0], importance[1]))

    return(mse, r2)
    

In [17]:
df = scale_basic_features(df)
df_scaled, scaler = scale_volume(df)

rich_features_df = enrich_features(df_scaled, 10)
model = random_forest_model(rich_features_df)
print("Random forest classifier with for windows size {}: mse={:e}, r2={:.3f}".format(5, model[0], model[1]))

  from ipykernel import kernelapp as app


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
rich_features_df

NameError: name 'scale_basic_features' is not defined

In [154]:
df2, scaler = scale_volume(df)



In [155]:
scaler.inverse_transform(df2[['Volume']].values)

array([[3.74050e+08],
       [9.31800e+08],
       [1.00900e+09],
       ...,
       [4.23399e+09],
       [4.09661e+09],
       [3.70262e+09]])

In [161]:
df

Unnamed: 0_level_0,Volume,DayOfWeek,Month,CloseDifference,HighLowDifference
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1999-12-31,-1.791614,Friday,12,,
2000-01-03,-1.420593,Monday,1,-0.892619,0.092202
2000-01-04,-1.369239,Tuesday,1,-3.510409,-0.102567
2000-01-05,-1.318350,Wednesday,1,0.155250,-0.169910
2000-01-06,-1.313827,Thursday,1,0.070645,1.043566
2000-01-07,-1.225420,Friday,1,2.369446,2.043795
2000-01-10,-1.332120,Monday,1,0.997562,2.985263
2000-01-11,-1.365913,Tuesday,1,-1.206597,-0.080775
2000-01-12,-1.392122,Wednesday,1,-0.408795,-0.675638
2000-01-13,-1.355003,Thursday,1,1.079038,0.574815


In [6]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,DayOfWeek,Month,AdjClose,OpenDiff,CloseDiff,AdjCloseDiff,HighLowDiff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2000-01-03,1469.25,1478.0,1438.359985,1455.219971,931800000,Monday,1,1455.219971,4.780029,-14.030029,-14.030029,39.640015
2000-01-04,1455.219971,1455.219971,1397.430054,1399.420044,1009000000,Tuesday,1,1399.420044,-14.030029,-55.799927,-55.799927,57.789917
2000-01-05,1399.420044,1413.27002,1377.680054,1402.109985,1085500000,Wednesday,1,1402.109985,-55.799927,2.689941,2.689941,35.589966
2000-01-06,1402.109985,1411.900024,1392.099976,1403.449951,1092300000,Thursday,1,1403.449951,2.689941,1.339966,1.339966,19.800048
2000-01-07,1403.449951,1441.469971,1400.72998,1441.469971,1225200000,Friday,1,1441.469971,1.339966,38.02002,38.02002,40.739991
