In [1]:
import sys
sys.path.append("../") # go to parent dir
from util.read_data import DataReader

In [2]:
# Load the full data set through the project's DataReader helper (imported from
# util.read_data). Later cells read the `Date` and `Volume` columns of `df`.
reader = DataReader()
df = reader.read_all_data()

In [3]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import datetime

In [25]:
# First day of the hold-out (test) period; everything before it is training data.
# Defined once so the split and every model evaluation share the same boundary.
test_start_date = datetime.datetime(2017, 1, 1)

train = df[df.Date < test_start_date]
test = df[df.Date >= test_start_date]

# Ground truth for the evaluation: the observed volume restricted to the test period.
# `y_true` must exist at notebook level, otherwise this cell fails on a fresh kernel.
# NOTE(review): filtering on the index assumes `df` is indexed by date - confirm.
y_true = df.Volume
y_true_test = y_true[y_true.index >= test_start_date]


Baseline model: $\hat{E}[v_{d+1}]=v_d$

In [33]:

def baseline_model(df, test_start=None):
    """Score the naive forecast "next value equals the current value" on the test period.

    The prediction for each row is the previous row's ``Volume`` (``shift(1)``).
    Both the prediction and the ground truth are taken from ``df`` itself, so the
    function does not rely on any pre-computed notebook-level series.

    Parameters
    ----------
    df : DataFrame with a ``Volume`` column. Its index is compared against
        ``test_start``, so it is assumed to be date-like.
    test_start : first timestamp of the test period. Defaults to the
        notebook-level ``test_start_date``.

    Returns
    -------
    tuple ``(mse, r2)`` computed on rows whose index is ``>= test_start``.
    """
    if test_start is None:
        test_start = test_start_date

    y_true = df.Volume
    # shift(1) moves every value one row down: row d+1 is predicted by row d.
    y_pred = y_true.shift(1)

    # One mask for both series keeps truth and prediction aligned row by row.
    in_test = df.index >= test_start
    y_true_test = y_true[in_test]
    y_pred_test = y_pred[in_test]

    mse = mean_squared_error(y_true_test, y_pred_test)
    r2 = r2_score(y_true_test, y_pred_test)
    return (mse, r2)

In [42]:
# Evaluate the naive baseline; the result is an (mse, r2) pair.
baseline_model_metrics = baseline_model(df)
# Unpack the pair straight into the two format slots.
print("Baseline model metrics: mse={:e}, r2={:.3f}".format(*baseline_model_metrics))

Baseline model metrics: mse=3.701181e+17, r2=0.027


The baseline r2 score is **0.027**

Another simple model uses a moving average over a time window.
The model is the following: $\hat{E}[v_{d+1}]=\frac 1n \sum_{i=0}^{n-1} v_{d-i}$.
We can try multiple values of n; let's try the following: {1, 3, 5, 10, 50, 100}



In [56]:
def moving_average_model(df, n, test_start=None):
    """Score an n-row moving-average forecast of ``Volume`` on the test period.

    The prediction for each row is the mean of the ``n`` preceding rows'
    ``Volume`` (``rolling(n).mean()`` followed by ``shift(1)``). With ``n == 1``
    this is the same forecast as the baseline model.

    Parameters
    ----------
    df : DataFrame with a ``Volume`` column. Its index is compared against
        ``test_start``, so it is assumed to be date-like.
    n : window size in rows.
    test_start : first timestamp of the test period. Defaults to the
        notebook-level ``test_start_date`` so the boundary matches the baseline.

    Returns
    -------
    tuple ``(mse, r2)`` computed on rows whose index is ``>= test_start``.
    """
    if test_start is None:
        test_start = test_start_date

    y_true = df.Volume
    # rolling(n).mean() at row d averages rows d-n+1..d; shift(1) turns it into
    # the forecast for row d+1, so the prediction never sees the value it predicts.
    y_pred = y_true.rolling(n).mean().shift(1)

    # One mask for both series keeps truth and prediction aligned row by row.
    in_test = df.index >= test_start
    y_true_test = y_true[in_test]
    y_pred_test = y_pred[in_test]

    mse = mean_squared_error(y_true_test, y_pred_test)
    r2 = r2_score(y_true_test, y_pred_test)
    return (mse, r2)

In [68]:
# Window sizes to compare; n=1 reproduces the baseline and serves as a sanity check.
window_size = [1, 3, 5, 10, 50, 100]
# One (mse, r2) pair per window size, in the same order as `window_size`.
moving_average_models = [moving_average_model(df, n) for n in window_size]
for n, model in zip(window_size, moving_average_models):
    report = "Moving average model metrics with for windows size {}: mse={:e}, r2={:.3f}"
    print(report.format(n, *model))


Moving average model metrics with for windows size 1: mse=3.701181e+17, r2=0.027
Moving average model metrics with for windows size 3: mse=3.131172e+17, r2=0.176
Moving average model metrics with for windows size 5: mse=3.180588e+17, r2=0.163
Moving average model metrics with for windows size 10: mse=3.310901e+17, r2=0.129
Moving average model metrics with for windows size 50: mse=3.631842e+17, r2=0.045
Moving average model metrics with for windows size 100: mse=3.828377e+17, r2=-0.007


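The moving average models with window sizes from 3 to 50 already outperform the baseline model, while window size 100 performs worse (r2=-0.007). Window size 1 reproduces the baseline metrics exactly, which confirms that the algorithm is implemented correctly.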