In [1]:
import sys
sys.path.append("../") # go to parent dir
from util.read_data import DataReader
from util.evaluator import ModelEvaluator


In [2]:
reader = DataReader()
df = reader.read_all_data_normalized()
evaluator = ModelEvaluator(reader.label_scaler)
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'DayOfWeek', 'Month',
       'AdjClose', 'OpenDiff', 'CloseDiff', 'AdjCloseDiff', 'HighLowDiff'],
      dtype='object')

In [3]:
df_test_features, series_test_volume = reader.get_test_data(df)


Baseline model: $\hat{E}[v_{d+1}]=v_d$

In [4]:

def baseline_model(volume):
    pred_volume = volume.shift(1)
    return pred_volume


In [5]:
baseline_pred_volume = baseline_model(series_test_volume)
baseline_model_results = evaluator.evaluate("baseline", series_test_volume[1:], baseline_pred_volume[1:])
print(baseline_model_results)

baseline: MSE = 3.684400e+17, R2 = 0.033, confidence interval 95% = (-53,069,427 - 53,341,067)


The baseline r2 score is **0.033**

Another simple model would use moving average over some time window.
The model is following: $V_{d+1}=\frac 1n \sum_{i=0}^{n-1} V_{d-i}$.
We can try multiple n values, le'ts try following values: {1, 3, 5, 10, 50, 100}



In [6]:
def moving_average_model(volume, n):
    pred_volume = volume.rolling(n).mean().shift(1)
    return pred_volume

In [7]:
window_size = [1,3,5,10,50,100]
moving_average_result_list = [evaluator.evaluate("moving_average {}".format(n), 
                                                 moving_average_model(series_test_volume, n)[n:], series_test_volume[n:]) 
                                                 for n in window_size]

for result in moving_average_result_list:
    print(result)


moving_average 1: MSE = 3.684400e+17, R2 = 0.033, confidence interval 95% = (-53,341,067 - 53,069,427)
moving_average 3: MSE = 3.099159e+17, R2 = -0.239, confidence interval 95% = (-48,259,465 - 49,530,282)
moving_average 5: MSE = 3.128615e+17, R2 = -0.523, confidence interval 95% = (-45,836,522 - 52,614,748)
moving_average 10: MSE = 3.298599e+17, R2 = -1.303, confidence interval 95% = (-39,324,762 - 62,279,075)
moving_average 50: MSE = 3.892483e+17, R2 = -6.839, confidence interval 95% = (-23,387,014 - 91,775,513)
moving_average 100: MSE = 4.390438e+17, R2 = -23.992, confidence interval 95% = (-11,765,661 - 117,942,651)


The moving average models for window size <3, 50> does not help much. The window size 1 at least confirms that te algorithm has correct implementation.

Le'ts try to use exponential moving average, 

In [10]:
def exp_moving_average_model(volume, n):
    pred_volume = volume.ewm(span=n).mean().shift(1)
    return pred_volume



In [15]:
window_size = [1,3,5,10,50,100]
exp_moving_average_result_list = [evaluator.evaluate("moving_average {}".format(n),
                                                 exp_moving_average_model(series_test_volume, n)[n:], series_test_volume[n:])
                                                 for n in window_size]

for result in exp_moving_average_result_list:
    print(result)
data 

moving_average 1: MSE = 3.684400e+17, R2 = 0.033, confidence interval 95% = (-53,341,067 - 53,069,427)
moving_average 3: MSE = 2.880625e+17, R2 = -0.213, confidence interval 95% = (-46,252,592 - 48,026,359)
moving_average 5: MSE = 2.845396e+17, R2 = -0.489, confidence interval 95% = (-42,536,855 - 51,352,553)
moving_average 10: MSE = 2.953661e+17, R2 = -1.156, confidence interval 95% = (-38,026,698 - 58,118,071)
moving_average 50: MSE = 3.594106e+17, R2 = -6.870, confidence interval 95% = (-25,767,251 - 84,893,406)
moving_average 100: MSE = 4.073537e+17, R2 = -18.608, confidence interval 95% = (-20,472,852 - 104,466,639)


Exponential moving average on the Volume does not help much either.

Let's not try to use first real machine-learning model, random forest classifier. The features will be month of year, day of week, difference between low and high price of previous day and historical parametrized window of following market data: volume data, and changes in the adjusted closing price. Size of the window will be: {3, 5, 10, 50, 100}. The expected result is that the precision will be better with longer time window, but with deminishing returns.

In [19]:
from scipy.stats import ttest_ind
from sklearn.ensemble import RandomForestRegressor

In [63]:
def random_forest_model(features, volume):
    regressor = RandomForestRegressor(n_estimators=100)
    
    regressor.fit(features, volume)
    return regressor


def eval_random_forest_model(n):
    # prepare the features
    df_enriched = reader.prepare_window_features_for_training(df, n)[n:]

    train_features, train_volume = reader.get_train_data(df_enriched)
    test_features, test_volume = reader.get_test_data(df_enriched)

    regressor = random_forest_model(train_features, train_volume)

   
    importances = list(zip(train_features.columns, regressor.feature_importances_))
    importances.sort(key=lambda x: x[1], reverse = True)
    print("First 5 Feature importances {}:".format(n))
    for importance in importances[:5]:
        print('Feature: {:30} \t Importance: {}'.format(importance[0], importance[1]))

    print(evaluator.evaluate("random_forest {} on train: ".format(n), regressor.predict(train_features), train_volume))
    random_forest_result = evaluator.evaluate("random_forest {} on test:".format(n), regressor.predict(test_features), test_volume)
    print(random_forest_result)
    print()



In [88]:
for n in (1,3,5,10,50,100):
    eval_random_forest_model(n)

First 5 Feature importances 1:
Feature: Volume1                        	 Importance: 0.9041325791077832
Feature: AdjCloseDiff1                  	 Importance: 0.04137267633802904
Feature: DayOfWeek_Monday               	 Importance: 0.009704294101326996
Feature: Month_12                       	 Importance: 0.005220430501699296
Feature: DayOfWeek_Tuesday              	 Importance: 0.004610987541570389
random_forest 1 on train: : MSE = 4.856137e+16, R2 = 0.979, confidence interval 95% = (-7,464,480 - 5,745,820)
random_forest 1 on test:: MSE = 2.897712e+17, R2 = 0.060, confidence interval 95% = (-111,726,040 - -17,451,412)

First 5 Feature importances 3:
Feature: Volume1                        	 Importance: 0.8349494424003479
Feature: Volume2                        	 Importance: 0.06276836690472176
Feature: Volume3                        	 Importance: 0.03102640283688815
Feature: AdjCloseDiff1                  	 Importance: 0.016977103762859033
Feature: AdjCloseDiff2                  	 Imp

It is obvious from the comparison between train and test results that the model has high variance, it overfits to the train set. The decision tree regression model is not suitable for the task.  is Le't see SVM model, for comparison.

In [77]:
from sklearn.svm import SVR

In [97]:
def svm_model(features, volume):
    regressor = SVR()
    
    regressor.fit(features, volume)
    return regressor


def eval_svm_model(n):
    # prepare the features
    df_enriched = reader.prepare_window_features_for_training(df, n)[n:]

    train_features, train_volume = reader.get_train_data(df_enriched)
    test_features, test_volume = reader.get_test_data(df_enriched)

    regressor = svm_model(train_features, train_volume)

    print(evaluator.evaluate("random_forest {} on train: ".format(n), regressor.predict(train_features), train_volume))
    random_forest_result = evaluator.evaluate("random_forest {} on test:".format(n), regressor.predict(test_features), test_volume)
    print(random_forest_result)
    print()

In [98]:
for n in (1, 2, 3,5,10,50,100):
    eval_svm_model(n)



random_forest 1 on train: : MSE = 2.987715e+17, R2 = 0.857, confidence interval 95% = (22,203,979 - 54,970,986)
random_forest 1 on test:: MSE = 2.841244e+17, R2 = 0.135, confidence interval 95% = (-27,130,374 - 66,221,161)





random_forest 2 on train: : MSE = 2.579920e+17, R2 = 0.877, confidence interval 95% = (18,432,532 - 48,884,906)
random_forest 2 on test:: MSE = 2.460071e+17, R2 = 0.107, confidence interval 95% = (-15,977,399 - 70,886,847)





random_forest 3 on train: : MSE = 2.479718e+17, R2 = 0.882, confidence interval 95% = (14,772,783 - 44,631,416)
random_forest 3 on test:: MSE = 2.466826e+17, R2 = 0.048, confidence interval 95% = (-14,763,232 - 72,220,184)





random_forest 5 on train: : MSE = 2.272681e+17, R2 = 0.892, confidence interval 95% = (15,414,757 - 44,006,437)
random_forest 5 on test:: MSE = 2.611143e+17, R2 = 0.006, confidence interval 95% = (-32,201,483 - 57,290,172)





random_forest 10 on train: : MSE = 2.040400e+17, R2 = 0.903, confidence interval 95% = (12,209,797 - 39,316,855)
random_forest 10 on test:: MSE = 2.597789e+17, R2 = -0.081, confidence interval 95% = (-47,058,810 - 42,203,708)





random_forest 50 on train: : MSE = 1.565243e+17, R2 = 0.924, confidence interval 95% = (12,105,240 - 35,959,218)
random_forest 50 on test:: MSE = 2.750435e+17, R2 = -0.960, confidence interval 95% = (-38,804,714 - 53,042,907)





random_forest 100 on train: : MSE = 1.463036e+17, R2 = 0.926, confidence interval 95% = (14,169,781 - 37,369,423)
random_forest 100 on test:: MSE = 2.787748e+17, R2 = -0.902, confidence interval 95% = (-66,944,709 - 25,523,821)



The best SVM model is again the least overfitted, with only latest value of Volume and AdjCloseDiff1, and seasonality features (DayOfWeek, Month). The R2 = 0.135, with confidence interval 95% of average error = (-27,130,374 - 66,221,161)

It seem that we are not moving forward a lot with traditional ML models. 