In [187]:
import numpy as np
import pandas as pd
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, Ridge, Lasso
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, HuberRegressor, PassiveAggressiveRegressor, Lars
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import copy


### Load the Data

In [2]:
# Forward-slash paths resolve on Windows, macOS and Linux alike
# (backslash-separated paths only resolve on Windows).
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
X_train = pd.read_pickle("data/milestone_data_X_train.pkl")
y_train = pd.read_pickle("data/milestone_data_y_train.pkl")
X_test = pd.read_pickle("data/milestone_data_X_test.pkl")
y_test = pd.read_pickle("data/milestone_data_y_test.pkl")

In [3]:
# Forward-slash path so the notebook also runs on macOS/Linux, not just Windows.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
X_val = pd.read_pickle("data/milestone_data_X_val.pkl")

In [4]:
# Report the dimensions of each loaded split.
for label, data in (
    ("X_train shape", X_train),
    ("y_train shape", y_train),
    ("X_test shape", X_test),
    ("y_test shape", y_test),
):
    print(label, data.shape)

X_train shape (221122, 1095)
y_train shape (221122,)
X_test shape (32257, 1095)
y_test shape (32257,)


### Data Preprocessing
Check there are no nulls

In [5]:
# Boolean frame marking every missing cell, reduced to one flag per column.
is_NaN = X_train.isna()
col_has_NaN = is_NaN.any(axis=0)
# Keep only the names of the columns flagged as containing a missing value.
col_has_NaN = col_has_NaN[col_has_NaN].index.to_list()
col_has_NaN

[]

Identify numeric columns

In [6]:
# Keep only the numeric columns of the training features.
# NOTE(review): _get_numeric_data is a private pandas method (leading underscore).
# The public DataFrame.select_dtypes may be a safer choice - confirm it selects
# exactly the same columns before switching.
X_train_numeric = X_train._get_numeric_data()

Drop columns that contain infinity or negative infinity

In [7]:
# One boolean per column: True when the column holds +inf or -inf anywhere.
inf_flags = np.isinf(X_train_numeric).any()
col_has_inf = inf_flags[inf_flags].index.to_list()
col_has_inf

['volume_adi', 'trend_vortex_ind_pos', 'trend_vortex_ind_neg']

In [8]:
# Remove the columns that were found to contain infinite values.
X_train_numeric = X_train_numeric.drop(columns=col_has_inf)

In [9]:
# Names of the columns the model will be trained on.
cols = list(X_train_numeric.columns)

### Modeling

In [10]:
%%time
# Baseline model: Bayesian ridge regression fitted on the cleaned numeric training features.
# compute_score=True asks scikit-learn to record the log marginal likelihood at each
# iteration of the optimisation (see the BayesianRidge documentation).
model = BayesianRidge(compute_score=True)
model.fit(X_train_numeric, y_train)

Wall time: 32.7 s


BayesianRidge(compute_score=True)

In [11]:
%%time
# Score both splits with the fitted model, restricted to the columns it was trained on.
y_pred_train, y_pred_test = (
    model.predict(split[cols]) for split in (X_train, X_test)
)

Wall time: 1.81 s


In [12]:
# Sanity check only: confirm that the model can be run on the validation features.
# The validation data is not scored here - it is reserved for a single evaluation at the very end.
y_pred_val = model.predict(X_val[cols])

### Scoring
First using a typical regression metric

In [13]:
from sklearn.metrics import mean_absolute_error

First the training score:

In [14]:
# Mean absolute error of the model's predictions on the training split.
mean_absolute_error(y_train, y_pred_train)

0.05871998787118553

Now the test score:

In [15]:
# Mean absolute error of the model's predictions on the held-out test split.
mean_absolute_error(y_test, y_pred_test)

0.06651413809583322

The test score is slightly worse than the train score but the difference does not seem huge.

Now converting to a classification metric. We will test for precision. We will first establish a benchmark by calculating what our precision would be if we simply predict that everything goes up.

In [16]:
from sklearn.metrics import precision_score

In [17]:
def pred_to_clf_pred(pred, threshold=1):
    """Turn continuous values into 0/1 class labels.

    An element is labelled 1 when it is strictly greater than ``threshold``
    and 0 otherwise. The result has the same container type as the
    comparison ``pred > threshold`` produces (e.g. ndarray or Series).
    """
    exceeds = pred > threshold
    # Multiplying the boolean mask by 1 yields integers instead of True/False.
    return exceeds * 1

In [18]:
# Binary targets: 1 where the actual value is above 1, otherwise 0.
y_train_clf, y_test_clf = (
    pred_to_clf_pred(actual, threshold=1) for actual in (y_train, y_test)
)

In [19]:
# Naive "everything goes up" predictions: a 1 for every observation in each split.
y_all_up_train = [1 for _ in range(len(y_train_clf))]
y_all_up_test = [1 for _ in range(len(y_test_clf))]

In [20]:
# Benchmark return: the mean target minus 1, i.e. what selecting every observation yields.
benchmark_return_train = np.mean(y_train) - 1
benchmark_return_test = np.mean(y_test) - 1
# Benchmark precision: the precision obtained when every observation is predicted positive.
benchmark_precision_train_score = precision_score(y_true=y_train_clf, y_pred=y_all_up_train)
benchmark_precision_test_score = precision_score(y_true=y_test_clf, y_pred=y_all_up_test)

In [21]:
def print_benchmarks():
    """Print the benchmark precision and return for train and test.

    Reads the notebook-level ``benchmark_*`` variables and shows each one
    rounded to 5 decimal places.
    """
    print("Benchmark figures over 20 day prediction horizon:")
    rows = (
        ("benchmark precision train:", benchmark_precision_train_score),
        ("benchmark precision test:", benchmark_precision_test_score),
        ("benchmark return train:", benchmark_return_train),
        ("benchmark return test:", benchmark_return_test),
    )
    for label, value in rows:
        print(label, round(value, 5))

In [22]:
# Show the "predict everything goes up" baseline figures that the model has to beat.
print_benchmarks()

Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214


This is the benchmark precision if we predict that everything goes up. Now we will test the precision of our model. We can see that market conditions in the test period were noticeably different from those in the training period, as stocks went up a lot more frequently over 20-day trading horizons.

In [23]:
#y_pred_clf_train = pred_to_clf_pred(y_pred_train, threshold=1)
#y_pred_clf = pred_to_clf_pred(y_pred_test, threshold=1) #(y_pred_test > 1) * 1

We will check the precision score on our training data.

In [24]:
def threshold_precision(y, pred, percentile):
    """Precision when only predictions above a percentile cut-off count as positive.

    Parameters
    ----------
    y : array-like
        Ground-truth class labels (callers in this notebook pass 0/1 labels).
    pred : array-like
        Continuous predictions.
    percentile : float
        Percentile of ``pred`` used as the cut-off; only predictions strictly
        above it are labelled 1.

    Returns
    -------
    float
        The precision score rounded to 5 decimal places.
    """
    cutoff = np.percentile(pred, percentile)
    positives = (pred > cutoff) * 1
    return round(precision_score(y, positives), 5)

In [25]:
def full_and_threshold_scoring(y, y_pred, percentile):
    """Score predictions at the default cut-off (1) and at a percentile cut-off.

    Parameters
    ----------
    y : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values.
    percentile : float
        Percentile of ``y_pred`` used as the stricter cut-off.

    Returns
    -------
    dict
        ``default_precision`` / ``default_return`` for predictions above 1 and
        ``threshold_precision`` / ``threshold_return`` for predictions above
        the percentile cut-off. Returns are the mean of the selected actual
        values minus 1; every figure is rounded to 5 decimals.
    """
    cutoff = np.percentile(y_pred, percentile)
    actual_clf = pred_to_clf_pred(y, threshold=1)
    predicted_clf = pred_to_clf_pred(y_pred, threshold=1)

    return {
        'default_precision': round(precision_score(actual_clf, predicted_clf), 5),
        'default_return': round(np.mean(y[y_pred > 1]) - 1, 5),
        'threshold_precision': threshold_precision(actual_clf, y_pred, percentile),
        'threshold_return': round(np.mean(y[y_pred > cutoff]) - 1, 5),
    }

We will calculate model performance on train and test using the metrics specified.

In [26]:
# Benchmarks first, then the model's scores on each split at the 95th percentile cut-off.
print_benchmarks()
for label, actual, predicted in (
    ("train:", y_train, y_pred_train),
    ("test: ", y_test, y_pred_test),
):
    print(label, full_and_threshold_scoring(actual, predicted, 95))


Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214
train: {'default_precision': 0.57186, 'default_return': 0.01559, 'threshold_precision': 0.70779, 'threshold_return': 0.08725}
test:  {'default_precision': 0.57052, 'default_return': 0.01887, 'threshold_precision': 0.52821, 'threshold_return': 0.01978}


#### Training Results
Here we can see that on our training data we achieved an improvement over the benchmark for both precision and return at the default threshold, i.e. when we predict that a stock goes up whenever the model predicts a price increase. When we made the threshold more restrictive, so that we only predicted a stock would go up for our highest 5% of predictions, the precision and return on the train data vastly outperformed the benchmark.
#### Test Results
On the test data we did not beat the benchmark when we classified every prediction of a price increase as "the stock goes up". When we only indicated that the stock would go up for our highest 5% of predictions, our precision was even worse, although the average return was slightly better than at the default threshold. Now we will need to use some supervised learning techniques to select features, and try different models, to improve on this precision score and aim to get it above the benchmark. The issue is likely to be partly overfitting and partly model drift: as we saw from the benchmark precision, market conditions were very different in the test period from those in the training period.

### Engineering Features
Given the large number of features already generated and retrieved, it is not clear how much feature engineering will add. However, we will at least generate features that show when volume is above average, because in the financial domain price moves are widely thought to be more meaningful, and more likely to continue, when accompanied by heavy volume.

In [27]:
def feature_engineering(df):
    """Add relative-volume ratio columns to ``df`` and return it.

    The frame is modified in place: three columns are appended, each the
    element-wise ratio of two existing volume columns
    (``Volume``, ``Volume_MA50``, ``Volume_MA200``).
    """
    ratio_specs = (
        ('Volume_over_Volume_MA50', 'Volume', 'Volume_MA50'),
        ('Volume_over_Volume_MA200', 'Volume', 'Volume_MA200'),
        ('Volume_MA50_over_Volume_MA200', 'Volume_MA50', 'Volume_MA200'),
    )
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = df[numerator] / df[denominator]

    return df

In [28]:
# Apply the same engineered features to every split: a model that uses one of the
# engineered columns can only be run on a split that also contains that column.
X_train_numeric = feature_engineering(X_train_numeric)
X_test = feature_engineering(X_test)
# The validation split gets the same columns so the final model can be scored on it.
X_val = feature_engineering(X_val)

### Feature Selection
We saw in the earlier notebook that modeling with just a few features achieved better results than using all 1000+ features. Here we will start with a tiny number of features and iteratively try adding (& later dropping) features to improve model performance.

In [34]:
# Refresh the candidate column list from the current X_train_numeric
# (which by now also holds the engineered volume-ratio columns).
cols = X_train_numeric.columns.to_list()

In [181]:
# NOTE(review): this longer list is overwritten by the very next statement, so it has no
# effect on the search - presumably a selection from an earlier run kept for reference; confirm.
selected_cols = ['volatility_kcl', 'Open', 'High', 'Low', 'Volume', 'Volume_MA200', 'SR', 'ValueRank', 'MomentumRank', 'Price_to_Book_Latest', 'Sales_Growth_TTM_pct', 'volume_sma_em', 'volume_nvi', 'volume_vwap', 'volatility_bbhi', 'volatility_kcw', 'volatility_kchi', 'volatility_dch', 'trend_mass_index', 'trend_ichimoku_a', 'trend_psar_down', 'trend_stc', 'momentum_stoch_signal', 'momentum_kama', 'AROONOSC_14', 'MOM_40', 'ROC_60', 'ROCP_60', 'RSI_60', 'ROCR100_90', 'MACDEXT_macdhist_f3_s21_sig55', 'MACDEXT_macdhist_f8_s34_sig5', 'MACDEXT_macdhist_f8_s89_sig34', 'MACDEXT_macd_f34_s55_sig5', 'EPS_Growth_TTM_pct_NaN', 'Volume_over_Volume_MA50', 'ROE_TTM_pct', 'trend_vortex_ind_diff', 'PLUS_DI_3', 'AROONOSC_20', 'PLUS_DI_20', 'WILLR_40', 'AROONup_60', 'MACDEXT_macdhist_f2_s34_sig55', 'volume_cmf']
# Starting point actually used by the feature search: a single seed feature.
selected_cols = ['SR']

In [182]:
# Best threshold returns found so far by the feature search. Starting from 0 means a
# candidate must achieve a positive return on both splits to be accepted.
threshold_return_test = 0
threshold_return_train = 0

In [183]:
def run_model(current_selection):
    """Fit a PassiveAggressiveRegressor on the given columns and score both splits.

    Reads the notebook-level ``X_train_numeric``, ``X_test``, ``y_train`` and
    ``y_test``.

    Parameters
    ----------
    current_selection : list of str
        Names of the columns to use as features.

    Returns
    -------
    tuple of (dict, dict)
        ``(train_scores, test_scores)`` as produced by
        ``full_and_threshold_scoring`` with the 95th percentile cut-off.
    """
    # Seed NumPy's global RNG before every fit; the estimator below is created
    # without a random_state of its own.
    np.random.seed(0)
    # Other estimators tried in this slot: RandomForestRegressor(max_depth=3),
    # LinearRegression() and MLPRegressor().
    model = PassiveAggressiveRegressor()
    model.fit(X_train_numeric[current_selection], y_train)

    # Predict on both splits using exactly the columns the model was fitted on.
    y_pred_fundamental_train = model.predict(X_train_numeric[current_selection])
    y_pred_fundamental_test = model.predict(X_test[current_selection])

    # full_and_threshold_scoring does its own conversion to class labels,
    # so no separate classification arrays are needed here.
    train_scores = full_and_threshold_scoring(y_train, y_pred_fundamental_train, 95)
    test_scores = full_and_threshold_scoring(y_test, y_pred_fundamental_test, 95)

    return train_scores, test_scores


In [184]:
%%time

# Greedy feature search: on each pass every column is toggled in or out of the
# selection, and the change is kept when it improves the threshold return.
# NOTE(review): candidates are accepted on their *test* return, so the test figures
# reported for the final selection are optimistically biased by this search.
loop_best = 0
for pass_idx in range(20): # repeated passes give a chance to drop features that were useful originally but are not useful with the latest best feature set
    for c in tqdm(cols):
        # Work on a fresh copy so a rejected candidate never alters selected_cols.
        current_selection = list(selected_cols)
        if c in current_selection:
            if len(current_selection) == 1:
                # c is the only selected feature: it cannot be dropped, and adding it
                # again would fit the model on a duplicated column - skip it.
                continue
            current_selection.remove(c)
        else:
            current_selection.append(c)

        train_scores, test_scores = run_model(current_selection)

        # Keep the candidate only when it beats the best return on both splits.
        if test_scores['threshold_return'] > threshold_return_test and train_scores['threshold_return'] > threshold_return_train:
            threshold_return_test = test_scores['threshold_return']
            threshold_return_train = train_scores['threshold_return']
            selected_cols = current_selection

    print("train return at threshold", threshold_return_train, "test return at threshold", threshold_return_test)
    print("###############################################################")
    print(selected_cols)

    # Stop as soon as a full pass leaves the best test return unchanged.
    if loop_best == threshold_return_test:
        print("early stopping no improvement")
        break
    loop_best = threshold_return_test

    

100%|██████████████████████████████████████████████████████████████████████████████| 1092/1092 [10:09<00:00,  1.79it/s]
  0%|                                                                                         | 0/1092 [00:00<?, ?it/s]

train return at threshold 0.05668 test return at threshold 0.05113
###############################################################
['Open', 'Adj. close**', 'volatility_bbm', 'trend_macd', 'PLUS_DM_10', 'MACDEXT_macdhist_f5_s8_sig34']


100%|██████████████████████████████████████████████████████████████████████████████| 1092/1092 [10:56<00:00,  1.66it/s]

train return at threshold 0.05668 test return at threshold 0.05113
###############################################################
['Open', 'Adj. close**', 'volatility_bbm', 'trend_macd', 'PLUS_DM_10', 'MACDEXT_macdhist_f5_s8_sig34']
early stopping no improvement
Wall time: 21min 6s





In [185]:
# Re-fit on the final selected feature set to obtain its train and test scores.
train_scores, test_scores = run_model(selected_cols)

In [186]:
# Benchmarks first, then the scores of the model fitted on the selected features.
print_benchmarks()
for label, scores in (("train scores:", train_scores), ("test scores:", test_scores)):
    print(label, scores)

Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214
train scores: {'default_precision': 0.52807, 'default_return': 0.00672, 'threshold_precision': 0.60658, 'threshold_return': 0.05668}
test scores: {'default_precision': 0.57331, 'default_return': 0.02748, 'threshold_precision': 0.52945, 'threshold_return': 0.05113}


### Next Steps
Feature Selection and Feature Engineering