In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, Ridge, Lasso, LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm
import copy

from utils_milestone2 import get_numeric_non_infinite_cols, add_pca_cols, scale_train_test, pca_train_test, run_model


In [2]:
Run_Feature_Selection = False

### Load the Data

In [3]:
X_train = pd.read_pickle("data\\milestone_data_X_train.pkl")
y_train = pd.read_pickle("data\\milestone_data_y_train.pkl")
X_test = pd.read_pickle("data\\milestone_data_X_test.pkl")
y_test = pd.read_pickle("data\\milestone_data_y_test.pkl")

In [4]:
X_val = pd.read_pickle("data\\milestone_data_X_val.pkl")

In [5]:
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)
print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (221122, 1095)
y_train shape (221122,)
X_test shape (32257, 1095)
y_test shape (32257,)


### Data Preprocessing
Check there are no nulls

In [6]:
is_NaN = X_train.isnull()
col_has_NaN = is_NaN.any(axis=0)
col_has_NaN = col_has_NaN.loc[col_has_NaN==True].index.to_list()
col_has_NaN

[]

Identify numeric columns

In [7]:
X_train_numeric = X_train._get_numeric_data()

Drop columns that contain infinity or negative infinity

In [8]:
col_has_inf = X_train_numeric.columns.to_series()[np.isinf(X_train_numeric).any()].to_list()
col_has_inf

['volume_adi', 'trend_vortex_ind_pos', 'trend_vortex_ind_neg']

In [9]:
X_train_numeric = X_train_numeric.drop(col_has_inf, axis=1)

In [10]:
cols = X_train_numeric.columns.to_list()
X_test_numeric = X_test[cols]

### Scoring
First using a typical regression metric

In [11]:
from sklearn.metrics import mean_absolute_error

Now converting to a classification metric. We will test for precision. We will first establish a benchmark by calculating what our precision would be if we simply predict that everything goes up.

In [12]:
from sklearn.metrics import precision_score

In [13]:
def pred_to_clf_pred(pred, threshold=1):
    pred_clf = (pred > threshold) * 1 # multiply by 1 to change to binary from True / False
    
    return pred_clf

In [14]:
y_train_clf = pred_to_clf_pred(y_train, threshold=1)
y_test_clf = pred_to_clf_pred(y_test, threshold=1)

In [15]:
y_all_up_train = [1] * len(y_train_clf)
y_all_up_test = [1] * len(y_test_clf)

In [16]:
benchmark_precision_train_score = precision_score(y_train_clf, y_all_up_train)
benchmark_precision_test_score = precision_score(y_test_clf, y_all_up_test)
benchmark_return_train = np.mean(y_train) - 1
benchmark_return_test = np.mean(y_test) - 1

In [17]:
def print_benchmarks():
    print("Benchmark figures over 20 day prediction horizon:")
    print("benchmark precision train:", round(benchmark_precision_train_score,5))
    print("benchmark precision test:", round(benchmark_precision_test_score,5))
    print("benchmark return train:", round(benchmark_return_train,5))
    print("benchmark return test:", round(benchmark_return_test,5))

In [18]:
print_benchmarks()

Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214


This is the benchmark precision if we predict everything goes up. Now we will test the precision of our model. We can see that market conditions in the test period were noticeably different to the training period as stocks were going up a lot more frequently over 20 day trading horizons.

In [19]:
#y_pred_clf_train = pred_to_clf_pred(y_pred_train, threshold=1)
#y_pred_clf = pred_to_clf_pred(y_pred_test, threshold=1) #(y_pred_test > 1) * 1

We will check the precision score on our training data.

In [20]:
def threshold_precision(y, pred, percentile):
    threshold = np.percentile(pred, percentile)
    pred_clf = (pred > threshold) * 1
    threshold_precision = precision_score(y, pred_clf)
    
    return round(threshold_precision,5)

In [21]:
# def full_and_threshold_scoring(y, y_pred, percentile, model_type="regression"):
#     threshold = np.percentile(y_pred, percentile)
#     results = {}
#     threshold = 1
#     if model_type=="classification":
#         threshold = 0.5
    
#     y_clf = pred_to_clf_pred(y, threshold=threshold)
#     y_pred_clf = pred_to_clf_pred(y_pred, threshold=threshold)
#     if model_type=="classification":
#         results['default_precision'] = round(precision_score(y_clf, y_pred_clf),5)
#         #results['threshold_precision'] = threshold_precision(y, y_pred, percentile)    
#     if model_type=="regression":
#         results['default_precision'] = round(precision_score(y_clf, y_pred_clf),5)
#         results['threshold_precision'] = threshold_precision(y_clf, y_pred, percentile)
#         results['default_return'] = round(np.mean(y[y_pred>1])-1,5)
#         results['threshold_return'] = round(np.mean(y[y_pred>threshold])-1,5)
    
#     return results

In [22]:
def full_and_threshold_scoring(y, y_pred, percentile):
    threshold = np.percentile(y_pred, percentile)
    results = {}
    y_clf = pred_to_clf_pred(y, threshold=1)
    y_pred_clf = pred_to_clf_pred(y_pred, threshold=1)
    results['default_precision'] = round(precision_score(y_clf, y_pred_clf),5)
    results['default_return'] = round(np.mean(y[y_pred>1])-1,5)
    results['threshold_precision'] = threshold_precision(y_clf, y_pred, percentile)
    results['threshold_return'] = round(np.mean(y[y_pred>threshold])-1,5)
    
    return results

We will calculate model peformance on train and test using the metrics specified.

In [23]:
print_benchmarks()

Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214


#### Training Results
Here we can see that in our training data we achieved an improvement over the benchmark for precision and return where we set the precision threshold to predict any stock with a prediction of the price going up to go up. When we made our threshold more restrictive, so that we only predicted the stock would go up for our 5% highest predictions, on our train data the precision and return vastly outperformed the benchmark.
#### Test Results
On the test data we did not beat the benchmark when we classified all predictions that a stock would go up as the stock would go up. When we only indicated the stock would go up for our highest 5% of predictions, our precision is even worse although the average return is slightly better than at the default threshold. Now we will need to use some supervised learning techniques to select features and try with different models to improve on this precision score and aim get it above the benchmark. The issue is likely to be partly overfitting and partly model drift as we saw from the benchmark precision the market conditions were very different in the test period to the training period.

### Engineering Features
Given the large number of features already generated and retrieved, it is not clear how much feature engineering will add. However, we will at least generate features that demonstrate when volume is above average as in the financial domain, price moves are widely thought to be more meaningful and likely to continue when accompanied by heavy volume.

In [24]:
def feature_engineering(df):
    df['Volume_over_Volume_MA50'] = df['Volume'] / df['Volume_MA50']
    df['Volume_over_Volume_MA200'] = df['Volume'] / df['Volume_MA200']
    df['Volume_MA50_over_Volume_MA200'] = df['Volume_MA50'] / df['Volume_MA200']
    
    return df

In [25]:
X_train_numeric = feature_engineering(X_train_numeric)
X_test_numeric = feature_engineering(X_test_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Optional Scaling and Normalization

In [26]:
%%time
X_train_numeric_scaled, X_test_numeric_scaled = scale_train_test(X_train_numeric, X_test_numeric)

Wall time: 6.01 s


In [27]:
%%time
X_train_pca, X_test_pca = pca_train_test(X_train_numeric_scaled, X_test_numeric_scaled, num_components=200, random_state=2021)

Total Explained 0.9994445216152267
Wall time: 21.8 s


In [28]:
%%time
X_train_numeric_plus_pca = add_pca_cols(X_train_numeric, X_train_pca)
X_test_numeric_plus_pca = add_pca_cols(X_test_numeric, X_test_pca)

Wall time: 1.17 s


### Feature Selection
We saw in the earlier notebook that modeling with just a few features achieved better results than using all 1000+ features. Here we will start with a tiny number of features and iteratively try adding (& later dropping) features to improve model performance.

In [29]:
cols = X_train_numeric_plus_pca.columns.to_list()[::-1]

In [30]:
selected_cols = ['SR', 'ROCE_TTM_pct', 'Price_to_FCF_TTM', 'MACDSIGNAL_55', 'trend_ema_slow', 'others_dlr']
#selected_cols = ['SR', 'ROCE_TTM_pct', 'Price_to_FCF_TTM', 'MACDSIGNAL_55', 'trend_ema_slow', 'others_dlr', 'pca_199', 'pca_110', 'NATR_90', 'MACDEXT_macd_f34_s89_sig55', 'MACDEXT_macd_f21_s89_sig55', 'MACDEXT_macdsignal_f8_s89_sig55', 'MACDEXT_macdhist_f8_s89_sig5', 'MACDEXT_macd_f8_s89_sig5', 'MACDEXT_macdsignal_f8_s55_sig34', 'MACDEXT_macdhist_f3_s55_sig21', 'PLUS_DI_25', 'MACDEXT_macdsignal_f21_s89_sig8', 'MACDEXT_macdhist_f2_s21_sig8', 'ADOSC_f21_s89']

In [31]:
#threshold_return_train = 0.07564
#threshold_return_test = 0.06424
default_precision_test = 0
default_precision_train = 0

In [32]:
# model = LogisticRegression()
# y_train_clf = pred_to_clf_pred(y_train, threshold=1)
# y_test_clf = pred_to_clf_pred(y_test, threshold=1)
# model.fit(X_train_numeric_plus_pca[current_selection], y_train_clf)
# # make the predictions
# y_pred_fundamental_train = model.predict_proba(X_train_numeric_plus_pca[current_selection])[:,1]
# y_pred_fundamental_test = model.predict_proba(X_test_numeric_plus_pca[current_selection])[:,1]
# # convert to classification
# y_pred_clf_fundamental_train = pred_to_clf_pred(y_pred_fundamental_train, threshold=0.5)
# y_pred_clf_fundamental_test = pred_to_clf_pred(y_pred_fundamental_test, threshold=0.5)
# print(y_pred_fundamental_train.shape)
# print(y_train.shape)
# train_scores = full_and_threshold_scoring(y_train,y_pred_fundamental_train,95,model_type="classification")
# test_scores = full_and_threshold_scoring(y_test,y_pred_fundamental_test,95,model_type="classification")


In [33]:
#test_scores

In [34]:
%%time

loop_best = 0
if Run_Feature_Selection:
    for a in range(20): # the second loop gives a chance to drop features that were useful originally but are not useful with the latest best feature set   
        for c in tqdm(cols):
            current_selection = copy.deepcopy(selected_cols)
            if c in current_selection and len(current_selection) > 1:
                current_selection = [x for x in current_selection if x!=c]#.remove(c)
            else:
                current_selection.append(c)

            train_scores, test_scores = run_model(current_selection, X_train_numeric_plus_pca, X_test_numeric_plus_pca, y_train, y_test)

            if test_scores['default_precision'] > default_precision_test and train_scores['default_precision'] > default_precision_train:
                #print("improvement found")
                #print(current_selection)
                default_precision_test = test_scores['default_precision']
                default_precision_train = train_scores['default_precision']
                selected_cols = copy.deepcopy(current_selection)

        print("train precision at default", default_precision_train, "test precision at default", default_precision_test)
        print("##################################################################")
        print(selected_cols)

        if loop_best==default_precision_test:
            print("early stopping no improvement")
            break
        loop_best=default_precision_test

    

Wall time: 0 ns


In [35]:
train_scores, test_scores = run_model(selected_cols, X_train_numeric_plus_pca, X_test_numeric_plus_pca, y_train, y_test)

In [36]:
print_benchmarks()
print("train scores:", train_scores)
print("test scores:", test_scores)

Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214
train scores: {'default_precision': 0.53753, 'default_return': 0.00564, 'threshold_precision': 0.60601, 'threshold_return': 0.0283}
test scores: {'default_precision': 0.58484, 'default_return': 0.02402, 'threshold_precision': 0.6181, 'threshold_return': 0.03136}


### Next Steps
Feature Selection and Feature Engineering