In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet, Ridge, Lasso, LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, precision_score
from tqdm import tqdm
import copy

from utils_milestone2 import get_numeric_non_infinite_cols, add_pca_cols, scale_train_test, pca_train_test, run_model, \
feature_engineering, threshold_precision, full_and_threshold_scoring, pred_to_clf_pred, load_data


In [2]:
Run_Feature_Selection = True
model_type="classification"

### Load the Data

In [3]:
%%time
X_train, y_train, X_test, y_test, X_val = load_data()

Wall time: 2.32 s


In [4]:
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)
print("X_test shape", X_test.shape)
print("y_test shape", y_test.shape)

X_train shape (221122, 1105)
y_train shape (221122,)
X_test shape (32257, 1105)
y_test shape (32257,)


### Data Preprocessing
Check there are no nulls

In [5]:
is_NaN = X_train.isnull()
col_has_NaN = is_NaN.any(axis=0)
col_has_NaN = col_has_NaN.loc[col_has_NaN==True].index.to_list()
col_has_NaN

[]

Identify numeric columns

In [6]:
X_train_numeric = X_train._get_numeric_data()

Drop columns that contain infinity or negative infinity

In [7]:
col_has_inf = X_train_numeric.columns.to_series()[np.isinf(X_train_numeric).any()].to_list()
col_has_inf

['volume_adi', 'trend_vortex_ind_pos', 'trend_vortex_ind_neg']

In [8]:
X_train_numeric = X_train_numeric.drop(col_has_inf, axis=1)

In [9]:
cols = X_train_numeric.columns.to_list()
X_test_numeric = X_test[cols]

Classify train examples as 1 if the price went up otherwise 0.

In [10]:
y_train_clf = pred_to_clf_pred(y_train, threshold=1)
y_test_clf = pred_to_clf_pred(y_test, threshold=1)

Create a naive benchmark where everything is predicted to go up.

In [11]:
y_all_up_train = [1] * len(y_train_clf)
y_all_up_test = [1] * len(y_test_clf)

### Calculate Benchmark Scores

In [12]:
benchmark_precision_train_score = precision_score(y_train_clf, y_all_up_train)
benchmark_precision_test_score = precision_score(y_test_clf, y_all_up_test)
benchmark_return_train = np.mean(y_train) - 1
benchmark_return_test = np.mean(y_test) - 1

In [13]:
def print_benchmarks():
    print("Benchmark figures over 20 day prediction horizon:")
    print("benchmark precision train:", round(benchmark_precision_train_score,5))
    print("benchmark precision test:", round(benchmark_precision_test_score,5))
    print("benchmark return train:", round(benchmark_return_train,5))
    print("benchmark return test:", round(benchmark_return_test,5))

In [14]:
print_benchmarks()

Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214


This is the benchmark precision if we predict everything goes up. Now we will test the precision of our model. We can see that market conditions in the test period were noticeably different to the training period as stocks were going up a lot more frequently over 20 day trading horizons.

### Engineering Features
Given the large number of features already generated and retrieved, it is not clear how much feature engineering will add. However, we will at least generate features that demonstrate when volume is above average as in the financial domain, price moves are widely thought to be more meaningful and likely to continue when accompanied by heavy volume.

In [15]:
X_train_numeric = feature_engineering(X_train_numeric)
X_test_numeric = feature_engineering(X_test_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Volume_over_Volume_MA50'] = df['Volume'] / df['Volume_MA50']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Volume_over_Volume_MA200'] = df['Volume'] / df['Volume_MA200']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Volume_MA50_over_Volume_MA200'] = df['Volume_MA50'] / df['Volume_MA20

### Optional Scaling and Normalization

In [16]:
%%time
X_train_numeric_scaled, X_test_numeric_scaled = scale_train_test(X_train_numeric, X_test_numeric)

Wall time: 6.16 s


In [17]:
%%time
X_train_pca, X_test_pca = pca_train_test(X_train_numeric_scaled, X_test_numeric_scaled, num_components=200, random_state=2021)

Total Explained 0.9992301541002819
Wall time: 19.3 s


In [18]:
%%time
X_train_numeric_plus_pca = add_pca_cols(X_train_numeric, X_train_pca)
X_test_numeric_plus_pca = add_pca_cols(X_test_numeric, X_test_pca)

Wall time: 1.37 s


### Feature Selection
We saw in the earlier notebook that modeling with just a few features achieved better results than using all 1000+ features. Here we will start with a tiny number of features and iteratively try adding (& later dropping) features to improve model performance.

In [19]:
cols = X_train_numeric_plus_pca.columns.to_list()#[::-1]

In [20]:
selected_cols = ['SR', 'ROCE_TTM_pct', 'Price_to_FCF_TTM', 'MACDSIGNAL_55', 'trend_ema_slow', 'others_dlr']
#selected_cols = ['SR', 'ROCE_TTM_pct', 'Price_to_FCF_TTM', 'MACDSIGNAL_55', 'trend_ema_slow', 'others_dlr', 'Open', 'Volume', 'Sales_Growth_TTM_pct', 'volume_vpt']

In [21]:
default_precision_test = 0
default_precision_train = 0
default_return_test = 0
default_return_train = 0

In [22]:
cols_to_search = copy.deepcopy(cols)
cols_to_search = ['SR', 'ROCE_TTM_pct', 'Price_to_FCF_TTM', 'MACDSIGNAL_55', 'trend_ema_slow', 'others_dlr'] + ['Sector_Basic Materials',
 'Sector_Consumer Cyclicals',
 'Sector_Consumer Defensives',
 'Sector_Energy',
 'Sector_Financials',
 'Sector_Healthcare',
 'Sector_Industrials',
 'Sector_Technology',
 'Sector_Telecoms',
 'Sector_Utilities']
cols_to_search

['SR',
 'ROCE_TTM_pct',
 'Price_to_FCF_TTM',
 'MACDSIGNAL_55',
 'trend_ema_slow',
 'others_dlr',
 'Sector_Basic Materials',
 'Sector_Consumer Cyclicals',
 'Sector_Consumer Defensives',
 'Sector_Energy',
 'Sector_Financials',
 'Sector_Healthcare',
 'Sector_Industrials',
 'Sector_Technology',
 'Sector_Telecoms',
 'Sector_Utilities']

In [23]:
%%time

loop_best = 0
if Run_Feature_Selection:
    for a in range(20): # later loops give a chance to drop features that were useful originally but are not useful with the latest best feature set   
        for c in tqdm(cols_to_search):
            current_selection = copy.deepcopy(selected_cols)
            if c in current_selection and len(current_selection) > 1:
                current_selection = [x for x in current_selection if x!=c]#.remove(c)
            else:
                current_selection.append(c)

            train_scores, test_scores = run_model(current_selection, X_train_numeric_plus_pca, X_test_numeric_plus_pca, y_train, y_test, model_type=model_type)

            if test_scores['default_return'] > default_return_test and train_scores['default_return'] > default_return_train:
                default_return_test = test_scores['default_return']
                default_return_train = train_scores['default_return']
                selected_cols = copy.deepcopy(current_selection)

        print("train return at default", default_return_train, "test return at default", default_return_test)
        print("##################################################################")
        print(selected_cols)

        if loop_best==default_return_test:
            print("early stopping no improvement")
            break
        loop_best=default_return_test

    

100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:47<00:00,  3.00s/it]
  0%|                                                                                           | 0/16 [00:00<?, ?it/s]

train return at default 0.00581 test return at default 0.02206
##################################################################
['ROCE_TTM_pct', 'Price_to_FCF_TTM', 'MACDSIGNAL_55', 'trend_ema_slow', 'others_dlr']


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:47<00:00,  2.95s/it]

train return at default 0.00581 test return at default 0.02206
##################################################################
['ROCE_TTM_pct', 'Price_to_FCF_TTM', 'MACDSIGNAL_55', 'trend_ema_slow', 'others_dlr']
early stopping no improvement
Wall time: 1min 35s





### Final Model Run
With our selected features we can run the final model.

In [24]:
train_scores, test_scores = run_model(selected_cols, X_train_numeric_plus_pca, X_test_numeric_plus_pca, y_train, y_test, model_type=model_type)

In [25]:
print_benchmarks()
print("train scores:", train_scores)
print("test scores:", test_scores)

Benchmark figures over 20 day prediction horizon:
benchmark precision train: 0.53814
benchmark precision test: 0.5835
benchmark return train: 0.005
benchmark return test: 0.02214
train scores: {'default_precision': 0.5424, 'default_return': 0.00581, 'threshold_precision': 0.66211, 'threshold_return': 0.0403}
test scores: {'default_precision': 0.58404, 'default_return': 0.02206, 'threshold_precision': 0.64764, 'threshold_return': 0.03728}


### Next Steps
Feature Selection and Feature Engineering