<h2> Import Libraries </h2>

In [None]:
!pip install lime
!pip install shap
!pip install eli5

In [113]:
#Data handling
import pandas as pd
import numpy as np
import scipy as sp
import gc
import pickle
#preprocessing and feature selection
import sklearn.preprocessing
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2
from sklearn.model_selection import train_test_split, RandomizedSearchCV
#models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
#evaluation and interpretability
import lime
import shap
import eli5
import sklearn.metrics

<h2> Pre-process Data for Model Training </h2>

<b> STEP 1: Encode Datasets </b>

In [114]:
# Read the full dataset from its pickle file.
alldata = pd.read_pickle('../data/sp500finaldata.pkl')
# Show the object-typed columns. Of these, only 'Sector' and 'DistanceFromLast'
# are label encoded in the next step.
column_types = alldata.dtypes
column_types[column_types == 'object']

firm                object
Sector              object
DistanceFromLast    object
dtype: object

In [115]:
# Label-encode the two categorical columns. For each column: fit an encoder,
# persist the fitted encoder to disk, print the integer codes, and overwrite the
# column with those codes.
# Note: the columns are overwritten in place, so re-running this cell would fit
# the encoders on the integer codes rather than on the original labels.
labenc = LabelEncoder()    # encoder for 'Sector'
labenc1 = LabelEncoder()   # encoder for 'DistanceFromLast'
for encoder, column, path in (
    (labenc, 'Sector', 'labencsector.pkl'),
    (labenc1, 'DistanceFromLast', 'labencdist.pkl'),
):
    encoder.fit(alldata[column])
    with open(path, 'wb') as q:
        pickle.dump(encoder, q)
    codes = encoder.transform(alldata[column])
    print(codes)
    alldata[column] = codes


[4 4 4 ... 4 4 4]
[6 0 0 ... 0 0 0]


In [116]:
# One-hot encode the two label-encoded categorical columns, persist the fitted
# encoder, and swap the original columns for their indicator columns.
# 'firm' is dropped outright, so only 'Sector' and 'DistanceFromLast' are encoded.
onehot = OneHotEncoder(sparse=False)
onehot.fit(alldata[['Sector', 'DistanceFromLast']])
with open('onehotenc.pkl', 'wb') as q:
    pickle.dump(onehot, q)

# Transform once and reuse the result for both the preview and the new frame.
enc = onehot.transform(alldata[['Sector', 'DistanceFromLast']])
print(enc)

# Name the indicator columns after the original labels held by the label encoders.
namessector = ['Sector_'+str(i) for i in labenc.classes_]
namesdist = ['DistLast_'+str(i) for i in labenc1.classes_]
names = np.append(namessector, namesdist)

# Build the encoded frame on alldata's own index. pd.concat(axis=1) aligns on
# index labels, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever alldata's index is not the default range.
enc = pd.DataFrame(enc, columns=names, index=alldata.index)
alldata.drop(['firm', 'Sector', 'DistanceFromLast'], axis=1, inplace=True)
alldata = pd.concat([alldata, enc], axis=1)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


<b> STEP 2 - Separate Two Models </b>

We divide our project into two distinct models - one where we attempt to predict 'Div_Paid?' (hence a classifier model predicting whether or not dividends will be paid), and another where we attempt to predict how much dividend, if paid (therefore a regressor model). Let us designate the different datasets for both.

In [117]:
# Classifier frame: all rows, without the dividend-amount related columns.
classifdata = alldata.drop(['Dividend', 'Dividend Yield', 'Payout Ratio'], axis=1)

# Regressor frame: only the rows where 'Div_Paid?' equals 1, without the
# classification target and the two ratio columns.
paid_mask = alldata['Div_Paid?'] == 1
regrdata = alldata.loc[paid_mask].drop(['Div_Paid?', 'Dividend Yield', 'Payout Ratio'], axis=1)

# Persist both frames for later use.
classifdata.to_pickle('../data/classifdatafull.pkl')
regrdata.to_pickle('../data/regrdatafull.pkl')

<h2> Classifier Model - Dividend Outlook </h2>

 First we work towards the classifier model, and begin with feature selection. Subsequently, we try three different fits - RandomForestClassifier, GaussianProcessClassifier and MLPClassifier.

<b> STEP 1 - Feature Elimination </b>

We perform basic feature elimination here, and leave actual selection to fit based on model. The only selection performed here is chi2 testing (to check relationship to the target variable) and VarianceThreshold (differences within the feature).

In [118]:
# Variance-based elimination over every column of classifdata.
# All columns are first min-max scaled to [0, 1] so that one generic cut-off
# (0.001) can be applied to each of them; the cut-off is deliberately lax so that
# only columns that barely vary are removed.
scaler = MinMaxScaler(feature_range=(0,1), copy=True)
scld = pd.DataFrame(scaler.fit_transform(classifdata), columns=classifdata.columns.values)
slt = VarianceThreshold(0.001)
slt.fit(scld)
# Columns rejected by the selector are the low-variance ones; the rest are kept.
lowvar = classifdata.columns.values[~slt.get_support(indices=False)]
goodvar = list(set(classifdata.columns.values) - set(lowvar))
# Report "<number removed>, <number kept>".
print("{}, {}".format(len(lowvar), len(goodvar)))
classifdata.drop(lowvar, axis=1, inplace=True)

9, 122


In [119]:
# Chi-square screening of the remaining features against the response 'Div_Paid?'.
# 'Other income (expense)' contains negative values and chi2 requires
# non-negative input, so it is left out of the test (and is therefore never
# dropped by this step).
# Features whose chi-square statistic is below 4 (approximate 1-df cut-off) are
# treated as unrelated to the response and removed.
chi2_inputs = classifdata.drop(['Div_Paid?','Other income (expense)'], axis=1)
chi2arr, pval = chi2(chi2_inputs, classifdata['Div_Paid?'])
lowrel = chi2_inputs.columns.values[chi2arr < 4]
classifdata.drop(lowrel, axis=1, inplace=True)

<b> STEP 2 - Separate Training and Test Data </b>

In [120]:
# Split predictors and response into training and test sets.
# random_state is fixed so the split is reproducible.
predictors = classifdata.drop(['Div_Paid?'], axis=1)
response = classifdata['Div_Paid?']
Train_x, Test_x, Train_y, Test_y = train_test_split(predictors, response, random_state=1)

<b> STEP 3 - RandomForestClassifier Implementation </b>

In [122]:
# Hyperparameter tuning for a random forest classifier via randomized search:
# 10 sampled candidates from the grid below, each scored with 3-fold CV.
params = {
    'n_estimators': [300, 500, 700],
    'min_samples_split': [2, 4, 5],
    'min_samples_leaf': [1, 2, 3],
    'max_depth': [10, None],
    'max_features': ['auto', 'log2'],
}
mdl = RandomForestClassifier(n_jobs=-1, random_state=1)
randforcf = RandomizedSearchCV(
    estimator=mdl,
    param_distributions=params,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=1,
)
randforcf.fit(Train_x, Train_y)
# Ends the cell with an empty print so the fitted search object is not echoed.
print()




In [123]:
# Rebuild a standalone random forest from the best hyperparameters found by the
# search and fit it on the training data. best_params_ holds exactly the keys of
# the search grid, so it can be unpacked straight into the constructor.
paramdict = randforcf.best_params_
randforcf = RandomForestClassifier(n_jobs=-1, random_state=1, **paramdict)
randforcf.fit(Train_x, Train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [129]:
# Importance-based pruning, again with a 0.001 cut-off: features whose
# feature_importances_ value in the fitted forest is below the cut-off are removed
# from both the training and test predictors, and the forest is refit on what remains.
importances = pd.Series(randforcf.feature_importances_, index=Train_x.columns)
elim = importances.loc[importances < 0.001].index
for frame in (Train_x, Test_x):
    frame.drop(elim, axis=1, inplace=True)
randforcf = RandomForestClassifier(n_jobs=-1, random_state=1, **paramdict)
randforcf.fit(Train_x, Train_y)
# Ends the cell with an empty print so the fitted estimator is not echoed.
print()




In [130]:
# Score the refit forest on the held-out test set: predict, then report accuracy.
ypred = randforcf.predict(Test_x)
sklearn.metrics.accuracy_score(y_true=Test_y, y_pred=ypred)

0.9789674952198852

In [None]:
# Placeholder cell: the bare name only evaluates to the imported `shap` module.
# TODO(review): presumably the SHAP interpretability analysis was meant to go here - confirm or remove.
shap