### Feature Importance and Selection:

- Objective: identify the most important features for the ML algorithm, then output a dataset restricted to the selected features
- Method: first use univariate tests to discard poor features (important when dealing with many features).
    Then use recursive feature elimination and feature importances to get a feel for the ranking among the remaining features


Dilemma: should we tune a model before feeding it to RFECV? Answer: first loosely tune the models, and then use regularized decision trees for a potentially more rigorous answer:
            https://arxiv.org/pdf/1201.1587.pdf

In [7]:
%load_ext autoreload

%autoreload 2

In [8]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import os

In [9]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
# Show floats with three decimals in every DataFrame/Series repr.
# The option holds a single formatter, so it is set exactly once.
pd.set_option('display.float_format', '{:.3f}'.format)

In [10]:
# List the contents of the pickle directory to see which saved frames can be loaded.
os.listdir('../data/pickle_files')

['df_pickle_w_all_stats', 'df_pickle', 'df_pickle_w_time_stats']

In [11]:
# Start from the time-stats pickle (df1 as augmented in iteration 2) and remove
# 'duration', which cannot be used in our baseline, together with 'Date'.
# NOTE: unpickling can execute arbitrary code -- only load trusted project files.
df = (
    pd.read_pickle('../data/pickle_files/df_pickle_w_time_stats')
    .drop(columns=['duration', 'Date'])
)

In [12]:
def scale_noncat_only(df, target='y'):
    """Standardise the non-categorical features and split off the target.

    Columns stored as ``int8`` are treated as categorical codes and passed
    through untouched; every other feature column is standardised with
    ``StandardScaler`` (fitted on the full frame passed in).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that contains the feature columns and ``target``.
    target : str, default 'y'
        Name of the label column; it is never scaled and never part of ``X``.

    Returns
    -------
    X : numpy.ndarray
        2-D array with the scaled non-``int8`` columns first, followed by the
        ``int8`` columns, each group in its original column order.
    y : numpy.ndarray
        1-D array holding the values of ``df[target]``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # Remove the label first so it is neither fed to the scaler nor left in X.
    features = df.drop(columns=[target])
    noncat = features.select_dtypes(exclude=['int8'])
    cat = features.select_dtypes(include=['int8'])

    scaled = preprocessing.StandardScaler().fit_transform(noncat)
    # Re-attach the original index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex here would misalign rows (and inject NaNs) whenever
    # ``df`` carries a non-default index.
    scaled = pd.DataFrame(scaled, columns=noncat.columns, index=noncat.index)

    X = pd.concat([scaled, cat], axis=1)
    # will work with numpy arrays
    return np.array(X), np.array(df[target])

# Scaled feature matrix and target as NumPy arrays.
# NOTE(review): a later cell rebinds X and y straight from df (y = df['y'],
# X = df.drop(columns=['y'])), so these scaled arrays are not what the
# selectors further down are fitted on -- confirm which version is intended.
X, y = scale_noncat_only(df)

In [13]:
# Sanity check on dtypes after loading: every column should be float64, int64
# or int8, so selecting everything else should return an empty Index.
df.select_dtypes(exclude = ['float64', 'int64', 'int8']).columns

Index([], dtype='object')

In [14]:
# Keep the features as a DataFrame (its column labels are needed for the
# rankings below) and the target as a Series.
# NOTE(review): this rebinds X and y, discarding the arrays returned by the
# scale_noncat_only(df) call above, so the selectors below see unscaled values.
X, y = df.drop(columns=['y']), df['y']

In [15]:
# (rows, columns) of the feature frame.
X.shape

(41188, 36)

In [16]:
# Column labels, in frame order, used to tag the per-feature scores below.
names = X.columns.tolist()

Univariate Feature Selection

In [17]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [18]:
# Three-decimal precision for NumPy array printing from here on.
np.set_printoptions(precision=3)

# Score every feature against the target with the F-test scoring function.
# k='all' keeps every column, so this step only ranks features.
test = SelectKBest(k='all', score_func=f_classif)
fit = test.fit(X, y)

In [19]:
# (feature, F-score) pairs, highest score first.
sorted(zip(names, fit.scores_), key=lambda pair: pair[1], reverse=True)

[('nr.employed', 5926.610601111052),
 ('year', 5668.675543704237),
 ('emplmnt_industry_index', 5488.271703529525),
 ('pdays', 4861.173272049289),
 ('euribor3m', 4309.479048109171),
 ('emp.var.rate', 4023.829925487281),
 ('financial_past', 3102.0897885964437),
 ('previous', 2304.2570879417895),
 ('economy_past', 1310.1144145473545),
 ('inflation_delta_expect', 1238.188732938843),
 ('savings_delta_expect', 1209.943101759026),
 ('contact', 881.7070703057972),
 ('ind_turnover', 814.6379928053427),
 ('cons.price.idx', 778.5897948540367),
 ('poutcome', 705.678164442302),
 ('major_purch_expect', 687.1475353974454),
 ('economy_expect', 548.9286313222702),
 ('default', 410.59078467356807),
 ('unempl_expect', 301.84052115190866),
 ('railway_passengers', 183.03689361945257),
 ('campaign', 182.1566728779538),
 ('education', 138.0517555082783),
 ('cons.conf.idx', 124.40997473290119),
 ('cons_conf', 104.12406685116402),
 ('marital', 88.10704673107693),
 ('major_purch_opportu', 84.80120899724722),
 (

In [20]:
# Same ranking as above, scored with mutual information instead of the F-test.
# mutual_info_classif is randomised (it adds a small amount of noise to
# continuous features), so the seed is pinned to make the scores -- and hence
# the ranking -- identical on every run.
# NOTE(review): the int8 columns are treated as categorical elsewhere in this
# notebook; consider passing discrete_features for them -- TODO confirm.
test = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=0
    ),
    k='all',
)
fit = test.fit(X, y)
# Three-decimal precision for NumPy array printing.
np.set_printoptions(precision=3)

In [21]:
# (feature, mutual-information score) pairs, highest score first.
sorted(zip(names, fit.scores_), key=lambda pair: pair[1], reverse=True)

[('euribor3m', 0.075776270335983),
 ('financial_past', 0.07054494101370135),
 ('railway_passengers', 0.0704520297946305),
 ('cons.conf.idx', 0.06974844897392263),
 ('inflation_delta_expect', 0.0694976476041993),
 ('major_purch_expect', 0.06934043190878869),
 ('cons.price.idx', 0.0683562297522482),
 ('emplmnt_industry_index', 0.0678314387553669),
 ('ind_turnover', 0.06664044841988881),
 ('economy_past', 0.06563099355906554),
 ('unempl_expect', 0.06526844604400539),
 ('nr.employed', 0.06426763009528136),
 ('cons_conf', 0.0637162438309613),
 ('economy_expect', 0.06260972910029805),
 ('savings_delta_expect', 0.061037918433022975),
 ('household_debt_ratio', 0.060359977322939073),
 ('savings_expect', 0.059824428490030224),
 ('major_purch_opportu', 0.05770193061646367),
 ('emp.var.rate', 0.05709386154714746),
 ('year', 0.05373779075436391),
 ('poutcome', 0.03605166291194606),
 ('pdays', 0.035716881688217406),
 ('month', 0.028014115720239152),
 ('month_last_contact', 0.027378082493479727),
 ('

Recursive Feature Elimination

In [22]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [23]:
# Recursive feature elimination driven by a logistic-regression estimator,
# keeping the 20 best features.
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn, where the positional form RFE(model, 20) raises TypeError.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=20)
fit = rfe.fit(X, y)

In [24]:
# Report how many features RFE kept; fit.support_ and fit.ranking_ hold the
# selection mask and the per-feature ranks if more detail is needed.
print("Num Features Selected: %d" % fit.n_features_)

Num Features Selected: 20


In [25]:
# (feature, RFE rank) pairs in ascending rank order; rank 1 marks the selected features.
sorted(zip(names, fit.ranking_), key=lambda pair: pair[1])

[('cons_conf', 1),
 ('major_purch_opportu', 1),
 ('unempl_expect', 1),
 ('economy_expect', 1),
 ('economy_past', 1),
 ('financial_past', 1),
 ('savings_delta_expect', 1),
 ('savings_expect', 1),
 ('month', 1),
 ('campaign', 1),
 ('previous', 1),
 ('emp.var.rate', 1),
 ('cons.conf.idx', 1),
 ('euribor3m', 1),
 ('marital', 1),
 ('default', 1),
 ('contact', 1),
 ('month_last_contact', 1),
 ('day_of_week_last_contact', 1),
 ('poutcome', 1),
 ('emplmnt_industry_index', 2),
 ('ind_turnover', 3),
 ('housing', 4),
 ('education', 5),
 ('loan', 6),
 ('cons.price.idx', 7),
 ('year', 8),
 ('nr.employed', 9),
 ('inflation_delta_expect', 10),
 ('major_purch_expect', 11),
 ('household_debt_ratio', 12),
 ('day', 13),
 ('job', 14),
 ('age', 15),
 ('pdays', 16),
 ('railway_passengers', 17)]

Feature Importance with Decision Trees

In [26]:
from sklearn.ensemble import ExtraTreesClassifier

  from numpy.core.umath_tests import inner1d


In [27]:
# Feature importances from an extra-trees ensemble.
# The ensemble is randomised, so the seed is pinned to make the importances
# (and their ordering) the same on every run.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, y)
print(model.feature_importances_)

[0.002 0.008 0.007 0.001 0.009 0.011 0.004 0.015 0.013 0.007 0.002 0.003
 0.028 0.002 0.005 0.015 0.053 0.17  0.106 0.043 0.017 0.006 0.008 0.004
 0.064 0.019 0.091 0.048 0.083 0.014 0.036 0.027 0.014 0.003 0.047 0.017]


In [28]:
# (feature, importance) pairs, most important first.
sorted(zip(names, model.feature_importances_), key=lambda pair: pair[1], reverse=True)

[('age', 0.17030409549145606),
 ('campaign', 0.10578892586580306),
 ('job', 0.09050257176049559),
 ('education', 0.08328948611172694),
 ('euribor3m', 0.06365186839819546),
 ('day', 0.05331782459632758),
 ('marital', 0.04797313483726211),
 ('day_of_week_last_contact', 0.04715881471422661),
 ('pdays', 0.04336104600067493),
 ('housing', 0.03551229721766414),
 ('emplmnt_industry_index', 0.027719701114551177),
 ('loan', 0.02657492697547436),
 ('nr.employed', 0.019414622899268773),
 ('previous', 0.016892898686821478),
 ('poutcome', 0.016533156484106814),
 ('economy_past', 0.014992344644178365),
 ('year', 0.014744733610246988),
 ('contact', 0.014294404405441361),
 ('default', 0.013774531024267086),
 ('financial_past', 0.013368055728243556),
 ('inflation_delta_expect', 0.010691645383209183),
 ('unempl_expect', 0.008725088381314328),
 ('cons.price.idx', 0.00833571633914716),
 ('ind_turnover', 0.007758732641767558),
 ('savings_delta_expect', 0.006770542086186303),
 ('major_purch_expect', 0.00653