### Feature Engineering and Selection:

- Objective: engineer new features and analyze their importance
- Method: first use Univariate tests to discard poor features (important when dealing with many features)
    Then, use recursive feature elimination and feature importances to get a feeling for ranking among features left


Dilemma: should we tune a model before feeding it to RFECV? Answer: first loosely tune models and then use regularized decision trees for a potentially more rigorouse answer:
            https://arxiv.org/pdf/1201.1587.pdf

In [55]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import os

In [57]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [58]:
os.listdir('../data/pickle_files')

['df_pickle_w_all_stats', 'df_pickle', 'df_pickle_w_time_stats']

In [59]:
#Loading df1 after it has been augmented in iteration 2:
df = pd.read_pickle('../data/pickle_files/df_pickle_w_all_stats')
#Excluding the duration variable as it cannot be used in our baseline
df = df.drop(columns = ['duration', 'Date'])

In [60]:
y = df['y']
X = df.drop(columns='y')
names = list(X.columns)

Feature Importance:

Starting with an exploration of the most important features with univariate methods and RFE (with CV)

Univariate Feature Selection

In [61]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [62]:
#Using F Classifier scoring function
test = SelectKBest(score_func=f_classif, k='all')
fit = test.fit(X, y)
# summarize scores
np.set_printoptions(precision=3)

In [63]:
sorted(list(zip(names,list(fit.scores_))), key=lambda tup:tup[1], reverse=True)

[('Household credit', 6423.957630542542),
 ('Mortgage credit', 6259.710075255824),
 ('Unemployment rate', 6215.328224329231),
 ('Foreign exchange reserves', 6127.619935911332),
 ('nr.employed', 5926.610601111052),
 ('Government debt', 5893.926455086986),
 ('Employment', 5830.594300707896),
 ('Minimum wage', 5709.937349795137),
 ('year', 5668.675543704237),
 ('emplmnt_industry_index', 5488.271703529525),
 ('Private sector credit', 5171.645855551422),
 ('pdays', 4861.173272049289),
 ('Labor cost', 4670.487138412593),
 ('Business credit interest rate', 4556.632400328948),
 ('Deposit interest rate', 4432.398191653248),
 ('Consumer credit', 4375.106973760946),
 ('Investment as percent of GDP', 4361.241416793941),
 ('euribor3m', 4309.479048109171),
 ('Mortgage credit interest rate', 4115.273907037838),
 ('emp.var.rate', 4023.829925487281),
 ('Investment', 3577.29704945438),
 ('Business credit', 3565.6933778955704),
 ('Government expenditure', 3306.875885293535),
 ('financial_past', 3102.0897

In [64]:
#Same as above with Mutual Info classification scoring function
test = SelectKBest(score_func=mutual_info_classif, k='all')
fit = test.fit(X, y)
# summarize scores
np.set_printoptions(precision=3)

In [65]:
sorted(list(zip(names,list(fit.scores_))), key=lambda tup:tup[1], reverse=True)

[('euribor3m', 0.07248659679375846),
 ('cons.price.idx', 0.07129766500371382),
 ('Exports', 0.07073904829071531),
 ('Business credit interest rate', 0.07044783283077805),
 ('Mortgage credit', 0.07005046660561431),
 ('Money supply (broad money)', 0.06935715237457796),
 ('financial_past', 0.06913538536998187),
 ('FDI', 0.06907773930537742),
 ('Private sector credit', 0.06893630288424268),
 ('emplmnt_industry_index', 0.06890832893757581),
 ('Consumer credit', 0.06889218556795229),
 ('Business confidence survey', 0.0688443994365393),
 ('Current account balance', 0.0688366767655606),
 ('railway_passengers', 0.06878728889342356),
 ('ind_turnover', 0.06865098632655009),
 ('Consumer Price Index (CPI)', 0.06857829334127574),
 ('Deposit interest rate', 0.06845042987276595),
 ('Mortgage credit interest rate', 0.06798420418315909),
 ('major_purch_expect', 0.06761135865834023),
 ('Exchange rate to USD', 0.06754751333413278),
 ('Business credit', 0.06737619007574969),
 ('inflation_delta_expect', 0.0

Recursive Feature Elimination

In [66]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

In [67]:
model = LogisticRegression(C=1, solver = 'liblinear')
rfe = RFE(model, 20)
fit = rfe.fit(X, y)

In [68]:
print("Num Features Selected: %d" % (fit.n_features_))
#print("Selected Features: %s" % (fit.support_))
#print("Feature Ranking: %s" % (fit.ranking_))

Num Features Selected: 20


In [69]:
sorted(list(zip(names, fit.ranking_)), key=lambda tup:tup[1], reverse=False)

[('economy_expect', 1),
 ('economy_past', 1),
 ('savings_expect', 1),
 ('month', 1),
 ('previous', 1),
 ('emp.var.rate', 1),
 ('default', 1),
 ('contact', 1),
 ('poutcome', 1),
 ('Consumer credit', 1),
 ('Unemployment rate', 1),
 ('Labor cost', 1),
 ('Private sector credit', 1),
 ('Economic growth', 1),
 ('Consumption growth', 1),
 ('Inflation monthly', 1),
 ('Inflation annual', 1),
 ('Business confidence survey', 1),
 ('Consumer confidence survey', 1),
 ('Economic growth Q-on-Q', 1),
 ('month_last_contact', 2),
 ('cons.conf.idx', 3),
 ('Retail sales', 4),
 ('Consumer Price Index (CPI)', 5),
 ('ind_turnover', 6),
 ('major_purch_opportu', 7),
 ('savings_delta_expect', 8),
 ('inflation_delta_expect', 9),
 ('day_of_week_last_contact', 10),
 ('marital', 11),
 ('Money supply (broad money)', 12),
 ('campaign', 13),
 ('unempl_expect', 14),
 ('Investment growth', 15),
 ('Consumption as percent of GDP', 16),
 ('Business credit interest rate', 17),
 ('housing', 18),
 ('education', 19),
 ('major_

In [70]:
rfecv = RFECV(model, step=1, cv=5)
fit = rfecv.fit(X, y)

In [71]:
print("Num Features Selected: %d" % (fit.n_features_))

Num Features Selected: 1


In [72]:
sorted(list(zip(names, fit.ranking_)), key=lambda tup:tup[1], reverse=False)

[('poutcome', 1),
 ('emp.var.rate', 2),
 ('Inflation annual', 3),
 ('contact', 4),
 ('default', 5),
 ('previous', 6),
 ('Unemployment rate', 7),
 ('month', 8),
 ('Economic growth Q-on-Q', 9),
 ('Consumer credit', 10),
 ('Private sector credit', 11),
 ('Labor cost', 12),
 ('Inflation monthly', 13),
 ('Economic growth', 14),
 ('savings_expect', 15),
 ('Business confidence survey', 16),
 ('Consumption growth', 17),
 ('Consumer confidence survey', 18),
 ('economy_expect', 19),
 ('economy_past', 20),
 ('month_last_contact', 21),
 ('cons.conf.idx', 22),
 ('Retail sales', 23),
 ('Consumer Price Index (CPI)', 24),
 ('ind_turnover', 25),
 ('major_purch_opportu', 26),
 ('savings_delta_expect', 27),
 ('inflation_delta_expect', 28),
 ('day_of_week_last_contact', 29),
 ('marital', 30),
 ('Money supply (broad money)', 31),
 ('campaign', 32),
 ('unempl_expect', 33),
 ('Investment growth', 34),
 ('Consumption as percent of GDP', 35),
 ('Business credit interest rate', 36),
 ('housing', 37),
 ('educati

In [73]:
#Trying the same as above, but this time after scaling non categorical features

In [74]:
def scale_noncat_only(df):
    X_temp_noncat = df.select_dtypes(exclude=['int8'])
    X_temp_cat = df.select_dtypes(include=['int8'])
    scaler = preprocessing.StandardScaler().fit(X_temp_noncat)
    X_transformed = scaler.transform(X_temp_noncat)
    X = pd.concat([pd.DataFrame(X_transformed, columns =X_temp_noncat.columns), X_temp_cat], axis = 1)
    X = X.drop(columns=['y'])
    y = df['y']
    #will work with numpy arrays
    y = np.array(y)
    X = np.array(X)
    
    return X, y

X, y = scale_noncat_only(df)

  return self.partial_fit(X, y)
  """


In [75]:
#Checking dtypes have loaded correctly (should return empty index)
df.select_dtypes(exclude = ['float64', 'int64', 'int8']).columns

Index([], dtype='object')

In [76]:
y = df['y']
X = df.drop(columns=['y'])

In [77]:
X.shape

(41188, 72)

In [78]:
rfecv = RFECV(model, step=1, cv=5)
fit = rfecv.fit(X, y)

In [79]:
print("Num Features Selected: %d" % (fit.n_features_))

Num Features Selected: 1


In [80]:
sorted(list(zip(names, fit.ranking_)), key=lambda tup:tup[1], reverse=False)

[('poutcome', 1),
 ('emp.var.rate', 2),
 ('Inflation annual', 3),
 ('contact', 4),
 ('default', 5),
 ('previous', 6),
 ('Unemployment rate', 7),
 ('month', 8),
 ('Economic growth Q-on-Q', 9),
 ('Consumer credit', 10),
 ('Private sector credit', 11),
 ('Labor cost', 12),
 ('Inflation monthly', 13),
 ('Economic growth', 14),
 ('savings_expect', 15),
 ('Business confidence survey', 16),
 ('Consumption growth', 17),
 ('Consumer confidence survey', 18),
 ('economy_expect', 19),
 ('economy_past', 20),
 ('month_last_contact', 21),
 ('cons.conf.idx', 22),
 ('Retail sales', 23),
 ('Consumer Price Index (CPI)', 24),
 ('ind_turnover', 25),
 ('major_purch_opportu', 26),
 ('savings_delta_expect', 27),
 ('inflation_delta_expect', 28),
 ('day_of_week_last_contact', 29),
 ('marital', 30),
 ('Money supply (broad money)', 31),
 ('campaign', 32),
 ('unempl_expect', 33),
 ('Investment growth', 34),
 ('Consumption as percent of GDP', 35),
 ('Business credit interest rate', 36),
 ('housing', 37),
 ('educati

Feature Importance with Decision Trees

In [81]:
from sklearn.ensemble import ExtraTreesClassifier

In [82]:
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)



[1.612e-04 1.098e-03 2.262e-03 2.874e-04 1.406e-03 5.125e-03 2.674e-04
 4.942e-03 9.389e-04 2.342e-03 2.409e-03 1.401e-03 1.135e-02 3.668e-03
 2.468e-04 8.791e-03 5.672e-02 1.708e-01 1.076e-01 2.648e-02 1.578e-02
 9.459e-03 3.243e-04 1.370e-03 5.802e-02 3.118e-03 8.952e-02 4.883e-02
 8.460e-02 1.317e-02 3.532e-02 2.528e-02 1.577e-02 1.269e-03 4.575e-02
 2.807e-02 2.288e-03 1.527e-02 6.372e-03 1.404e-02 8.341e-04 1.399e-02
 2.742e-04 1.140e-04 4.045e-04 1.198e-02 4.359e-03 1.179e-04 1.061e-03
 9.607e-05 4.770e-03 2.806e-03 3.822e-04 2.690e-04 9.630e-05 5.315e-03
 2.815e-04 4.029e-03 6.336e-03 9.906e-03 4.747e-04 1.374e-03 6.444e-05
 2.726e-04 7.147e-05 1.826e-04 1.953e-03 1.409e-03 1.173e-03 8.657e-05
 3.472e-03 2.866e-05]


In [83]:
sorted(list(zip(names, model.feature_importances_)), key=lambda tup:tup[1], reverse=True)

[('age', 0.17084381456194841),
 ('campaign', 0.10764455084485466),
 ('job', 0.08952188867652652),
 ('education', 0.08460459539760805),
 ('euribor3m', 0.0580226563148596),
 ('day', 0.056723501886880315),
 ('marital', 0.04883293869369102),
 ('day_of_week_last_contact', 0.0457463346994218),
 ('housing', 0.03532021164587391),
 ('poutcome', 0.028073821307882857),
 ('pdays', 0.026475444978892714),
 ('loan', 0.025283874462812633),
 ('previous', 0.01577853289415438),
 ('contact', 0.015771761189674598),
 ('Deposit interest rate', 0.015270125300590492),
 ('Business credit interest rate', 0.014040150657070965),
 ('Unemployment rate', 0.013989876622913247),
 ('default', 0.013167731792014897),
 ('Foreign exchange reserves', 0.011978897577578213),
 ('emplmnt_industry_index', 0.011350156697195071),
 ('Household consumption', 0.009905947546647132),
 ('emp.var.rate', 0.009459388062056538),
 ('year', 0.008790839429067408),
 ('Mortgage credit interest rate', 0.006371630261835744),
 ('Money supply (broad 

Important Visualizations