### Feature Importance and Selection:

- Objective: identify most important features for ML algorithm and then output a selected dataset
- Method: first use univariate tests to discard poor features (important when dealing with many features).
    Then use recursive feature elimination and feature importances to get a feel for the ranking among the remaining features.


Dilemma: should we tune a model before feeding it to RFECV? Answer: first loosely tune the models, then use regularized decision trees for a potentially more rigorous answer:
            https://arxiv.org/pdf/1201.1587.pdf

In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
# Show floats with three decimals in DataFrame/Series output.
# (A preceding two-decimal setting was immediately overridden by this one,
# so only the effective format is kept.)
pd.set_option('display.float_format', '{:.3f}'.format)

In [6]:
# Load df1 as augmented in iteration 2, then drop 'duration' (it cannot be
# used in our baseline) together with the 'Date' column.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = (
    pd.read_pickle('../data/pickle_files/df_pickle_w_all_stats')
    .drop(columns=['duration', 'Date'])
)

In [7]:
def scale_noncat_only(df):
    """Standardise the non-categorical columns of ``df`` and split off the target.

    Columns with dtype ``int8`` are treated as categorical and passed through
    unchanged; every other column is standardised with
    ``preprocessing.StandardScaler``. In the returned feature matrix the
    scaled columns come first, followed by the ``int8`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features plus a target column named ``'y'``.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix with ``'y'`` removed, one row per row of ``df``.
    y : numpy.ndarray
        The values of ``df['y']``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'y'`` column.
    """
    X_temp_noncat = df.select_dtypes(exclude=['int8'])
    X_temp_cat = df.select_dtypes(include=['int8'])

    if X_temp_noncat.shape[1] > 0:
        scaler = preprocessing.StandardScaler().fit(X_temp_noncat)
        # Keep the original index: pd.concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and add NaN rows) whenever df
        # does not carry a default 0..n-1 index.
        X_scaled = pd.DataFrame(
            scaler.transform(X_temp_noncat),
            columns=X_temp_noncat.columns,
            index=X_temp_noncat.index,
        )
    else:
        # Nothing to scale; fitting a scaler on zero columns would fail.
        X_scaled = X_temp_noncat

    X = pd.concat([X_scaled, X_temp_cat], axis=1)
    X = X.drop(columns=['y'])
    y = df['y']
    # Downstream code works with numpy arrays.
    y = np.array(y)
    X = np.array(X)

    return X, y

# Build the scaled feature matrix and target arrays from the loaded frame.
# NOTE(review): X and y are rebound from the unscaled df in In [9] below, so
# these scaled arrays are not what the feature-selection cells use - confirm intent.
X, y = scale_noncat_only(df)

In [8]:
# Check that dtypes loaded correctly: every column should be float64, int64 or int8.
# Fail loudly instead of relying on a visual check, and still display the
# (expected-empty) index of offending columns.
unexpected_dtype_cols = df.select_dtypes(exclude=['float64', 'int64', 'int8']).columns
assert unexpected_dtype_cols.empty, "Unexpected dtypes in columns: %s" % list(unexpected_dtype_cols)
unexpected_dtype_cols

Index([], dtype='object')

In [9]:
# Split the frame into the feature table and the target column, both kept as
# pandas objects.
# NOTE(review): this rebinds X and y, replacing the arrays returned by
# scale_noncat_only(df) in In [7]; the cells below therefore run on unscaled values.
X, y = df.drop(columns=['y']), df['y']

In [10]:
# (rows, feature columns) of the feature table after removing the target 'y'.
X.shape

(41188, 72)

In [11]:
# Feature names in column order, used to label the scores computed below.
names = X.columns.tolist()

Univariate Feature Selection

In [12]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [13]:
# Score every feature against the target with the f_classif scoring function.
# k='all' keeps every column, so the selector is used purely to obtain scores.
np.set_printoptions(precision=3)  # compact numpy output when summarising scores
test = SelectKBest(k='all', score_func=f_classif)
fit = test.fit(X, y)

In [14]:
# (feature, F-score) pairs, highest score first.
sorted(zip(names, fit.scores_), key=lambda pair: pair[1], reverse=True)

[('Household credit', 6423.957630542542),
 ('Mortgage credit', 6259.710075255824),
 ('Unemployment rate', 6215.328224329231),
 ('Foreign exchange reserves', 6127.619935911332),
 ('nr.employed', 5926.610601111052),
 ('Government debt', 5893.926455086986),
 ('Employment', 5830.594300707896),
 ('Minimum wage', 5709.937349795137),
 ('year', 5668.675543704237),
 ('emplmnt_industry_index', 5488.271703529525),
 ('Private sector credit', 5171.645855551422),
 ('pdays', 4861.173272049289),
 ('Labor cost', 4670.487138412593),
 ('Business credit interest rate', 4556.632400328948),
 ('Deposit interest rate', 4432.398191653248),
 ('Consumer credit', 4375.106973760946),
 ('Investment as percent of GDP', 4361.241416793941),
 ('euribor3m', 4309.479048109171),
 ('Mortgage credit interest rate', 4115.273907037838),
 ('emp.var.rate', 4023.829925487281),
 ('Investment', 3577.29704945438),
 ('Business credit', 3565.6933778955704),
 ('Government expenditure', 3306.875885293535),
 ('financial_past', 3102.0897

In [15]:
# Same as above, using the mutual-information scoring function.
# mutual_info_classif is randomised (it adds small noise to continuous
# variables), so the seed is pinned to make the ranking reproducible across runs.
MI_RANDOM_STATE = 42
test = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=MI_RANDOM_STATE
    ),
    k='all',
)
fit = test.fit(X, y)
# summarize scores
np.set_printoptions(precision=3)

In [16]:
# (feature, mutual-information score) pairs, highest score first.
sorted(zip(names, fit.scores_), key=lambda pair: pair[1], reverse=True)

[('euribor3m', 0.07276930652382996),
 ('Money supply (broad money)', 0.07225833302885643),
 ('Inflation annual', 0.07166360224852375),
 ('Exchange rate to USD', 0.07048228652568334),
 ('Private sector credit', 0.07036034343371145),
 ('financial_past', 0.07009019362280933),
 ('Business confidence survey', 0.06998037169595728),
 ('Business credit', 0.06992931675507719),
 ('Exports', 0.06944044594647614),
 ('major_purch_expect', 0.06934259720773928),
 ('inflation_delta_expect', 0.06897443762801347),
 ('cons.conf.idx', 0.06892500805118829),
 ('cons.price.idx', 0.06882659674166702),
 ('ind_turnover', 0.06866267573099272),
 ('FDI', 0.06845983522365229),
 ('Deposit interest rate', 0.0683872474471261),
 ('Business credit interest rate', 0.06826169049077113),
 ('Consumer Price Index (CPI)', 0.06824246104083742),
 ('economy_expect', 0.0682210793244149),
 ('Mortgage credit', 0.06815931080572768),
 ('unempl_expect', 0.06771224458472092),
 ('railway_passengers', 0.06751540491259456),
 ('Employment'

Recursive Feature Elimination

In [17]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [18]:
# Recursive feature elimination with a logistic-regression estimator, keeping 20 features.
# n_features_to_select is passed by keyword: newer scikit-learn releases reject
# it as a positional argument, while the keyword form works in all versions.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=20)
fit = rfe.fit(X, y)

In [19]:
# Report how many features RFE retained.
# fit.support_ (selection mask) and fit.ranking_ (per-feature rank) hold the details.
print("Num Features Selected: %d" % fit.n_features_)

Num Features Selected: 20


In [20]:
# (feature, RFE rank) pairs, best rank (1 = selected) first.
sorted(zip(names, fit.ranking_), key=lambda pair: pair[1])

[('economy_expect', 1),
 ('economy_past', 1),
 ('savings_expect', 1),
 ('month', 1),
 ('previous', 1),
 ('emp.var.rate', 1),
 ('default', 1),
 ('contact', 1),
 ('poutcome', 1),
 ('Consumer credit', 1),
 ('Unemployment rate', 1),
 ('Labor cost', 1),
 ('Private sector credit', 1),
 ('Economic growth', 1),
 ('Consumption growth', 1),
 ('Inflation monthly', 1),
 ('Inflation annual', 1),
 ('Business confidence survey', 1),
 ('Consumer confidence survey', 1),
 ('Economic growth Q-on-Q', 1),
 ('month_last_contact', 2),
 ('cons.conf.idx', 3),
 ('Retail sales', 4),
 ('Consumer Price Index (CPI)', 5),
 ('ind_turnover', 6),
 ('major_purch_opportu', 7),
 ('savings_delta_expect', 8),
 ('inflation_delta_expect', 9),
 ('day_of_week_last_contact', 10),
 ('marital', 11),
 ('Money supply (broad money)', 12),
 ('campaign', 13),
 ('unempl_expect', 14),
 ('Investment growth', 15),
 ('Consumption as percent of GDP', 16),
 ('Business credit interest rate', 17),
 ('housing', 18),
 ('education', 19),
 ('major_

Feature Importance with Decision Trees

In [21]:
from sklearn.ensemble import ExtraTreesClassifier

  from numpy.core.umath_tests import inner1d


In [22]:
# Fit an extra-trees ensemble and print its impurity-based feature importances.
# The ensemble is randomised, so the seed is pinned to keep the importances
# (and the ranking derived from them) reproducible across runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
print(model.feature_importances_)

[6.827e-04 2.141e-03 8.969e-04 4.541e-04 8.221e-04 2.421e-04 1.746e-04
 2.366e-03 1.564e-02 1.006e-03 1.855e-03 3.260e-04 1.589e-02 2.342e-03
 2.527e-04 1.882e-03 5.379e-02 1.739e-01 1.063e-01 2.229e-02 1.585e-02
 3.119e-04 1.143e-03 5.454e-03 6.444e-02 1.398e-02 8.960e-02 4.749e-02
 8.265e-02 1.162e-02 3.624e-02 2.731e-02 1.560e-02 4.660e-04 4.623e-02
 2.960e-02 7.662e-03 1.054e-02 1.242e-04 6.527e-04 1.762e-02 2.996e-04
 2.447e-04 9.233e-05 1.201e-03 1.376e-02 2.369e-04 5.392e-05 1.443e-03
 5.100e-04 9.133e-03 1.577e-04 4.014e-03 4.045e-04 8.159e-04 8.215e-03
 2.077e-04 4.437e-04 5.630e-03 3.906e-03 1.242e-03 5.127e-04 4.124e-03
 6.363e-03 3.261e-04 2.828e-03 7.107e-04 9.886e-04 1.034e-03 1.506e-03
 5.721e-04 1.122e-03]


In [23]:
# (feature, importance) pairs, most important first.
sorted(zip(names, model.feature_importances_), key=lambda pair: pair[1], reverse=True)

[('age', 0.17394574587826778),
 ('campaign', 0.1063372508280613),
 ('job', 0.08959918893254333),
 ('education', 0.08265002252322014),
 ('euribor3m', 0.06443561127536988),
 ('day', 0.05379034482149064),
 ('marital', 0.04748723900154844),
 ('day_of_week_last_contact', 0.046233096385619864),
 ('housing', 0.036235347804725426),
 ('poutcome', 0.02960386382420534),
 ('loan', 0.027309532424848144),
 ('pdays', 0.022288053956636487),
 ('Employment', 0.01761709886350751),
 ('emplmnt_industry_index', 0.015886023897514163),
 ('previous', 0.01585280683282249),
 ('financial_past', 0.015636202068373916),
 ('contact', 0.015603900240354063),
 ('nr.employed', 0.01397608084004813),
 ('Foreign exchange reserves', 0.013756639709008358),
 ('default', 0.01162123370240324),
 ('Deposit interest rate', 0.010539037324976724),
 ('Minimum wage', 0.00913349021343838),
 ('Mortgage credit', 0.0082148636277978),
 ('Consumer credit', 0.007662238820895296),
 ('Investment as percent of GDP', 0.006362781075606537),
 ('Mon