### Feature Importances:

- Objective: identify most important features for ML algorithm
- Method: first use univariate tests to discard poor features (important when dealing with many features).
    Then use recursive feature elimination and feature importances to get a feel for the ranking among the remaining features.


Dilemma: should we tune a model before feeding it to RFECV? Answer: first loosely tune the models, then use regularized decision trees for a potentially more rigorous answer:
            https://arxiv.org/pdf/1201.1587.pdf

In [35]:
# Load IPython's autoreload extension. Re-running this cell in a live kernel
# prints an "already loaded" notice, as the captured output shows.
%load_ext autoreload

# Mode 2: reload imported modules before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

In [32]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
# Show floats with three decimals in pandas output. A single assignment is
# enough: a second float_format setting would simply replace the first.
pd.options.display.float_format = lambda value: '%.3f' % value

In [10]:
# Read the pickled frame and drop 'duration' in one step: that variable
# cannot be used in our baseline.
df = pd.read_pickle('data/pickle_files/df_pickle').drop(columns=['duration'])

In [11]:
# Check that every column loaded as float64 or int64. A bare expression in
# the middle of a cell is never displayed, so the check is enforced with an
# assertion instead of relying on eyeballing an (absent) output.
non_numeric_cols = df.select_dtypes(exclude=['float64', 'int64']).columns
assert non_numeric_cols.empty, (
    "Unexpected non-numeric columns: %s" % list(non_numeric_cols)
)

# Separate the 'y' column (target) from the remaining columns (features).
y = df['y']
X = df.drop(columns=['y'])

In [14]:
# Keep the feature labels as a plain list so scores can be paired with them later.
names = X.columns.tolist()

In [15]:
# Fit the standardizer on X and transform X in a single call; `scaler`
# remains the fitted object afterwards.
scaler = preprocessing.StandardScaler()
X_transformed = scaler.fit_transform(X)

In [17]:
# Rebind `y` to a NumPy array built from the previous `y`.
y = np.array(y)

In [18]:
# Rebind `X` to a copy of the scaler output. The column labels were saved in
# `names` beforehand, so later cells pair scores with `names` rather than `X`.
X = X_transformed.copy()

Univariate Feature Selection

In [23]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [53]:
# Limit NumPy's printed precision to three decimals for the summaries below.
np.set_printoptions(precision=3)

# Score every feature with the F-classifier scoring function; k='all' keeps
# all features so the full set of scores can be inspected.
test = SelectKBest(f_classif, k='all')
fit = test.fit(X, y)

In [58]:
# (feature, score) pairs, highest score first.
sorted(zip(names, fit.scores_), key=lambda pair: pair[1], reverse=True)

[('nr.employed', 5926.610646027028),
 ('pdays', 4861.173272050324),
 ('euribor3m', 4309.479048107382),
 ('emp.var.rate', 4023.829925482691),
 ('previous', 2304.2570879403065),
 ('contact', 881.7070703063209),
 ('cons.price.idx', 778.5897856862763),
 ('poutcome', 705.6781644424019),
 ('default', 410.59078467352856),
 ('campaign', 182.1566728780736),
 ('education', 138.05175550828088),
 ('cons.conf.idx', 124.40997473778026),
 ('marital', 88.10704673114796),
 ('age', 38.094659288983294),
 ('job', 26.00986466893916),
 ('day_of_week', 10.503095880443485),
 ('housing', 5.496653388850058),
 ('month', 1.5149396996257602),
 ('loan', 0.9923712541373586)]

In [60]:
# Same as above, with mutual information as the scoring function.
# mutual_info_classif adds random noise to continuous features, so its scores
# change from run to run unless random_state is fixed. Wrapping it in a lambda
# pins the seed while keeping the (X, y) call signature SelectKBest expects.
test = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=42
    ),
    k='all',
)
fit = test.fit(X, y)
# summarize scores
np.set_printoptions(precision=3)

In [61]:
# (feature, mutual-information score) pairs, highest score first.
sorted(zip(names, fit.scores_), key=lambda pair: pair[1], reverse=True)

[('euribor3m', 0.07362531819079576),
 ('cons.conf.idx', 0.0693597224867446),
 ('cons.price.idx', 0.06794686386975957),
 ('nr.employed', 0.06418966306020213),
 ('emp.var.rate', 0.056370561691607035),
 ('pdays', 0.03361003561681852),
 ('poutcome', 0.03199249344529087),
 ('month', 0.02560080989118152),
 ('previous', 0.01979470817988993),
 ('age', 0.014215531098497491),
 ('contact', 0.013742130404887698),
 ('job', 0.012247581717139022),
 ('default', 0.009371262832790883),
 ('campaign', 0.00405829100507793),
 ('education', 0.0037674607712745445),
 ('marital', 0.002882621775037064),
 ('housing', 0.002699477033664799),
 ('loan', 0.002319820994676869),
 ('day_of_week', 0.0005053778487242866)]

Recursive Feature Elimination

In [3]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [66]:
# Recursive feature elimination around a logistic regression, keeping 10
# features. n_features_to_select is passed by keyword: the positional form is
# deprecated and rejected by newer scikit-learn releases.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X, y)

In [78]:
# Report how many features RFE kept. `fit.support_` and `fit.ranking_` hold
# the selection mask and the per-feature ranking if more detail is needed.
n_selected = fit.n_features_
print("Num Features Selected: %d" % n_selected)

Num Features Selected: 10


In [79]:
# (feature, RFE rank) pairs in ascending rank order; rank 1 marks a selected feature.
sorted(zip(names, fit.ranking_), key=lambda pair: pair[1])

[('campaign', 1),
 ('pdays', 1),
 ('emp.var.rate', 1),
 ('cons.price.idx', 1),
 ('euribor3m', 1),
 ('nr.employed', 1),
 ('default', 1),
 ('contact', 1),
 ('month', 1),
 ('poutcome', 1),
 ('cons.conf.idx', 2),
 ('day_of_week', 3),
 ('education', 4),
 ('marital', 5),
 ('age', 6),
 ('previous', 7),
 ('housing', 8),
 ('job', 9),
 ('loan', 10)]

Feature Importance with Decision Trees

In [70]:
from sklearn.ensemble import ExtraTreesClassifier

In [71]:
# Extra-trees builds randomised trees, so the importances change between runs
# unless the seed is fixed; random_state makes the ranking reproducible.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
print(model.feature_importances_)

[0.177 0.104 0.061 0.013 0.049 0.023 0.027 0.099 0.043 0.091 0.049 0.08
 0.012 0.036 0.026 0.013 0.015 0.062 0.02 ]


In [81]:
# (feature, importance) pairs, most important first.
sorted(zip(names, model.feature_importances_), key=lambda pair: pair[1], reverse=True)

[('age', 0.1767341674850583),
 ('campaign', 0.10379980130475966),
 ('euribor3m', 0.09925860005973888),
 ('job', 0.09126972324127354),
 ('education', 0.08045474640321133),
 ('day_of_week', 0.06222333962791955),
 ('pdays', 0.06078769784667078),
 ('emp.var.rate', 0.049393155853880356),
 ('marital', 0.048805752559230864),
 ('nr.employed', 0.04347115348941223),
 ('housing', 0.03586110429424358),
 ('cons.conf.idx', 0.026771841783661994),
 ('loan', 0.025977819253523993),
 ('cons.price.idx', 0.023061937406881908),
 ('poutcome', 0.020062518168205562),
 ('month', 0.014638971017091482),
 ('previous', 0.01309584031572274),
 ('contact', 0.012817344626577899),
 ('default', 0.011514485262935354)]

In [None]:
# TODO: run RFE with cross-validation and try a regularized tree estimator.