### Feature Engineering and Selection:

- Objective: engineer new features and analyze their importance
- Method: first use Univariate tests to discard poor features (important when dealing with many features)
    Then, use recursive feature elimination and feature importances to get a feeling for ranking among features left


Dilemma: should we tune a model before feeding it to RFECV? Answer: first loosely tune models and then use regularized decision trees for a potentially more rigorouse answer:
            https://arxiv.org/pdf/1201.1587.pdf

In [5]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import os
np.set_printoptions(precision=3)

In [70]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [71]:
os.listdir('../data/pickle_files')

['df_pickle_w_all_stats',
 'df_pickle',
 'new_df_y_pickle',
 'new_df_pdays_pickle',
 'new_df_camp_pickle',
 'df_pickle_w_time_stats',
 'new_df_prev_pickle']

In [72]:
#Loading df1 after it has been augmented in iteration 2:
new_df_pdays = pd.read_pickle('../data/pickle_files/new_df_pdays_pickle')
#Excluding the duration variable as it cannot be used in our baseline
new_df_pdays = new_df_pdays.drop(columns = ['duration', 'Date'])

In [73]:
#Loading df1 after it has been augmented in iteration 2:
new_df_camp = pd.read_pickle('../data/pickle_files/new_df_camp_pickle')
#Excluding the duration variable as it cannot be used in our baseline
new_df_camp = new_df_camp.drop(columns = ['duration', 'Date'])

In [74]:
#Loading df1 after it has been augmented in iteration 2:
new_df_prev = pd.read_pickle('../data/pickle_files/new_df_prev_pickle')
#Excluding the duration variable as it cannot be used in our baseline
new_df_prev = new_df_prev.drop(columns = ['duration', 'Date'])

In [75]:
#Loading df1 after it has been augmented in iteration 2:
df_all_stats = pd.read_pickle('../data/pickle_files/df_pickle_w_all_stats')
#Excluding the duration variable as it cannot be used in our baseline
df_all_stats = df_all_stats.drop(columns = ['duration', 'Date'])

In [135]:
def prep_df(df):
    y = df['y']
    X = df.drop(df.filter(regex='y').columns, axis=1)
    #Dropping features based on y
    X = X.drop(columns = ['cust_acquisition_flag', 'cust_retention_flag', 'prospect_conversion_flag'])
    #Dropping linearly dependent features
    X = X.drop(columns = ['new_lead'])
    names = list(X.columns)
    return X, y, names

Feature Importance:

Starting with an exploration of the most important features with univariate methods and RFE (with CV)

Univariate Feature Selection

In [93]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [136]:
def kbest(X,y, score_func, k):
    #Use f_classif or mutual_info_classif as scoring function
    test = SelectKBest(score_func=score_func, k=k)
    fit = test.fit(X, y)
    # summarize scores
    np.set_printoptions(precision=3)
    return sorted(list(zip(names,list(fit.scores_))), key=lambda tup:tup[1], reverse=True) 

In [204]:
X_pdays,y,names = prep_df(new_df_pdays)

In [205]:
cols_pdays = kbest(X_pdays,y, f_classif, 'all')

In [206]:
#among the top 70 indicators, many are economic indicators that are highly related to one another
#to avoid redundancy we will pick only the top 10 economic indicators
rfe_test1 = [i[0] for i in cols_pdays][:10]

In [207]:
#the same redundancies will occur if we pick too many campaign related fields 
#so we pick the top 10 campaign related fields and in between features
rfe_test1 += [i[0] for i in cols_pdays if 'pdays' in i[0]][:10]

In [208]:
X_camp,y,names = prep_df(new_df_camp)

In [209]:
cols_camp = kbest(X_camp,y, f_classif, 'all')

In [210]:
rfe_test1+= [i[0] for i in cols_camp][:10]

In [211]:
rfe_test1 += [i[0] for i in cols_camp if 'campaign' in i[0]][:10]

In [212]:
X_prev,y,names = prep_df(new_df_prev)

In [213]:
cols_prev = kbest(X_camp,y, f_classif, 'all')

In [214]:
rfe_test1+= [i[0] for i in cols_prev][:10]

In [215]:
rfe_test1 += [i[0] for i in cols_prev if 'previous' in i[0]][:10]

In [216]:
#Removing Duplicates:
#Final list of columns to use for RFE:
list(set(rfe_test1))

['Mortgage credit',
 'campaign-loan-contact-age_bin',
 'campaign-default-loan-contact-age_bin',
 'Foreign exchange reserves',
 'previous-marital-default-contact-age_bin',
 'previous-default-contact-age_bin',
 'previous-default-loan-contact-age_bin',
 'previous-housing-loan-contact-age_bin',
 'campaign-default-housing-loan-contact-age_bin',
 'campaign-contact-age_bin',
 'campaign-default-housing-contact-age_bin',
 'Minimum wage',
 'Household credit',
 'Labor cost',
 'campaign-default-contact-age_bin',
 'Deposit interest rate',
 'previous',
 'campaign-marital-default-contact-age_bin',
 'prev_subscr_flag',
 'campaign-job-default-contact',
 'previous-default-housing-loan-contact-age_bin',
 'previous-housing-contact-age_bin',
 'previous-contact-age_bin',
 'Private sector credit',
 'Business credit interest rate',
 'campaign-housing-loan-contact-age_bin',
 'previous-default-housing-contact-age_bin',
 'Government debt',
 'campaign-housing-contact-age_bin',
 'previous-loan-contact-age_bin']

Recursive Feature Elimination

In [84]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

In [85]:
model = LogisticRegression(C=1, class_weight='balanced')
rfe = RFE(model, 20)
fit = rfe.fit(X, y)



KeyboardInterrupt: 

In [None]:
print("Num Features Selected: %d" % (fit.n_features_))
#print("Selected Features: %s" % (fit.support_))
#print("Feature Ranking: %s" % (fit.ranking_))

In [None]:
sorted(list(zip(names, fit.ranking_)), key=lambda tup:tup[1], reverse=False)

In [None]:
rfecv = RFECV(model, step=1, cv=5)
fit = rfecv.fit(X, y)

In [None]:
print("Num Features Selected: %d" % (fit.n_features_))

In [None]:
sorted(list(zip(names, fit.ranking_)), key=lambda tup:tup[1], reverse=False)

In [None]:
#Trying the same as above, but this time after scaling non categorical features

In [None]:
def scale_noncat_only(df):
    X_temp_noncat = df.select_dtypes(exclude=['int8'])
    X_temp_cat = df.select_dtypes(include=['int8'])
    scaler = preprocessing.StandardScaler().fit(X_temp_noncat)
    X_transformed = scaler.transform(X_temp_noncat)
    X = pd.concat([pd.DataFrame(X_transformed, columns =X_temp_noncat.columns), X_temp_cat], axis = 1)
    X = X.drop(columns=['y'])
    y = df['y']
    #will work with numpy arrays
    y = np.array(y)
    X = np.array(X)
    
    return X, y

X, y = scale_noncat_only(df)

In [None]:
#Checking dtypes have loaded correctly (should return empty index)
df.select_dtypes(exclude = ['float64', 'int64', 'int8']).columns

In [None]:
y = df['y']
X = df.drop(columns=['y'])

In [None]:
X.shape

In [None]:
rfecv = RFECV(model, step=1, cv=5)
fit = rfecv.fit(X, y)

In [None]:
print("Num Features Selected: %d" % (fit.n_features_))

In [None]:
sorted(list(zip(names, fit.ranking_)), key=lambda tup:tup[1], reverse=False)

Feature Importance with Decision Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)

In [None]:
sorted(list(zip(names, model.feature_importances_)), key=lambda tup:tup[1], reverse=True)

Important Visualizations