### Feature Engineering and Selection:

- Objective: engineer new features and analyze their importance
- Method: first use univariate tests to discard poor features (important when dealing with many features).
    Then use recursive feature elimination and feature importances to get a sense of the ranking among the remaining features.


Dilemma: should we tune a model before feeding it to RFECV? Answer: first loosely tune the models, then use regularized decision trees for a potentially more rigorous answer:
            https://arxiv.org/pdf/1201.1587.pdf

In [3]:
%load_ext autoreload

%autoreload 2

In [4]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import os

In [5]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
# Show floats with three decimals in pandas output. The option is set once:
# an earlier two-decimal assignment to the same option was immediately
# overridden and therefore had no effect.
pd.set_option('display.float_format', '{:.3f}'.format)

In [6]:
# List the pickle files available in the data directory
os.listdir('../data/pickle_files/')

['final_df_pickle',
 'df_pickle_w_all_stats',
 'df_pickle',
 'new_df_y_pickle',
 'new_df_pdays_pickle',
 'new_df_camp_pickle',
 'df_pickle_w_time_stats',
 'new_df_prev_pickle']

In [7]:
# Read the final frame written by the feature engineering notebook and
# discard its 'Date' column in the same step.
final_df = pd.read_pickle('../data/pickle_files/final_df_pickle').drop(columns = ['Date'])

In [8]:
def prep_df(df):
    """Split a frame into features, target and the list of feature names.

    The target is the 'y' column. The feature matrix is what remains after
    removing, in order: every column whose name matches the regex 'y',
    the 'duration' column, every column whose name matches one of the
    target-derived flag patterns, and the 'new_lead' column.

    NOTE(review): regex='y' matches any column name containing the letter
    'y', not only the target column - confirm that this breadth is intended.

    Returns
    -------
    (X, y, names) : the feature DataFrame, the target Series and
        ``list(X.columns)``.
    """
    target = df['y']

    y_like_columns = df.filter(regex='y').columns
    features = df.drop(y_like_columns, axis=1).drop(columns='duration')

    # Features that were derived from the target
    for pattern in ("cust_acquisition_flag", "cust_retention_flag", "prospect_conversion_flag"):
        features = features.drop(features.filter(regex=pattern).columns, axis=1)

    # Linearly dependent feature
    features = features.drop(columns = ['new_lead'])

    return features, target, list(features.columns)

In [9]:
# Split the loaded frame into the feature matrix X, the target y and the feature-name list
X, y, names = prep_df(final_df)

Feature Importance:

Starting with an exploration of the most important features with univariate methods and RFE (with CV)

Univariate Feature Selection

In [10]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [11]:
def kbest(X, y, score_func, k):
    """Rank features by a univariate score, best first.

    Parameters
    ----------
    X : feature matrix. When it exposes ``.columns`` (a DataFrame) those
        labels are used to name the scores.
    y : target passed to ``SelectKBest.fit``.
    score_func : scoring callable, e.g. ``f_classif`` or
        ``mutual_info_classif``.
    k : forwarded unchanged to ``SelectKBest``.

    Returns
    -------
    list of ``(feature_name, score)`` tuples sorted by score, descending.
    """
    selector = SelectKBest(score_func=score_func, k=k).fit(X, y)

    # Side effect kept from the original: sets NumPy's global print precision.
    np.set_printoptions(precision=3)

    # Label scores with the columns of the X actually passed in. Zipping
    # against the notebook-level `names` list silently mislabels scores
    # whenever X is a subset or reordering of the full feature matrix.
    # The global is only used as a fallback for inputs without column labels.
    feature_names = list(X.columns) if hasattr(X, 'columns') else names

    return sorted(zip(feature_names, selector.scores_), key=lambda pair: pair[1], reverse=True)

In [13]:
# Score every feature with f_classif; k='all' keeps all of them in the selector
best_cols = kbest(X,y, f_classif, 'all')

In [14]:
# Inspect the (feature, score) pairs, highest score first
best_cols

[('prev_subscr_flag-rolling-sum-60', 6770.790199636067),
 ('prev_subscr_flag-rolling-avg-60', 6770.790199635986),
 ('Household credit', 6423.957630542542),
 ('Mortgage credit', 6259.710075255824),
 ('prev_subscr_flag-rolling-sum-30', 6233.646879118396),
 ('prev_subscr_flag-rolling-avg-30', 6233.646879118375),
 ('Foreign exchange reserves', 6127.619935911332),
 ('Government debt', 5893.926455086986),
 ('Minimum wage', 5709.937349795137),
 ('previous-rolling-sum-60', 5523.056883194985),
 ('previous-rolling-avg-60', 5523.056883194942),
 ('previous-rolling-sum-30', 5239.079764761115),
 ('previous-rolling-avg-30', 5239.079764761028),
 ('Private sector credit', 5171.645855551422),
 ('prev_subscr_flag-rolling-sum-10', 5049.758626397523),
 ('prev_subscr_flag-rolling-avg-10', 5049.75862639751),
 ('Labor cost', 4670.487138412593),
 ('prev_subscr_flag', 4577.560711884804),
 ('Business credit interest rate', 4556.632400328948),
 ('previous-rolling-avg-10', 4453.8642713621175),
 ('previous-rolling-

In [15]:
# Keep the names of the 40 highest-scoring features from the ranking.
# NOTE(review): many of the top-ranked features are economic indicators that
# are highly related to one another; no redundancy filter (e.g. limiting the
# economic indicators to the top 10) is applied here - TODO confirm whether
# one is intended.
rfe_test1 = [feature for feature, _score in best_cols[:40]]

In [16]:
# Sanity check: shape of the frame restricted to the selected features
X[rfe_test1].shape

(41188, 40)

Feature Importances Via Decision Tree Regression

In [17]:
def scale_noncat_only(X):
    """Standardize the non-categorical columns of ``X``; leave the others as is.

    A column is treated as non-categorical when its dtype is not ``int8``
    and its name does not match the regex ``-categorical``.
    (NOTE(review): presumably int8 columns are dummy/flag encodings -
    confirm against the feature engineering notebook.)

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame with the same rows and index as ``X``. The scaled
    columns come first, followed by the untouched columns in their
    original order.
    """
    noncat = X.select_dtypes(exclude=['int8'])
    noncat = noncat.drop(noncat.filter(regex="-categorical").columns, axis=1)

    # Nothing to scale: return an unmodified copy instead of letting the
    # scaler fail on a frame with zero columns.
    if noncat.shape[1] == 0:
        return X.copy()

    scaler = preprocessing.StandardScaler().fit(noncat)

    # Preserve the original index. Rebuilding the frame with a fresh
    # RangeIndex and joining back on index would drop or misalign rows
    # whenever X does not carry a default 0..n-1 index (e.g. a split subset).
    scaled = pd.DataFrame(scaler.transform(noncat), columns=noncat.columns, index=noncat.index)

    untouched = X.drop(columns=noncat.columns)
    return pd.concat([scaled, untouched], axis=1)

In [18]:
# Restrict the feature matrix to the features selected above
X_test1 = X[rfe_test1]

In [19]:
# Derive the scaled frame directly from X so that re-running this cell
# always starts from the unscaled data rather than from its own output.
X_test1 = scale_noncat_only(X[rfe_test1])

  return self.partial_fit(X, y)
  


In [24]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score

In [25]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# NOTE(review): this splits the full X, not the reduced/scaled X_test1 built
# earlier - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state=4)

In [26]:
# n_estimators is pinned to 10, the value the fitted model reports below, so
# results do not silently change on scikit-learn releases whose default differs.
model = RandomForestClassifier(n_estimators = 10, max_depth = 20, min_samples_split = 1000, random_state = 4)

In [27]:
# Fit the forest on the training split
model.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=1000,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=4, verbose=0, warm_start=False)

In [28]:
# Accuracy on the held-out split.
# NOTE(review): if the classes in y are imbalanced, accuracy should be read
# against the majority-class baseline - TODO confirm the class balance.
accuracy_score(y_test, model.predict(X_test))

0.8959698956057296