# Best practices for the project

## Feature engineering

In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Sklearn 
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

###### feature engineering packages
from feature_engine import missing_data_imputers as mdi
from feature_engine import discretisers as dsc
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from feature_engine.discretisers import DecisionTreeDiscretiser
from feature_engine.outlier_removers import Winsorizer

######## Feature selection packages 
from sklearn.feature_selection import VarianceThreshold

In [110]:
# Load the training set into a DataFrame.
data = pd.read_csv("exercise_03_train.csv")

In [111]:
# Show (rows, columns) of the freshly loaded frame.
data.shape

(40000, 101)

In [112]:
# Preview the first three rows.
data.head(3)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
0,83.81233,-0.122672,65.391785,6.323478,-18.511031,2.122648,0.319472,7.650422,7.713315,2.789856,...,5.482352,12.211997,asia,-3.201565,-9.837196,-21.092011,4.67114,0.588994,-11.417083,0
1,-0.445312,-0.436077,12.981759,30.716674,-34.981679,-16.712862,0.530105,8.740222,56.044533,0.075118,...,-6.083035,-5.098374,asia,5.433036,91.724841,4.305371,4.32913,0.371513,8.474528,0
2,82.927148,0.075277,-29.096012,-20.176841,10.109713,-45.994005,0.618132,7.757838,-69.74906,-1.561335,...,0.200787,-7.174907,asia,2.595005,-14.630603,-3.743052,-14.820376,-2.366675,2.613091,0


In [113]:
# Sort every column into one of three buckets:
#   * categorical - object dtype
#   * discrete    - non-object, not the target 'y', fewer than 5 distinct values
#                   (probably integer codes rather than a true numeric quantity)
#   * contin      - every other non-object column except the target 'y'
discrete, contin, categorical = [], [], []
for col in data.columns:
    if data[col].dtype == 'O':
        categorical.append(col)
    elif col != 'y':
        bucket = discrete if data[col].nunique() < 5 else contin
        bucket.append(col)

# Report how many features landed in each bucket.
for template, cols in (
    ("there are {} discrete features", discrete),
    ("there are {} continous or numeric features", contin),
    ("there are {} categorical features", categorical),
):
    print(template.format(len(cols)))

there are 0 discrete features
there are 94 continous or numeric features
there are 6 categorical features


In [114]:
# Display the names of the object-dtype (categorical) columns.
categorical

['x34', 'x35', 'x41', 'x45', 'x68', 'x93']

In [115]:
# Count the distinct values in each categorical column to spot high-cardinality features.
data[categorical].nunique()

x34       10
x35        8
x41    37824
x45       10
x68       12
x93        3
dtype: int64

In [116]:
# Drop 'x41': with 37,824 distinct values in 40,000 rows it has extremely high
# cardinality and is presumably some kind of identifier / primary key.
#
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' makes the cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
data = data.drop(columns=['x41'], errors='ignore')

# Recompute the categorical (object-dtype) column list so it no longer contains 'x41'.
categorical = [var for var in data.columns if data[var].dtype == 'O']
print(categorical)

['x34', 'x35', 'x45', 'x68', 'x93']


In [117]:
# List the distinct labels of each categorical column to look for typos and aliases.
for col_name in categorical:
    distinct_values = data[col_name].unique()
    print(col_name, "   ", distinct_values)

x34     ['Honda' 'volkswagon' 'ford' 'Toyota' 'bmw' 'chrystler' 'tesla' 'nissan'
 nan 'mercades' 'chevrolet']
x35     ['wed' 'wednesday' 'thurday' 'thur' 'friday' 'tuesday' 'monday' 'fri' nan]
x45     ['0.0%' '-0.0%' '-0.02%' '0.01%' '0.02%' '0.03%' '-0.01%' '-0.03%'
 '-0.04%' '0.04%' nan]
x68     ['July' 'Jun' 'Aug' 'sept.' 'May' 'Apr' 'Oct' 'Mar' 'Dev' 'Nov' nan 'Feb'
 'January']
x93     ['asia' 'america' 'euorpe' nan]


In [118]:

# Per-column corrections for abbreviated or misspelled labels:
# weekday names in x35, a month typo in x68, a continent typo in x93.
label_fixes = {
    'x35': {
        'wed': 'wednesday',
        'thur': 'thursday',
        'thurday': 'thursday',
        'fri': 'friday',
    },
    'x68': {'Dev': 'Dec'},
    'x93': {'euorpe': 'europe'},
}
for column, corrections in label_fixes.items():
    data[column] = data[column].replace(corrections)

In [119]:
# Re-inspect the categorical labels to confirm the corrections took effect.
for feature in categorical:
    labels = data[feature].unique()
    print(feature, "   ", labels)

x34     ['Honda' 'volkswagon' 'ford' 'Toyota' 'bmw' 'chrystler' 'tesla' 'nissan'
 nan 'mercades' 'chevrolet']
x35     ['wednesday' 'thursday' 'friday' 'tuesday' 'monday' nan]
x45     ['0.0%' '-0.0%' '-0.02%' '0.01%' '0.02%' '0.03%' '-0.01%' '-0.03%'
 '-0.04%' '0.04%' nan]
x68     ['July' 'Jun' 'Aug' 'sept.' 'May' 'Apr' 'Oct' 'Mar' 'Dec' 'Nov' nan 'Feb'
 'January']
x93     ['asia' 'america' 'europe' nan]


In [120]:
# Hold out 10% of the rows for testing; random_state=0 keeps the split reproducible.
predictors = data.drop('y', axis=1)
target = data['y']
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.1,
    random_state=0,
)

In [121]:
# Show which labels of each categorical column are present in the training split.
for cat_col in categorical:
    train_labels = X_train[cat_col].unique()
    print(cat_col, "   ", train_labels)

x34     ['bmw' 'Honda' 'Toyota' 'tesla' 'chrystler' 'volkswagon' 'nissan' 'ford'
 'mercades' 'chevrolet' nan]
x35     ['thursday' 'wednesday' 'friday' 'tuesday' 'monday' nan]
x45     ['0.01%' '-0.01%' '0.0%' '-0.0%' '-0.02%' '0.03%' '0.02%' '-0.03%' nan
 '-0.04%' '0.04%']
x68     ['May' 'Apr' 'July' 'sept.' nan 'Aug' 'Jun' 'Dec' 'Oct' 'Nov' 'Mar' 'Feb'
 'January']
x93     ['asia' 'america' 'europe' nan]


In [122]:
# Fraction of missing values per column, largest first.
missing_fraction = data.isnull().mean()
missing_fraction.sort_values(ascending=False)

x85    0.000375
x18    0.000350
x65    0.000350
x13    0.000350
x96    0.000325
         ...   
x88    0.000075
x43    0.000050
x83    0.000050
x91    0.000050
y      0.000000
Length: 100, dtype: float64

In [123]:
# Shape after dropping x41.
print(data.shape)

# Mean of the target within each x93 label (rows with a missing x93 are
# left out by groupby's default behaviour).
target_by_x93 = data.groupby(['x93'])['y']
prob_df = target_by_x93.mean()
prob_df

(40000, 100)


x93
america    0.210627
asia       0.201710
europe     0.196078
Name: y, dtype: float64

In [124]:
#### load the models that you want to test
from xgboost import XGBClassifier

In [135]:
# Feature-engineering + model pipeline. The steps run in the listed order:
#   1. imputer_num         - fill missing numeric values with the sentinel -100
#   2. imputer_cat         - fill missing categorical values (imputer defaults)
#   3. outlier_rem         - cap both tails of the numeric features
#   4. encoder_rare_label  - group infrequent categorical labels
#   5. categorical_encoder - weight-of-evidence encoding of the categoricals
#   6. xgb                 - gradient-boosted tree classifier
#
# Alternatives left disabled in the original notebook:
#   * ce.OrdinalCategoricalEncoder(encoding_method='ordered', variables=categorical)
#     in place of the WoE encoder
#   * a 'BinDTE' step: DecisionTreeDiscretiser(variables=contin, regression=False)
#
# NOTE(review): the -100 sentinel from step 1 may be capped by the winsorizer
# in step 3, depending on each feature's range - confirm this is intended.
numeric_imputer = mdi.ArbitraryNumberImputer(arbitrary_number=-100, variables=contin)
categorical_imputer = mdi.CategoricalVariableImputer(variables=categorical)
outlier_capper = Winsorizer(
    distribution='skewed',
    tail='both',
    fold=2.0,
    variables=contin,
)
rare_label_grouper = ce.RareLabelCategoricalEncoder(
    tol=0.02,
    n_categories=5,
    variables=categorical,
)
woe_encoder = WoERatioCategoricalEncoder(encoding_method='woe', variables=categorical)
xgb_model = XGBClassifier(max_depth=4, learning_rate=0.01)

pipeline_steps = [
    ('imputer_num', numeric_imputer),
    ('imputer_cat', categorical_imputer),
    ('outlier_rem', outlier_capper),
    ('encoder_rare_label', rare_label_grouper),
    ('categorical_encoder', woe_encoder),
    ('xgb', xgb_model),
]
fe_seq = Pipeline(pipeline_steps)

In [136]:
# Fit every pipeline step, including the final classifier, on the training split only.
fe_seq.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('imputer_num',
                 ArbitraryNumberImputer(arbitrary_number=-100,
                                        variables=['x0', 'x1', 'x2', 'x3', 'x4',
                                                   'x5', 'x6', 'x7', 'x8', 'x9',
                                                   'x10', 'x11', 'x12', 'x13',
                                                   'x14', 'x15', 'x16', 'x17',
                                                   'x18', 'x19', 'x20', 'x21',
                                                   'x22', 'x23', 'x24', 'x25',
                                                   'x26', 'x27', 'x28', 'x29', ...])),
                ('imputer_cat',
                 CategoricalVariableImputer(variables=['x34', 'x35', 'x45',
                                                       'x6...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bytree=1, gamma=0,
  

In [137]:
# Inspect the pipeline's steps by name.
fe_seq.named_steps

{'imputer_num': ArbitraryNumberImputer(arbitrary_number=-100,
                        variables=['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6',
                                   'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13',
                                   'x14', 'x15', 'x16', 'x17', 'x18', 'x19',
                                   'x20', 'x21', 'x22', 'x23', 'x24', 'x25',
                                   'x26', 'x27', 'x28', 'x29', ...]),
 'imputer_cat': CategoricalVariableImputer(variables=['x34', 'x35', 'x45', 'x68', 'x93']),
 'outlier_rem': Winsorizer(distribution='skewed', fold=2.0, tail='both',
            variables=['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8',
                       'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16',
                       'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24',
                       'x25', 'x26', 'x27', 'x28', 'x29', ...]),
 'encoder_rare_label': RareLabelCategoricalEncoder(n_categories=5, tol=0.02,
                

In [138]:
 # GridSearchCV(my_pipe,param_grid,cv=3,iid=False,n_jobs=-1,scoring='roc_auc')
### n_jobs=-1 tells GridSearchCV to use all available CPUs

In [139]:
#grid_search.fit(X_train, y_train)

In [140]:
# Positive-class probability (column 1 of predict_proba) for the train split, then the test split.
X_train_preds, X_test_preds = (
    fe_seq.predict_proba(split)[:, 1] for split in (X_train, X_test)
)

In [141]:
# ROC AUC on both splits; the gap between the two numbers indicates the degree of overfitting.
for template, truth, scores in (
    ("Train AUC:{}", y_train, X_train_preds),
    ("Test AUC:{}", y_test, X_test_preds),
):
    print(template.format(roc_auc_score(truth, scores)))

Train AUC:0.8909579343688622
Test AUC:0.8691725490196078


In [132]:
# Check the container type of the training predictors, then preview the first three rows.
print(type(X_train))
X_train.head(n=3)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99
31452,169.641904,0.088613,-59.990029,-4.151638,1.84348,4.227388,-1.532988,-0.8536,-29.899352,-6.527814,...,80.894934,1.093422,-0.567839,asia,-3.891708,-17.854918,3.092545,-9.385536,0.419702,-1.228195
17130,-3.526688,-0.2684,37.806397,-17.839056,-25.652887,-12.467933,-0.251797,-6.662036,30.391301,5.768376,...,-44.267839,1.389046,-0.535372,america,-2.130634,20.987745,-8.050412,-18.010767,0.703264,3.347602
38787,161.642454,-0.444992,-7.470376,16.23246,-31.172866,-29.116958,0.820005,-13.101334,23.864927,9.19368,...,79.699387,0.423168,-1.628482,asia,-6.364589,21.12976,-20.371733,0.433292,-0.922341,-8.360633


In [133]:
# Names of the training columns that contain at least one missing value.
null_counts = X_train.isnull().sum()
[feat for feat in X_train.columns if null_counts[feat] > 0]

['x0',
 'x1',
 'x2',
 'x3',
 'x4',
 'x5',
 'x6',
 'x7',
 'x8',
 'x9',
 'x10',
 'x11',
 'x12',
 'x13',
 'x14',
 'x15',
 'x16',
 'x17',
 'x18',
 'x19',
 'x20',
 'x21',
 'x22',
 'x23',
 'x24',
 'x25',
 'x26',
 'x27',
 'x28',
 'x29',
 'x30',
 'x31',
 'x32',
 'x33',
 'x34',
 'x35',
 'x36',
 'x37',
 'x38',
 'x39',
 'x40',
 'x42',
 'x43',
 'x44',
 'x45',
 'x46',
 'x47',
 'x48',
 'x49',
 'x50',
 'x51',
 'x52',
 'x53',
 'x54',
 'x55',
 'x56',
 'x57',
 'x58',
 'x59',
 'x60',
 'x61',
 'x62',
 'x63',
 'x64',
 'x65',
 'x66',
 'x67',
 'x68',
 'x69',
 'x70',
 'x71',
 'x72',
 'x73',
 'x74',
 'x75',
 'x76',
 'x77',
 'x78',
 'x79',
 'x80',
 'x81',
 'x82',
 'x83',
 'x84',
 'x85',
 'x86',
 'x87',
 'x88',
 'x89',
 'x90',
 'x91',
 'x92',
 'x93',
 'x94',
 'x95',
 'x96',
 'x97',
 'x98',
 'x99']