# Best practices for the project

# ---------------- Feature engineering-------------------------###

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Sklearn 
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

###### feature engineering packages
from feature_engine import missing_data_imputers as mdi
from feature_engine import discretisers as dsc
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from feature_engine.discretisers import DecisionTreeDiscretiser
from feature_engine.outlier_removers import Winsorizer

######## Feature selection packages 

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


#### model selection 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
data = pd.read_csv('exercise_03_train.csv' )

In [3]:
data.shape

(40000, 101)

In [4]:
data['x41'] = data['x41'].astype(str)
data['x41'] = data['x41'].apply(lambda x: x.strip('$'))
data['x41']= data['x41'].astype(float)

data['x41'].mean()

-3.2865869880482257

In [5]:
##### capture the type of each 
### numeric feature with less than 5 unique values is probably discrete 9 integers and not numeric
discrete= [ var for var in data.columns   if data[var].dtype != 'O' and var!='y' and data[var].nunique() < 5 ]
contin= [ var for var in data.columns     if data[var].dtype != 'O' and var!='y' and var not in discrete]
categorical = [var for var in data.columns if data[var].dtype =='O']


print("there are {} discrete features".format(len(discrete)))
print("there are {} continous or numeric features".format(len(contin)))
print("there are {} categorical features".format(len(categorical)))

there are 0 discrete features
there are 95 continous or numeric features
there are 5 categorical features


In [6]:
categorical

['x34', 'x35', 'x45', 'x68', 'x93']

In [7]:
data[categorical].nunique()

x34    10
x35     8
x45    10
x68    12
x93     3
dtype: int64

In [8]:
for ft in categorical:
    print( ft ,"   ", data[ft].unique() )

x34     ['Honda' 'volkswagon' 'ford' 'Toyota' 'bmw' 'chrystler' 'tesla' 'nissan'
 nan 'mercades' 'chevrolet']
x35     ['wed' 'wednesday' 'thurday' 'thur' 'friday' 'tuesday' 'monday' 'fri' nan]
x45     ['0.0%' '-0.0%' '-0.02%' '0.01%' '0.02%' '0.03%' '-0.01%' '-0.03%'
 '-0.04%' '0.04%' nan]
x68     ['July' 'Jun' 'Aug' 'sept.' 'May' 'Apr' 'Oct' 'Mar' 'Dev' 'Nov' nan 'Feb'
 'January']
x93     ['asia' 'america' 'euorpe' nan]


In [9]:

data['x35'] = data['x35'].replace('wed','wednesday' )
data['x35'] = data['x35'].replace('thur','thursday')
data['x35'] = data['x35'].replace('thurday','thursday' )
data['x35'] = data['x35'].replace('fri','friday')
data['x68'] = data['x68'].replace('Dev','Dec')
data['x93'] = data['x93'].replace('euorpe','europe')

In [10]:
for ft in categorical:
    print( ft ,"   ", data[ft].unique() )

x34     ['Honda' 'volkswagon' 'ford' 'Toyota' 'bmw' 'chrystler' 'tesla' 'nissan'
 nan 'mercades' 'chevrolet']
x35     ['wednesday' 'thursday' 'friday' 'tuesday' 'monday' nan]
x45     ['0.0%' '-0.0%' '-0.02%' '0.01%' '0.02%' '0.03%' '-0.01%' '-0.03%'
 '-0.04%' '0.04%' nan]
x68     ['July' 'Jun' 'Aug' 'sept.' 'May' 'Apr' 'Oct' 'Mar' 'Dec' 'Nov' nan 'Feb'
 'January']
x93     ['asia' 'america' 'europe' nan]


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 101 entries, x0 to y
dtypes: float64(95), int64(1), object(5)
memory usage: 30.8+ MB


In [17]:
#### split the data 
X_train, X_test, y_train, y_test = train_test_split(
data.drop('y',axis=1), ### predictors 
data['y'] ,      ### target
test_size=0.1,
    random_state =0
)

In [18]:
for ft in categorical:
    print( ft ,"   ", X_train[ft].unique() )

x34     ['bmw' 'Honda' 'Toyota' 'tesla' 'chrystler' 'volkswagon' 'nissan' 'ford'
 'mercades' 'chevrolet' nan]
x35     ['thursday' 'wednesday' 'friday' 'tuesday' 'monday' nan]
x45     ['0.01%' '-0.01%' '0.0%' '-0.0%' '-0.02%' '0.03%' '0.02%' '-0.03%' nan
 '-0.04%' '0.04%']
x68     ['May' 'Apr' 'July' 'sept.' nan 'Aug' 'Jun' 'Dec' 'Oct' 'Nov' 'Mar' 'Feb'
 'January']
x93     ['asia' 'america' 'europe' nan]


In [19]:
data.isnull().mean().sort_values(ascending= False)

x85    0.000375
x18    0.000350
x65    0.000350
x13    0.000350
x96    0.000325
         ...   
x88    0.000075
x43    0.000050
x83    0.000050
x91    0.000050
y      0.000000
Length: 101, dtype: float64

In [20]:
print(data.shape)

prob_df = data.groupby(['x93'])['y'].mean()
prob_df

(40000, 101)


x93
america    0.210627
asia       0.201710
europe     0.196078
Name: y, dtype: float64

In [21]:
fe_seq=Pipeline([
    
    ### IMPUTE NUMERIC ########
    ('imputer_num',
    mdi.ArbitraryNumberImputer(arbitrary_number= -100,variables = contin)),
    ##### IMPUTE CATEGORICAL #######
    ('imputer_cat',
    mdi.CategoricalVariableImputer(variables=categorical)),
    ##### REMOVE OUTLIERS##############
    ('outlier_rem',Winsorizer(distribution='skewed',
                             tail='both',
                             fold=2.0,
                             variables=contin)),
    ####### REMOVE RARE LABELS ###########
    ('encoder_rare_label',
     ce.RareLabelCategoricalEncoder(tol=0.02,
                                   n_categories=5,
                                   variables=categorical)),
     ##########  ENCODE CATEGORICAL VARIABLES ##########
     #('categorical_encoder',
     #ce.OrdinalCategoricalEncoder(encoding_method='ordered',
     #                            variables=categorical)),
     ########### ENCODE CATEGORICAL VARIABLES ##############
    ('categorical_encoder',
     WoERatioCategoricalEncoder(encoding_method='woe', variables = categorical)
    )
    #################  BIN NUMERICAL VARIABLES ##################
    #('BinDTE',
     # DecisionTreeDiscretiser(variables=contin,regression=False)),
     
     ###################### model is xgb
      #('xgb', XGBClassifier(max_depth=4,learning_rate=0.01))   
])

In [27]:
fe_seq.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('imputer_num',
                 ArbitraryNumberImputer(arbitrary_number=-100,
                                        variables=['x0', 'x1', 'x2', 'x3', 'x4',
                                                   'x5', 'x6', 'x7', 'x8', 'x9',
                                                   'x10', 'x11', 'x12', 'x13',
                                                   'x14', 'x15', 'x16', 'x17',
                                                   'x18', 'x19', 'x20', 'x21',
                                                   'x22', 'x23', 'x24', 'x25',
                                                   'x26', 'x27', 'x28', 'x29', ...])),
                ('imputer_cat',
                 CategoricalVariableImputer(variables=['x34', 'x35', 'x45',
                                                       'x6...
                                       'x13', 'x14', 'x15', 'x16', 'x17', 'x18',
                                       'x19', 'x20', 'x21', 'x22', 

In [28]:
fe_seq.named_steps

{'imputer_num': ArbitraryNumberImputer(arbitrary_number=-100,
                        variables=['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6',
                                   'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13',
                                   'x14', 'x15', 'x16', 'x17', 'x18', 'x19',
                                   'x20', 'x21', 'x22', 'x23', 'x24', 'x25',
                                   'x26', 'x27', 'x28', 'x29', ...]),
 'imputer_cat': CategoricalVariableImputer(variables=['x34', 'x35', 'x45', 'x68', 'x93']),
 'outlier_rem': Winsorizer(distribution='skewed', fold=2.0, tail='both',
            variables=['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8',
                       'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16',
                       'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24',
                       'x25', 'x26', 'x27', 'x28', 'x29', ...]),
 'encoder_rare_label': RareLabelCategoricalEncoder(n_categories=5, tol=0.02,
                

# ----------------- FEATURE SELECTION ---------------------------- 

In [29]:
X_train_fe = fe_seq.transform(X_train)
X_test_fe = fe_seq.transform(X_test)

print (X_train_fe.shape)
print(X_test_fe.shape)

type(X_train_fe)

(36000, 100)
(4000, 100)


pandas.core.frame.DataFrame

In [30]:
sel = VarianceThreshold(threshold=0.05)  # 0.1 indicates 99% of observations approximately
 
sel.fit(X_train_fe)  # fit finds the features with low variance
 
print("We are keeping total of {} features".format(sum(sel.get_support())) )

features_to_keep = X_train_fe.columns[sel.get_support()]  ### put the low variance features in a list


X_train_fe = sel.transform(X_train_fe)
X_test_fe = sel.transform(X_test_fe)

X_train_fe= pd.DataFrame(X_train_fe)
X_train_fe.columns = features_to_keep
 
X_test_fe= pd.DataFrame(X_test_fe)
X_test_fe.columns = features_to_keep

We are keeping total of 95 features


In [31]:
X_train_fe.head(3)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x89,x90,x91,x92,x94,x95,x96,x97,x98,x99
0,169.641904,0.088613,-59.990029,-4.151638,1.84348,4.227388,-1.532988,-0.8536,-29.899352,-6.527814,...,-6.41478,80.894934,1.093422,-0.567839,-3.891708,-17.854918,3.092545,-9.385536,0.419702,-1.228195
1,-3.526688,-0.2684,37.806397,-17.839056,-25.652887,-12.467933,-0.251797,-6.662036,30.391301,5.768376,...,-16.473919,-44.267839,1.389046,-0.535372,-2.130634,20.987745,-8.050412,-18.010767,0.703264,3.347602
2,161.642454,-0.444992,-7.470376,16.23246,-31.172866,-29.116958,0.820005,-13.101334,23.864927,9.19368,...,-4.27032,79.699387,0.423168,-1.628482,-6.364589,21.12976,-20.371733,0.433292,-0.922341,-8.360633


In [32]:
#### remove correlated features 
# find and remove correlated features
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train_fe, 0.9)
print('correlated features: ', len(set(corr_features)) )

X_train_fe.drop(labels=corr_features, axis=1, inplace=True)
X_test_fe.drop(labels=corr_features, axis=1, inplace=True)

X_train_fe = pd.DataFrame(X_train_fe)
X_test_fe = pd.DataFrame(X_test_fe)

X_train_fe.shape, X_test_fe.shape

correlated features:  0


((36000, 95), (4000, 95))

In [33]:
######## Feature selection with random forest 
sel_ = SelectFromModel(RandomForestClassifier(max_depth = 4 , n_estimators=300 , random_state=0))
sel_.fit(X_train_fe, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True,
                                                 class_weight=None,
                                                 criterion='gini', max_depth=4,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=300, n_jobs=None,
                                                 oob_score=False,
                                                 random_state=0, verbose=0,
                                                 warm_sta

In [34]:
# sklearn will select those features which importance values
# are greater than the mean of all the coefficients.
sel_.get_support()

array([False,  True,  True,  True, False,  True, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False,  True,  True, False, False,  True, False, False,
       False, False,  True,  True, False, False, False, False,  True,
       False,  True, False, False, False, False,  True, False, False,
        True, False,  True,  True, False,  True,  True, False,  True,
       False, False,  True, False, False, False, False,  True, False,
        True, False, False, False, False, False, False, False, False,
       False,  True,  True, False,  True])

In [35]:
selected_feat = X_train_fe.columns[(sel_.get_support())]
print(len(selected_feat) )

selected_feat

28


Index(['x1', 'x2', 'x3', 'x5', 'x10', 'x21', 'x22', 'x37', 'x40', 'x41', 'x44',
       'x50', 'x51', 'x56', 'x58', 'x63', 'x66', 'x69', 'x70', 'x72', 'x73',
       'x75', 'x78', 'x83', 'x85', 'x96', 'x97', 'x99'],
      dtype='object')

In [36]:
X_train_fe =  X_train_fe[selected_feat]
X_test_fe =  X_test_fe[selected_feat]

print(type(X_train_fe))
print(type(X_test_fe))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [37]:
#model = XGBClassifier(max_depth=4,learning_rate=0.01)
model = CatBoostClassifier(max_depth=4,learning_rate=0.01)

model.fit(X_train_fe,y_train)

0:	learn: 0.6876002	total: 167ms	remaining: 2m 46s
1:	learn: 0.6821042	total: 179ms	remaining: 1m 29s
2:	learn: 0.6773076	total: 191ms	remaining: 1m 3s
3:	learn: 0.6721560	total: 203ms	remaining: 50.6s
4:	learn: 0.6673186	total: 215ms	remaining: 42.9s
5:	learn: 0.6620110	total: 227ms	remaining: 37.7s
6:	learn: 0.6569184	total: 240ms	remaining: 34s
7:	learn: 0.6522135	total: 252ms	remaining: 31.2s
8:	learn: 0.6476014	total: 264ms	remaining: 29.1s
9:	learn: 0.6432540	total: 276ms	remaining: 27.4s
10:	learn: 0.6384598	total: 288ms	remaining: 25.9s
11:	learn: 0.6343397	total: 300ms	remaining: 24.7s
12:	learn: 0.6299898	total: 312ms	remaining: 23.7s
13:	learn: 0.6257428	total: 324ms	remaining: 22.8s
14:	learn: 0.6218758	total: 335ms	remaining: 22s
15:	learn: 0.6178125	total: 347ms	remaining: 21.3s
16:	learn: 0.6139072	total: 365ms	remaining: 21.1s
17:	learn: 0.6102393	total: 380ms	remaining: 20.7s
18:	learn: 0.6066250	total: 392ms	remaining: 20.3s
19:	learn: 0.6027649	total: 406ms	remaining

167:	learn: 0.4147030	total: 2.42s	remaining: 12s
168:	learn: 0.4143128	total: 2.44s	remaining: 12s
169:	learn: 0.4139023	total: 2.45s	remaining: 12s
170:	learn: 0.4133885	total: 2.46s	remaining: 11.9s
171:	learn: 0.4129059	total: 2.48s	remaining: 11.9s
172:	learn: 0.4124743	total: 2.49s	remaining: 11.9s
173:	learn: 0.4120142	total: 2.5s	remaining: 11.9s
174:	learn: 0.4115557	total: 2.51s	remaining: 11.8s
175:	learn: 0.4110754	total: 2.52s	remaining: 11.8s
176:	learn: 0.4106466	total: 2.54s	remaining: 11.8s
177:	learn: 0.4102796	total: 2.55s	remaining: 11.8s
178:	learn: 0.4098983	total: 2.56s	remaining: 11.7s
179:	learn: 0.4095110	total: 2.57s	remaining: 11.7s
180:	learn: 0.4090695	total: 2.58s	remaining: 11.7s
181:	learn: 0.4087482	total: 2.6s	remaining: 11.7s
182:	learn: 0.4083422	total: 2.62s	remaining: 11.7s
183:	learn: 0.4079192	total: 2.63s	remaining: 11.7s
184:	learn: 0.4075470	total: 2.64s	remaining: 11.6s
185:	learn: 0.4071550	total: 2.66s	remaining: 11.6s
186:	learn: 0.406723

336:	learn: 0.3630995	total: 4.86s	remaining: 9.55s
337:	learn: 0.3628383	total: 4.87s	remaining: 9.54s
338:	learn: 0.3625427	total: 4.88s	remaining: 9.52s
339:	learn: 0.3622865	total: 4.89s	remaining: 9.5s
340:	learn: 0.3621022	total: 4.91s	remaining: 9.48s
341:	learn: 0.3618412	total: 4.92s	remaining: 9.47s
342:	learn: 0.3616433	total: 4.93s	remaining: 9.45s
343:	learn: 0.3614393	total: 4.95s	remaining: 9.43s
344:	learn: 0.3612162	total: 4.96s	remaining: 9.42s
345:	learn: 0.3609888	total: 4.97s	remaining: 9.4s
346:	learn: 0.3607236	total: 4.98s	remaining: 9.38s
347:	learn: 0.3604751	total: 5s	remaining: 9.36s
348:	learn: 0.3602651	total: 5.01s	remaining: 9.34s
349:	learn: 0.3600795	total: 5.02s	remaining: 9.33s
350:	learn: 0.3598610	total: 5.03s	remaining: 9.31s
351:	learn: 0.3596113	total: 5.05s	remaining: 9.29s
352:	learn: 0.3594160	total: 5.06s	remaining: 9.28s
353:	learn: 0.3591709	total: 5.08s	remaining: 9.27s
354:	learn: 0.3589683	total: 5.09s	remaining: 9.25s
355:	learn: 0.358

504:	learn: 0.3307708	total: 7.06s	remaining: 6.92s
505:	learn: 0.3305669	total: 7.07s	remaining: 6.91s
506:	learn: 0.3304078	total: 7.09s	remaining: 6.89s
507:	learn: 0.3302221	total: 7.1s	remaining: 6.88s
508:	learn: 0.3300810	total: 7.12s	remaining: 6.87s
509:	learn: 0.3299201	total: 7.13s	remaining: 6.85s
510:	learn: 0.3297479	total: 7.15s	remaining: 6.84s
511:	learn: 0.3295877	total: 7.16s	remaining: 6.82s
512:	learn: 0.3294599	total: 7.17s	remaining: 6.81s
513:	learn: 0.3292894	total: 7.18s	remaining: 6.79s
514:	learn: 0.3290794	total: 7.2s	remaining: 6.78s
515:	learn: 0.3289341	total: 7.21s	remaining: 6.76s
516:	learn: 0.3287863	total: 7.22s	remaining: 6.75s
517:	learn: 0.3286218	total: 7.24s	remaining: 6.73s
518:	learn: 0.3284294	total: 7.26s	remaining: 6.73s
519:	learn: 0.3282401	total: 7.28s	remaining: 6.71s
520:	learn: 0.3280654	total: 7.29s	remaining: 6.7s
521:	learn: 0.3279503	total: 7.31s	remaining: 6.69s
522:	learn: 0.3278208	total: 7.32s	remaining: 6.68s
523:	learn: 0.3

666:	learn: 0.3076868	total: 9.3s	remaining: 4.64s
667:	learn: 0.3075853	total: 9.31s	remaining: 4.63s
668:	learn: 0.3074528	total: 9.32s	remaining: 4.61s
669:	learn: 0.3073171	total: 9.34s	remaining: 4.6s
670:	learn: 0.3071904	total: 9.36s	remaining: 4.59s
671:	learn: 0.3070618	total: 9.38s	remaining: 4.58s
672:	learn: 0.3069585	total: 9.39s	remaining: 4.56s
673:	learn: 0.3068282	total: 9.41s	remaining: 4.55s
674:	learn: 0.3067334	total: 9.42s	remaining: 4.54s
675:	learn: 0.3066305	total: 9.44s	remaining: 4.52s
676:	learn: 0.3065374	total: 9.46s	remaining: 4.51s
677:	learn: 0.3064065	total: 9.47s	remaining: 4.5s
678:	learn: 0.3062753	total: 9.49s	remaining: 4.49s
679:	learn: 0.3061472	total: 9.5s	remaining: 4.47s
680:	learn: 0.3060328	total: 9.52s	remaining: 4.46s
681:	learn: 0.3058861	total: 9.53s	remaining: 4.44s
682:	learn: 0.3057407	total: 9.54s	remaining: 4.43s
683:	learn: 0.3056237	total: 9.56s	remaining: 4.42s
684:	learn: 0.3055108	total: 9.57s	remaining: 4.4s
685:	learn: 0.305

826:	learn: 0.2898133	total: 11.5s	remaining: 2.41s
827:	learn: 0.2897082	total: 11.5s	remaining: 2.4s
828:	learn: 0.2895886	total: 11.6s	remaining: 2.38s
829:	learn: 0.2894592	total: 11.6s	remaining: 2.37s
830:	learn: 0.2893849	total: 11.6s	remaining: 2.35s
831:	learn: 0.2893024	total: 11.6s	remaining: 2.34s
832:	learn: 0.2891819	total: 11.6s	remaining: 2.33s
833:	learn: 0.2890942	total: 11.6s	remaining: 2.31s
834:	learn: 0.2889961	total: 11.6s	remaining: 2.3s
835:	learn: 0.2888913	total: 11.6s	remaining: 2.28s
836:	learn: 0.2888059	total: 11.7s	remaining: 2.27s
837:	learn: 0.2887379	total: 11.7s	remaining: 2.26s
838:	learn: 0.2885965	total: 11.7s	remaining: 2.24s
839:	learn: 0.2884726	total: 11.7s	remaining: 2.23s
840:	learn: 0.2883786	total: 11.7s	remaining: 2.21s
841:	learn: 0.2882904	total: 11.7s	remaining: 2.2s
842:	learn: 0.2881865	total: 11.7s	remaining: 2.19s
843:	learn: 0.2880929	total: 11.8s	remaining: 2.17s
844:	learn: 0.2880041	total: 11.8s	remaining: 2.16s
845:	learn: 0.2

988:	learn: 0.2749535	total: 13.7s	remaining: 153ms
989:	learn: 0.2748610	total: 13.7s	remaining: 139ms
990:	learn: 0.2747893	total: 13.7s	remaining: 125ms
991:	learn: 0.2747159	total: 13.8s	remaining: 111ms
992:	learn: 0.2746404	total: 13.8s	remaining: 97.1ms
993:	learn: 0.2745653	total: 13.8s	remaining: 83.2ms
994:	learn: 0.2744774	total: 13.8s	remaining: 69.4ms
995:	learn: 0.2744028	total: 13.8s	remaining: 55.5ms
996:	learn: 0.2742989	total: 13.8s	remaining: 41.6ms
997:	learn: 0.2742251	total: 13.8s	remaining: 27.7ms
998:	learn: 0.2741184	total: 13.9s	remaining: 13.9ms
999:	learn: 0.2740148	total: 13.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x17128be3f28>

In [38]:
X_train_preds = model.predict_proba (X_train_fe)[:,1]
X_test_preds = model.predict_proba(X_test_fe)[:,1]

In [39]:
print("Train AUC:{}".format( roc_auc_score(y_train,X_train_preds)))
print("Test AUC:{}".format( roc_auc_score(y_test,X_test_preds)))

Train AUC:0.9448053341081525
Test AUC:0.9371648926237163
