In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Sklearn 
from sklearn.model_selection import train_test_split , GridSearchCV
#from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

###### feature engineering packages
from feature_engine import missing_data_imputers as mdi
from feature_engine import discretisers as dsc
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from feature_engine.discretisers import DecisionTreeDiscretiser
from feature_engine.outlier_removers import Winsorizer

######## Feature selection packages 

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

### function transformer
from sklearn.preprocessing import FunctionTransformer

#### model selection 
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [64]:
# Columns kept for modelling; 'y' is the target column split off later.
selected_feat = [
    'x1', 'x2', 'x3', 'x5', 'x10', 'x21', 'x22', 'x37', 'x40', 'x41',
    'x44', 'x50', 'x51', 'x56', 'x58', 'x63', 'x66', 'x69', 'x70', 'x72',
    'x73', 'x75', 'x78', 'x83', 'x85', 'x96', 'x97', 'x99', 'y',
]

# Read the training file and immediately narrow it to the selected columns.
data = pd.read_csv('exercise_03_train.csv')[selected_feat]

In [65]:
def doll_float(col):
    """Convert a column of dollar-formatted values to floats.

    Every value is rendered as a string, all '$' characters are removed,
    and the result is cast to float. Because the '$' is removed wherever it
    occurs, both '$-442.47' and '-$442.47' parse to -442.47.

    Parameters
    ----------
    col : pandas.Series
        Values such as '$229.47'. Numeric entries are accepted as well,
        since they are stringified first.

    Returns
    -------
    pandas.Series
        The same values as a float-typed series.

    Raises
    ------
    ValueError
        If a value is still not a valid float literal once '$' is removed.
    """
    as_text = col.astype(str)
    # regex=False: '$' is a regex anchor, so it must be matched literally.
    without_dollar = as_text.str.replace('$', '', regex=False)
    return without_dollar.astype(float)

##### create the pipeline object 
#get_doll_float  = FunctionTransformer(doll_float , validate=False)

In [66]:
# x41 is passed through doll_float so the column is stored as floats.
data['x41'] = data['x41'].pipe(doll_float)
# Display the converted column as the cell output.
data['x41']

0         229.47
1         213.90
2        2207.13
3          82.09
4         273.61
          ...   
39995    -442.47
39996     235.29
39997   -1068.50
39998     786.50
39999    -605.52
Name: x41, Length: 40000, dtype: float64

In [67]:
# Classify every column of `data` by type. The target 'y' is excluded from
# the two numeric lists (but not from `categorical`).
# A numeric feature with fewer than 5 unique values is probably discrete
# (integer-like) rather than truly continuous.
discrete = [
    var for var in data.columns
    if data[var].dtype != 'O' and var != 'y' and data[var].nunique() < 5
]
# Remaining non-object predictors are treated as continuous.
contin = [
    var for var in data.columns
    if data[var].dtype != 'O' and var != 'y' and var not in discrete
]
# Object-dtype columns are treated as categorical.
categorical = [var for var in data.columns if data[var].dtype == 'O']

print("there are {} discrete features".format(len(discrete)))
print("there are {} continous or numeric features".format(len(contin)))
print("there are {} categorical features".format(len(categorical)))

there are 0 discrete features
there are 28 continous or numeric features
there are 0 categorical features


In [68]:
# Hold out 10% of the rows for evaluation; random_state is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='y'),  # predictors
    data['y'],               # target
    test_size=0.1,
    random_state=0,
)

In [69]:
# Preprocessing + model pipeline: impute -> cap outliers -> CatBoost.
seq = Pipeline(steps=[
    # Numeric imputation: fill the `contin` variables with the arbitrary number -100.
    ('imputer_num',
     mdi.ArbitraryNumberImputer(arbitrary_number=-100, variables=contin)),

    # Categorical imputation over the `categorical` variables.
    ('imputer_cat',
     mdi.CategoricalVariableImputer(variables=categorical)),

    # Outlier handling: Winsorizer on both tails of the `contin` variables,
    # using the 'skewed' distribution rule with fold=2.0.
    ('outlier_rem',
     Winsorizer(distribution='skewed', tail='both', fold=2.0, variables=contin)),

    # Disabled categorical steps, kept for reference:
    #   ('encoder_rare_label',
    #    ce.RareLabelCategoricalEncoder(tol=0.02, n_categories=5, variables=categorical)),
    #   ('categorical_encoder',
    #    ce.OrdinalCategoricalEncoder(encoding_method='ordered', variables=categorical)),
    #   ('categorical_encoder',
    #    WoERatioCategoricalEncoder(encoding_method='woe', variables=categorical)),

    # Final estimator: CatBoost classifier.
    ('catboost',
     CatBoostClassifier(max_depth=6, learning_rate=0.01)),
])

In [70]:
# Fit every pipeline step, ending with the CatBoost model, on the training split.
seq.fit(X=X_train, y=y_train)

0:	learn: 0.6871581	total: 34.9ms	remaining: 34.9s
1:	learn: 0.6811539	total: 54.3ms	remaining: 27.1s
2:	learn: 0.6753258	total: 73.4ms	remaining: 24.4s
3:	learn: 0.6694773	total: 93ms	remaining: 23.1s
4:	learn: 0.6638420	total: 114ms	remaining: 22.6s
5:	learn: 0.6582475	total: 134ms	remaining: 22.2s
6:	learn: 0.6528458	total: 154ms	remaining: 21.9s
7:	learn: 0.6480916	total: 173ms	remaining: 21.5s
8:	learn: 0.6427541	total: 193ms	remaining: 21.2s
9:	learn: 0.6374980	total: 214ms	remaining: 21.2s
10:	learn: 0.6324765	total: 239ms	remaining: 21.5s
11:	learn: 0.6274963	total: 260ms	remaining: 21.4s
12:	learn: 0.6225825	total: 280ms	remaining: 21.2s
13:	learn: 0.6176464	total: 300ms	remaining: 21.1s
14:	learn: 0.6132353	total: 320ms	remaining: 21s
15:	learn: 0.6088327	total: 338ms	remaining: 20.8s
16:	learn: 0.6047657	total: 359ms	remaining: 20.7s
17:	learn: 0.6001965	total: 381ms	remaining: 20.8s
18:	learn: 0.5964299	total: 399ms	remaining: 20.6s
19:	learn: 0.5922992	total: 419ms	remaini

163:	learn: 0.3791607	total: 3.71s	remaining: 18.9s
164:	learn: 0.3785762	total: 3.73s	remaining: 18.9s
165:	learn: 0.3779851	total: 3.76s	remaining: 18.9s
166:	learn: 0.3774338	total: 3.78s	remaining: 18.9s
167:	learn: 0.3768494	total: 3.81s	remaining: 18.9s
168:	learn: 0.3762136	total: 3.83s	remaining: 18.8s
169:	learn: 0.3757294	total: 3.85s	remaining: 18.8s
170:	learn: 0.3752272	total: 3.88s	remaining: 18.8s
171:	learn: 0.3747585	total: 3.9s	remaining: 18.8s
172:	learn: 0.3742144	total: 3.93s	remaining: 18.8s
173:	learn: 0.3737521	total: 3.95s	remaining: 18.8s
174:	learn: 0.3731740	total: 3.97s	remaining: 18.7s
175:	learn: 0.3725889	total: 4s	remaining: 18.7s
176:	learn: 0.3721554	total: 4.02s	remaining: 18.7s
177:	learn: 0.3716681	total: 4.04s	remaining: 18.7s
178:	learn: 0.3711402	total: 4.06s	remaining: 18.6s
179:	learn: 0.3706365	total: 4.09s	remaining: 18.6s
180:	learn: 0.3701344	total: 4.11s	remaining: 18.6s
181:	learn: 0.3696101	total: 4.14s	remaining: 18.6s
182:	learn: 0.36

327:	learn: 0.3156671	total: 8.01s	remaining: 16.4s
328:	learn: 0.3153938	total: 8.03s	remaining: 16.4s
329:	learn: 0.3151183	total: 8.06s	remaining: 16.4s
330:	learn: 0.3148591	total: 8.08s	remaining: 16.3s
331:	learn: 0.3145997	total: 8.12s	remaining: 16.3s
332:	learn: 0.3143383	total: 8.15s	remaining: 16.3s
333:	learn: 0.3140252	total: 8.21s	remaining: 16.4s
334:	learn: 0.3137612	total: 8.25s	remaining: 16.4s
335:	learn: 0.3134808	total: 8.29s	remaining: 16.4s
336:	learn: 0.3131384	total: 8.32s	remaining: 16.4s
337:	learn: 0.3128221	total: 8.34s	remaining: 16.3s
338:	learn: 0.3124618	total: 8.37s	remaining: 16.3s
339:	learn: 0.3121896	total: 8.4s	remaining: 16.3s
340:	learn: 0.3118960	total: 8.44s	remaining: 16.3s
341:	learn: 0.3115972	total: 8.46s	remaining: 16.3s
342:	learn: 0.3113294	total: 8.49s	remaining: 16.3s
343:	learn: 0.3110188	total: 8.53s	remaining: 16.3s
344:	learn: 0.3107288	total: 8.55s	remaining: 16.2s
345:	learn: 0.3104393	total: 8.58s	remaining: 16.2s
346:	learn: 0

486:	learn: 0.2792102	total: 12.1s	remaining: 12.7s
487:	learn: 0.2790295	total: 12.1s	remaining: 12.7s
488:	learn: 0.2788488	total: 12.1s	remaining: 12.7s
489:	learn: 0.2785813	total: 12.2s	remaining: 12.7s
490:	learn: 0.2784153	total: 12.2s	remaining: 12.6s
491:	learn: 0.2782456	total: 12.2s	remaining: 12.6s
492:	learn: 0.2780673	total: 12.2s	remaining: 12.6s
493:	learn: 0.2778698	total: 12.3s	remaining: 12.6s
494:	learn: 0.2776933	total: 12.3s	remaining: 12.5s
495:	learn: 0.2774835	total: 12.3s	remaining: 12.5s
496:	learn: 0.2772766	total: 12.3s	remaining: 12.5s
497:	learn: 0.2770168	total: 12.4s	remaining: 12.5s
498:	learn: 0.2768401	total: 12.4s	remaining: 12.4s
499:	learn: 0.2766311	total: 12.4s	remaining: 12.4s
500:	learn: 0.2764499	total: 12.4s	remaining: 12.4s
501:	learn: 0.2762928	total: 12.5s	remaining: 12.4s
502:	learn: 0.2760842	total: 12.5s	remaining: 12.3s
503:	learn: 0.2758940	total: 12.5s	remaining: 12.3s
504:	learn: 0.2757158	total: 12.5s	remaining: 12.3s
505:	learn: 

647:	learn: 0.2531225	total: 16.2s	remaining: 8.8s
648:	learn: 0.2529992	total: 16.2s	remaining: 8.77s
649:	learn: 0.2528659	total: 16.2s	remaining: 8.75s
650:	learn: 0.2527350	total: 16.3s	remaining: 8.72s
651:	learn: 0.2525682	total: 16.3s	remaining: 8.7s
652:	learn: 0.2524362	total: 16.3s	remaining: 8.67s
653:	learn: 0.2522788	total: 16.3s	remaining: 8.64s
654:	learn: 0.2521772	total: 16.4s	remaining: 8.62s
655:	learn: 0.2520499	total: 16.4s	remaining: 8.59s
656:	learn: 0.2519019	total: 16.4s	remaining: 8.57s
657:	learn: 0.2517736	total: 16.4s	remaining: 8.55s
658:	learn: 0.2516482	total: 16.5s	remaining: 8.52s
659:	learn: 0.2514765	total: 16.5s	remaining: 8.49s
660:	learn: 0.2513092	total: 16.5s	remaining: 8.47s
661:	learn: 0.2511472	total: 16.5s	remaining: 8.44s
662:	learn: 0.2510320	total: 16.6s	remaining: 8.42s
663:	learn: 0.2508772	total: 16.6s	remaining: 8.39s
664:	learn: 0.2507401	total: 16.6s	remaining: 8.36s
665:	learn: 0.2505709	total: 16.6s	remaining: 8.34s
666:	learn: 0.

806:	learn: 0.2334895	total: 20s	remaining: 4.79s
807:	learn: 0.2334096	total: 20.1s	remaining: 4.77s
808:	learn: 0.2333056	total: 20.1s	remaining: 4.74s
809:	learn: 0.2332138	total: 20.1s	remaining: 4.71s
810:	learn: 0.2331154	total: 20.1s	remaining: 4.69s
811:	learn: 0.2330078	total: 20.2s	remaining: 4.67s
812:	learn: 0.2328603	total: 20.2s	remaining: 4.64s
813:	learn: 0.2327427	total: 20.2s	remaining: 4.62s
814:	learn: 0.2326360	total: 20.2s	remaining: 4.59s
815:	learn: 0.2325336	total: 20.2s	remaining: 4.57s
816:	learn: 0.2324222	total: 20.3s	remaining: 4.54s
817:	learn: 0.2323150	total: 20.3s	remaining: 4.51s
818:	learn: 0.2322230	total: 20.3s	remaining: 4.49s
819:	learn: 0.2321310	total: 20.3s	remaining: 4.46s
820:	learn: 0.2319950	total: 20.4s	remaining: 4.44s
821:	learn: 0.2318914	total: 20.4s	remaining: 4.42s
822:	learn: 0.2317311	total: 20.4s	remaining: 4.39s
823:	learn: 0.2316415	total: 20.4s	remaining: 4.37s
824:	learn: 0.2315468	total: 20.5s	remaining: 4.34s
825:	learn: 0.

970:	learn: 0.2172266	total: 24s	remaining: 716ms
971:	learn: 0.2171394	total: 24s	remaining: 692ms
972:	learn: 0.2170462	total: 24s	remaining: 667ms
973:	learn: 0.2169593	total: 24.1s	remaining: 642ms
974:	learn: 0.2168963	total: 24.1s	remaining: 617ms
975:	learn: 0.2167982	total: 24.1s	remaining: 593ms
976:	learn: 0.2167355	total: 24.1s	remaining: 568ms
977:	learn: 0.2166408	total: 24.1s	remaining: 543ms
978:	learn: 0.2165538	total: 24.2s	remaining: 518ms
979:	learn: 0.2164804	total: 24.2s	remaining: 494ms
980:	learn: 0.2163547	total: 24.2s	remaining: 469ms
981:	learn: 0.2162492	total: 24.2s	remaining: 444ms
982:	learn: 0.2161656	total: 24.3s	remaining: 420ms
983:	learn: 0.2160779	total: 24.3s	remaining: 395ms
984:	learn: 0.2159919	total: 24.3s	remaining: 370ms
985:	learn: 0.2159285	total: 24.3s	remaining: 345ms
986:	learn: 0.2158145	total: 24.3s	remaining: 321ms
987:	learn: 0.2157273	total: 24.4s	remaining: 296ms
988:	learn: 0.2156361	total: 24.4s	remaining: 271ms
989:	learn: 0.2155

Pipeline(memory=None,
         steps=[('imputer_num',
                 ArbitraryNumberImputer(arbitrary_number=-100,
                                        variables=['x1', 'x2', 'x3', 'x5',
                                                   'x10', 'x21', 'x22', 'x37',
                                                   'x40', 'x41', 'x44', 'x50',
                                                   'x51', 'x56', 'x58', 'x63',
                                                   'x66', 'x69', 'x70', 'x72',
                                                   'x73', 'x75', 'x78', 'x83',
                                                   'x85', 'x96', 'x97',
                                                   'x99'])),
                ('imputer_cat', CategoricalVariableImputer(variables=[])),
                ('outlier_rem',
                 Winsorizer(distribution='skewed', fold=2.0, tail='both',
                            variables=['x1', 'x2', 'x3', 'x5', 'x10', 'x21',
                      

In [71]:
# Column 1 of predict_proba is kept as the score for each split.
X_train_preds = seq.predict_proba(X_train)[:, 1]
X_test_preds = seq.predict_proba(X_test)[:, 1]

# Report ROC AUC on both splits to compare train vs. held-out performance.
for split_name, y_true, scores in (
    ("Train", y_train, X_train_preds),
    ("Test", y_test, X_test_preds),
):
    print("{} AUC:{}".format(split_name, roc_auc_score(y_true, scores)))

Train AUC:0.9694186920243907
Test AUC:0.959889075630252


In [72]:
##seq.get_params

In [73]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone package and fall back to the vendored copy
# only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline and the held-out scores for later use.
joblib.dump(seq, 'obieSeq.pkl')
joblib.dump(X_test_preds, 'X_test_preds.pkl')

['X_test_preds.pkl']