In [2]:
### basic package for data science project
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import helpers

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set()

In [3]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [4]:
# Shared imputer for the numerical features; random_state is fixed so that
# repeated runs give the same imputed values.
# NOTE(review): despite the name `imp_mean`, this is an IterativeImputer,
# not a mean imputer.
imp_mean = IterativeImputer(random_state=0)

1. Remove columns in which more than 75% of the values are missing (note: the code below currently applies a 40% threshold).
2. Remove columns 'x39' and 'x99', since those two columns don't bring additional information.
3. Impute missing values (NaN) in the numerical features.
4. For categorical features, either use CatBoost or impute missing values with the most common class.


Alternative (if time permits):
    - Check each feature for outliers.

In [5]:
# Read the raw train and test splits from the local data/ folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/exercise_40_train.csv', 'data/exercise_40_test.csv')
)

In [6]:
# Columns whose fraction of missing values exceeds this threshold are dropped.
MISSING_FRACTION_THRESHOLD = 0.4

# Per-column share of missing values in the training data.
missing_fraction = train_data.isnull().mean()

# Column labels are already unique, so no set() round-trip is needed; keeping
# the original column order also makes the list deterministic across runs.
most_missing_cols = missing_fraction[missing_fraction > MISSING_FRACTION_THRESHOLD].index.tolist()

# Categorical columns removed by hand (judged uninformative in the plan above).
remove_cat_col = ['x39', 'x99']
remove_cols = most_missing_cols + remove_cat_col

In [7]:
# Work on a copy so the raw training frame stays untouched.
df = train_data.copy()
df = df.loc[:, ~df.columns.isin(most_missing_cols + remove_cat_col)]

#### find categorical features (select_dtypes returns a new frame)
cat_df = df.select_dtypes(include=['object'])

#### clean col x7 and x19: strip the '%' / '$' symbols so the values parse as floats.
#### regex=False forces literal matching -- '$' is a regex end-of-string anchor,
#### and the default of `regex` differs between pandas versions.
cat_df['x7'] = cat_df['x7'].str.replace('%', '', regex=False)
cat_df['x19'] = cat_df['x19'].str.replace('$', '', regex=False)
cat_df = cat_df.astype({'x7': 'float', 'x19': 'float'})

#### clean x3 col (element-wise; no need for a row-wise apply)
cat_df['x3'] = cat_df['x3'].map(helpers.clean_day_x3_col)

### convert x7 and x19 back to numerical fts
df['x7'] = cat_df['x7']
df['x19'] = cat_df['x19']

#### categorical cols: x7/x19 are float now, so only true categoricals remain
cat_df = cat_df.drop(['x7', 'x19'], axis=1)
cat_cols = cat_df.select_dtypes(include=['object']).columns

#### clean numerical features
numerical_df = df.loc[:, ~df.columns.isin(cat_cols)]

### imputing missing values (target excluded); keep the original column names
### and index so the result lines up with cat_df in the concat below
num_features = numerical_df.loc[:, ~numerical_df.columns.isin(['y'])]
num_df_trasformed = pd.DataFrame(
    imp_mean.fit_transform(num_features),
    columns=num_features.columns,
    index=num_features.index,
)

### if not using catboost model: one-hot encode every categorical column
### except 'x33', which is label-encoded instead.
### NOTE(review): the original comment says missing categories are replaced by
### the most common class; that would have to happen inside
### helpers.create_dummy_df -- confirm.
catboost_model = True
if not catboost_model:
    one_hot_cols = cat_df.columns[cat_df.columns != 'x33']
    cat_df_transformed = helpers.create_dummy_df(cat_df, list(one_hot_cols), dummy_na=False)

    ### use label encoder for col 'x33'
    le = LabelEncoder()
    cat_df_transformed['x33'] = le.fit_transform(cat_df_transformed['x33'])

    ### save labelEncoder for test set
    with open('le_dict.pickle', 'wb') as l:
        pickle.dump(le, l, pickle.HIGHEST_PROTOCOL)
else:
    cat_df_transformed = cat_df

cleaned_data = pd.concat([num_df_trasformed, cat_df_transformed], axis=1)
cleaned_data["y"] = df['y']

In [None]:
import numpy as np
from scipy.stats import randint
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Re-attach the target: it is dropped from `cleaned_data` further down, so this
# keeps the cell re-runnable.
cleaned_data["y"] = df['y']
# Replace remaining missing values with the literal string 'NaN'
# (presumably so missing categories become their own level -- confirm).
cleaned_data = cleaned_data.fillna('NaN')
# target
y = cleaned_data['y']

# features
cleaned_data.drop(['y'], axis = 1, inplace = True)
X = cleaned_data

# Instantiate CatBoostClassifier; class 1 is weighted six times as much as class 0.
cbc = CatBoostClassifier(class_weights={0:1, 1:6})

# # Creating the hyperparameter grid
# param_dist = { "learning_rate": np.linspace(0,0.2,5),
#                "max_depth": randint(3, 10)}
# #Instantiate RandomSearchCV object
# rscv = RandomizedSearchCV(cbc , param_dist, scoring='accuracy', cv =5)


# NOTE(review): learning rates of 1 and 1.5 are unusually large, and plain
# accuracy is used for scoring despite the class weights -- confirm both.
grid = {'max_depth': [3,4,5,6], 'n_estimators':[200, 500, 1000], "learning_rate": [0.01, 0.5, 1, 1.5]}
#Instantiate GridSearchCV
gscv = GridSearchCV(estimator = cbc, param_grid = grid, scoring ='accuracy', cv = 5)


#Fit the model
# Every non-float column is passed to CatBoost as a categorical feature.
# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24, so compare against `float` directly.
categorical_features_indices = np.where(X.dtypes != float)[0]
gscv.fit(X,y,cat_features=categorical_features_indices, plot = True)

# Print the tuned parameters and score
print(gscv.best_params_)
print(gscv.best_score_)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6920371	total: 78.4ms	remaining: 15.6s
1:	learn: 0.6907762	total: 103ms	remaining: 10.2s
2:	learn: 0.6896541	total: 131ms	remaining: 8.62s
3:	learn: 0.6883990	total: 157ms	remaining: 7.71s
4:	learn: 0.6871238	total: 188ms	remaining: 7.32s
5:	learn: 0.6859754	total: 215ms	remaining: 6.94s
6:	learn: 0.6849382	total: 249ms	remaining: 6.86s
7:	learn: 0.6839284	total: 315ms	remaining: 7.57s
8:	learn: 0.6825786	total: 343ms	remaining: 7.29s
9:	learn: 0.6813067	total: 371ms	remaining: 7.06s
10:	learn: 0.6799905	total: 398ms	remaining: 6.84s
11:	learn: 0.6789976	total: 425ms	remaining: 6.66s
12:	learn: 0.6780551	total: 452ms	remaining: 6.5s
13:	learn: 0.6768268	total: 488ms	remaining: 6.48s
14:	learn: 0.6757858	total: 567ms	remaining: 6.99s
15:	learn: 0.6746968	total: 592ms	remaining: 6.81s
16:	learn: 0.6735110	total: 618ms	remaining: 6.65s
17:	learn: 0.6725651	total: 644ms	remaining: 6.51s
18:	learn: 0.6716773	total: 672ms	remaining: 6.4s
19:	learn: 0.6709985	total: 701ms	remainin

161:	learn: 0.5993213	total: 6.07s	remaining: 1.42s
162:	learn: 0.5990780	total: 6.1s	remaining: 1.38s
163:	learn: 0.5989050	total: 6.13s	remaining: 1.34s
164:	learn: 0.5986764	total: 6.15s	remaining: 1.3s
165:	learn: 0.5984509	total: 6.18s	remaining: 1.26s
166:	learn: 0.5982299	total: 6.2s	remaining: 1.23s
167:	learn: 0.5979726	total: 6.23s	remaining: 1.19s
168:	learn: 0.5977448	total: 6.3s	remaining: 1.16s
169:	learn: 0.5975186	total: 6.32s	remaining: 1.11s
170:	learn: 0.5973276	total: 6.35s	remaining: 1.08s
171:	learn: 0.5970903	total: 6.37s	remaining: 1.04s
172:	learn: 0.5968586	total: 6.4s	remaining: 999ms
173:	learn: 0.5965892	total: 6.43s	remaining: 962ms
174:	learn: 0.5963220	total: 6.47s	remaining: 924ms
175:	learn: 0.5960474	total: 6.55s	remaining: 893ms
176:	learn: 0.5958024	total: 6.58s	remaining: 855ms
177:	learn: 0.5955370	total: 6.61s	remaining: 817ms
178:	learn: 0.5952971	total: 6.64s	remaining: 779ms
179:	learn: 0.5951026	total: 6.67s	remaining: 741ms
180:	learn: 0.594

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6920047	total: 92.5ms	remaining: 18.4s
1:	learn: 0.6907567	total: 118ms	remaining: 11.7s
2:	learn: 0.6896674	total: 145ms	remaining: 9.51s
3:	learn: 0.6886391	total: 171ms	remaining: 8.38s
4:	learn: 0.6875791	total: 195ms	remaining: 7.6s
5:	learn: 0.6865298	total: 243ms	remaining: 7.87s
6:	learn: 0.6851448	total: 270ms	remaining: 7.45s
7:	learn: 0.6838300	total: 394ms	remaining: 9.46s
8:	learn: 0.6825168	total: 420ms	remaining: 8.91s
9:	learn: 0.6812308	total: 445ms	remaining: 8.45s
10:	learn: 0.6800634	total: 470ms	remaining: 8.08s
11:	learn: 0.6789543	total: 503ms	remaining: 7.89s
12:	learn: 0.6781195	total: 532ms	remaining: 7.66s
13:	learn: 0.6770744	total: 559ms	remaining: 7.43s
14:	learn: 0.6758430	total: 650ms	remaining: 8.02s
15:	learn: 0.6746382	total: 681ms	remaining: 7.83s
16:	learn: 0.6734609	total: 723ms	remaining: 7.78s
17:	learn: 0.6725707	total: 752ms	remaining: 7.6s
18:	learn: 0.6715263	total: 783ms	remaining: 7.46s
19:	learn: 0.6706513	total: 811ms	remainin

165:	learn: 0.5980387	total: 6.45s	remaining: 1.32s
166:	learn: 0.5978200	total: 6.48s	remaining: 1.28s
167:	learn: 0.5974979	total: 6.5s	remaining: 1.24s
168:	learn: 0.5972935	total: 6.57s	remaining: 1.21s
169:	learn: 0.5970096	total: 6.6s	remaining: 1.16s
170:	learn: 0.5967273	total: 6.63s	remaining: 1.12s
171:	learn: 0.5965435	total: 6.65s	remaining: 1.08s
172:	learn: 0.5963388	total: 6.68s	remaining: 1.04s
173:	learn: 0.5960929	total: 6.71s	remaining: 1s
174:	learn: 0.5958628	total: 6.73s	remaining: 962ms
175:	learn: 0.5955929	total: 6.79s	remaining: 926ms
176:	learn: 0.5953312	total: 6.82s	remaining: 886ms
177:	learn: 0.5950710	total: 6.85s	remaining: 847ms
178:	learn: 0.5947836	total: 6.88s	remaining: 807ms
179:	learn: 0.5944985	total: 6.91s	remaining: 767ms
180:	learn: 0.5943108	total: 6.93s	remaining: 728ms
181:	learn: 0.5940400	total: 6.96s	remaining: 688ms
182:	learn: 0.5937404	total: 7.08s	remaining: 658ms
183:	learn: 0.5934611	total: 7.11s	remaining: 619ms
184:	learn: 0.593

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6916782	total: 87.2ms	remaining: 17.4s
1:	learn: 0.6902345	total: 111ms	remaining: 11s
2:	learn: 0.6890242	total: 136ms	remaining: 8.9s
3:	learn: 0.6876798	total: 162ms	remaining: 7.93s
4:	learn: 0.6863088	total: 200ms	remaining: 7.81s
5:	learn: 0.6851644	total: 226ms	remaining: 7.3s
6:	learn: 0.6842087	total: 252ms	remaining: 6.95s
7:	learn: 0.6829075	total: 336ms	remaining: 8.07s
8:	learn: 0.6816271	total: 363ms	remaining: 7.7s
9:	learn: 0.6804220	total: 392ms	remaining: 7.45s
10:	learn: 0.6791779	total: 421ms	remaining: 7.23s
11:	learn: 0.6781225	total: 448ms	remaining: 7.01s
12:	learn: 0.6771597	total: 473ms	remaining: 6.81s
13:	learn: 0.6760667	total: 502ms	remaining: 6.67s
14:	learn: 0.6753478	total: 580ms	remaining: 7.15s
15:	learn: 0.6743856	total: 618ms	remaining: 7.11s
16:	learn: 0.6733653	total: 646ms	remaining: 6.95s
17:	learn: 0.6724495	total: 675ms	remaining: 6.82s
18:	learn: 0.6714765	total: 701ms	remaining: 6.68s
19:	learn: 0.6706703	total: 726ms	remaining: 

161:	learn: 0.6016352	total: 5.75s	remaining: 1.35s
162:	learn: 0.6014038	total: 5.77s	remaining: 1.31s
163:	learn: 0.6010951	total: 5.8s	remaining: 1.27s
164:	learn: 0.6008538	total: 5.82s	remaining: 1.24s
165:	learn: 0.6006824	total: 5.85s	remaining: 1.2s
166:	learn: 0.6004362	total: 5.88s	remaining: 1.16s
167:	learn: 0.6001784	total: 5.91s	remaining: 1.13s
168:	learn: 0.5999415	total: 5.95s	remaining: 1.09s
169:	learn: 0.5997088	total: 5.98s	remaining: 1.05s
170:	learn: 0.5994318	total: 6.01s	remaining: 1.02s
171:	learn: 0.5992325	total: 6.03s	remaining: 982ms
172:	learn: 0.5989679	total: 6.06s	remaining: 947ms
173:	learn: 0.5987550	total: 6.09s	remaining: 910ms
174:	learn: 0.5985692	total: 6.13s	remaining: 875ms
175:	learn: 0.5983696	total: 6.19s	remaining: 844ms
176:	learn: 0.5981136	total: 6.22s	remaining: 808ms
177:	learn: 0.5978864	total: 6.25s	remaining: 772ms
178:	learn: 0.5976972	total: 6.27s	remaining: 736ms
179:	learn: 0.5974229	total: 6.3s	remaining: 700ms
180:	learn: 0.5

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6920458	total: 108ms	remaining: 21.5s
1:	learn: 0.6905843	total: 137ms	remaining: 13.6s
2:	learn: 0.6896844	total: 162ms	remaining: 10.7s
3:	learn: 0.6884287	total: 189ms	remaining: 9.27s
4:	learn: 0.6870441	total: 214ms	remaining: 8.33s
5:	learn: 0.6857292	total: 240ms	remaining: 7.75s
6:	learn: 0.6843925	total: 267ms	remaining: 7.35s
7:	learn: 0.6833203	total: 358ms	remaining: 8.6s
8:	learn: 0.6821581	total: 396ms	remaining: 8.41s
9:	learn: 0.6811720	total: 422ms	remaining: 8.02s
10:	learn: 0.6802375	total: 448ms	remaining: 7.69s
11:	learn: 0.6790464	total: 477ms	remaining: 7.47s
12:	learn: 0.6780791	total: 505ms	remaining: 7.26s
13:	learn: 0.6771889	total: 533ms	remaining: 7.08s
14:	learn: 0.6759832	total: 597ms	remaining: 7.37s
15:	learn: 0.6747684	total: 624ms	remaining: 7.18s
16:	learn: 0.6736072	total: 650ms	remaining: 7s
17:	learn: 0.6725634	total: 677ms	remaining: 6.84s
18:	learn: 0.6715805	total: 704ms	remaining: 6.71s
19:	learn: 0.6707346	total: 731ms	remaining: 

161:	learn: 0.6004611	total: 5.84s	remaining: 1.37s
162:	learn: 0.6002081	total: 5.87s	remaining: 1.33s
163:	learn: 0.5999201	total: 5.89s	remaining: 1.29s
164:	learn: 0.5996651	total: 5.92s	remaining: 1.25s
165:	learn: 0.5993773	total: 5.95s	remaining: 1.22s
166:	learn: 0.5991150	total: 5.97s	remaining: 1.18s
167:	learn: 0.5988667	total: 6.01s	remaining: 1.14s
168:	learn: 0.5985974	total: 6.11s	remaining: 1.12s
169:	learn: 0.5983498	total: 6.14s	remaining: 1.08s
170:	learn: 0.5980678	total: 6.16s	remaining: 1.04s
171:	learn: 0.5978792	total: 6.19s	remaining: 1.01s
172:	learn: 0.5975946	total: 6.21s	remaining: 970ms
173:	learn: 0.5973438	total: 6.24s	remaining: 933ms
174:	learn: 0.5971354	total: 6.27s	remaining: 895ms
175:	learn: 0.5969780	total: 6.31s	remaining: 861ms
176:	learn: 0.5966758	total: 6.34s	remaining: 824ms
177:	learn: 0.5964410	total: 6.37s	remaining: 787ms
178:	learn: 0.5962285	total: 6.39s	remaining: 750ms
179:	learn: 0.5960735	total: 6.43s	remaining: 715ms
180:	learn: 

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6920196	total: 77.2ms	remaining: 15.4s
1:	learn: 0.6908569	total: 103ms	remaining: 10.2s
2:	learn: 0.6894980	total: 128ms	remaining: 8.42s
3:	learn: 0.6883433	total: 154ms	remaining: 7.56s
4:	learn: 0.6869969	total: 181ms	remaining: 7.05s
5:	learn: 0.6856953	total: 208ms	remaining: 6.72s
6:	learn: 0.6846489	total: 234ms	remaining: 6.46s
7:	learn: 0.6835508	total: 281ms	remaining: 6.75s
8:	learn: 0.6824302	total: 311ms	remaining: 6.59s
9:	learn: 0.6813086	total: 337ms	remaining: 6.4s
10:	learn: 0.6800463	total: 364ms	remaining: 6.25s
11:	learn: 0.6790539	total: 393ms	remaining: 6.15s
12:	learn: 0.6780895	total: 419ms	remaining: 6.03s
13:	learn: 0.6769221	total: 454ms	remaining: 6.03s
14:	learn: 0.6760137	total: 517ms	remaining: 6.38s
15:	learn: 0.6749752	total: 561ms	remaining: 6.45s
16:	learn: 0.6738312	total: 588ms	remaining: 6.33s
17:	learn: 0.6727034	total: 614ms	remaining: 6.21s
18:	learn: 0.6715891	total: 640ms	remaining: 6.09s
19:	learn: 0.6706087	total: 667ms	remaini

161:	learn: 0.6008277	total: 5.91s	remaining: 1.39s
162:	learn: 0.6006343	total: 5.94s	remaining: 1.35s
163:	learn: 0.6003381	total: 5.96s	remaining: 1.31s
164:	learn: 0.6000384	total: 6s	remaining: 1.27s
165:	learn: 0.5997731	total: 6.03s	remaining: 1.23s
166:	learn: 0.5994679	total: 6.05s	remaining: 1.2s
167:	learn: 0.5991906	total: 6.09s	remaining: 1.16s
168:	learn: 0.5988798	total: 6.17s	remaining: 1.13s
169:	learn: 0.5986160	total: 6.2s	remaining: 1.09s
170:	learn: 0.5983976	total: 6.23s	remaining: 1.06s
171:	learn: 0.5981970	total: 6.25s	remaining: 1.02s
172:	learn: 0.5979467	total: 6.28s	remaining: 980ms
173:	learn: 0.5977286	total: 6.31s	remaining: 943ms
174:	learn: 0.5974784	total: 6.34s	remaining: 906ms
175:	learn: 0.5972529	total: 6.43s	remaining: 877ms
176:	learn: 0.5969906	total: 6.47s	remaining: 840ms
177:	learn: 0.5967050	total: 6.49s	remaining: 803ms
178:	learn: 0.5964604	total: 6.52s	remaining: 765ms
179:	learn: 0.5962097	total: 6.55s	remaining: 728ms
180:	learn: 0.596

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6920371	total: 91.8ms	remaining: 45.8s
1:	learn: 0.6907762	total: 135ms	remaining: 33.6s
2:	learn: 0.6896541	total: 163ms	remaining: 27.1s
3:	learn: 0.6883990	total: 190ms	remaining: 23.5s
4:	learn: 0.6871238	total: 218ms	remaining: 21.6s
5:	learn: 0.6859754	total: 245ms	remaining: 20.2s
6:	learn: 0.6849382	total: 297ms	remaining: 20.9s
7:	learn: 0.6839284	total: 377ms	remaining: 23.2s
8:	learn: 0.6825786	total: 405ms	remaining: 22.1s
9:	learn: 0.6813067	total: 433ms	remaining: 21.2s
10:	learn: 0.6799905	total: 461ms	remaining: 20.5s
11:	learn: 0.6789976	total: 489ms	remaining: 19.9s
12:	learn: 0.6780551	total: 521ms	remaining: 19.5s
13:	learn: 0.6768268	total: 551ms	remaining: 19.1s
14:	learn: 0.6757858	total: 646ms	remaining: 20.9s
15:	learn: 0.6746968	total: 673ms	remaining: 20.3s
16:	learn: 0.6735110	total: 700ms	remaining: 19.9s
17:	learn: 0.6725651	total: 730ms	remaining: 19.5s
18:	learn: 0.6716773	total: 760ms	remaining: 19.2s
19:	learn: 0.6709985	total: 791ms	remain

161:	learn: 0.5993213	total: 6.04s	remaining: 12.6s
162:	learn: 0.5990780	total: 6.07s	remaining: 12.5s
163:	learn: 0.5989050	total: 6.09s	remaining: 12.5s
164:	learn: 0.5986764	total: 6.12s	remaining: 12.4s
165:	learn: 0.5984509	total: 6.15s	remaining: 12.4s
166:	learn: 0.5982299	total: 6.17s	remaining: 12.3s
167:	learn: 0.5979726	total: 6.2s	remaining: 12.3s
168:	learn: 0.5977448	total: 6.29s	remaining: 12.3s
169:	learn: 0.5975186	total: 6.32s	remaining: 12.3s
170:	learn: 0.5973276	total: 6.35s	remaining: 12.2s
171:	learn: 0.5970903	total: 6.37s	remaining: 12.2s
172:	learn: 0.5968586	total: 6.4s	remaining: 12.1s
173:	learn: 0.5965892	total: 6.43s	remaining: 12s
174:	learn: 0.5963220	total: 6.47s	remaining: 12s
175:	learn: 0.5960474	total: 6.55s	remaining: 12.1s
176:	learn: 0.5958024	total: 6.57s	remaining: 12s
177:	learn: 0.5955370	total: 6.6s	remaining: 11.9s
178:	learn: 0.5952971	total: 6.62s	remaining: 11.9s
179:	learn: 0.5951026	total: 6.65s	remaining: 11.8s
180:	learn: 0.5947757

322:	learn: 0.5718033	total: 11.7s	remaining: 6.44s
323:	learn: 0.5716825	total: 11.8s	remaining: 6.4s
324:	learn: 0.5715582	total: 11.8s	remaining: 6.35s
325:	learn: 0.5714183	total: 11.8s	remaining: 6.31s
326:	learn: 0.5712754	total: 11.9s	remaining: 6.27s
327:	learn: 0.5711967	total: 11.9s	remaining: 6.23s
328:	learn: 0.5711101	total: 11.9s	remaining: 6.19s
329:	learn: 0.5709722	total: 12s	remaining: 6.18s
330:	learn: 0.5708196	total: 12s	remaining: 6.13s
331:	learn: 0.5707193	total: 12s	remaining: 6.09s
332:	learn: 0.5706089	total: 12.1s	remaining: 6.05s
333:	learn: 0.5704682	total: 12.1s	remaining: 6.02s
334:	learn: 0.5703354	total: 12.1s	remaining: 5.98s
335:	learn: 0.5702092	total: 12.2s	remaining: 5.94s
336:	learn: 0.5700994	total: 12.3s	remaining: 5.93s
337:	learn: 0.5699595	total: 12.3s	remaining: 5.89s
338:	learn: 0.5698441	total: 12.3s	remaining: 5.85s
339:	learn: 0.5697085	total: 12.4s	remaining: 5.81s
340:	learn: 0.5695950	total: 12.4s	remaining: 5.77s
341:	learn: 0.56948

483:	learn: 0.5578285	total: 17.5s	remaining: 579ms
484:	learn: 0.5577236	total: 17.6s	remaining: 543ms
485:	learn: 0.5576660	total: 17.6s	remaining: 506ms
486:	learn: 0.5576323	total: 17.6s	remaining: 470ms
487:	learn: 0.5575909	total: 17.6s	remaining: 434ms
488:	learn: 0.5575117	total: 17.7s	remaining: 397ms
489:	learn: 0.5574679	total: 17.7s	remaining: 361ms
490:	learn: 0.5574155	total: 17.8s	remaining: 326ms
491:	learn: 0.5573520	total: 17.8s	remaining: 290ms
492:	learn: 0.5572965	total: 17.9s	remaining: 254ms
493:	learn: 0.5572574	total: 17.9s	remaining: 217ms
494:	learn: 0.5572046	total: 17.9s	remaining: 181ms
495:	learn: 0.5571484	total: 18s	remaining: 145ms
496:	learn: 0.5570999	total: 18s	remaining: 109ms
497:	learn: 0.5570506	total: 18.1s	remaining: 72.6ms
498:	learn: 0.5569450	total: 18.1s	remaining: 36.3ms
499:	learn: 0.5568908	total: 18.1s	remaining: 0us


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6920047	total: 107ms	remaining: 53.3s
1:	learn: 0.6907567	total: 133ms	remaining: 33.1s
2:	learn: 0.6896674	total: 160ms	remaining: 26.4s
3:	learn: 0.6886391	total: 185ms	remaining: 23s
4:	learn: 0.6875791	total: 209ms	remaining: 20.6s
5:	learn: 0.6865298	total: 236ms	remaining: 19.4s
6:	learn: 0.6851448	total: 264ms	remaining: 18.6s
7:	learn: 0.6838300	total: 345ms	remaining: 21.2s
8:	learn: 0.6825168	total: 373ms	remaining: 20.4s
9:	learn: 0.6812308	total: 400ms	remaining: 19.6s
10:	learn: 0.6800634	total: 428ms	remaining: 19s
11:	learn: 0.6789543	total: 456ms	remaining: 18.5s
12:	learn: 0.6781195	total: 486ms	remaining: 18.2s
13:	learn: 0.6770744	total: 513ms	remaining: 17.8s
14:	learn: 0.6758430	total: 616ms	remaining: 19.9s
15:	learn: 0.6746382	total: 659ms	remaining: 19.9s
16:	learn: 0.6734609	total: 687ms	remaining: 19.5s
17:	learn: 0.6725707	total: 715ms	remaining: 19.1s
18:	learn: 0.6715263	total: 743ms	remaining: 18.8s
19:	learn: 0.6706513	total: 769ms	remaining: 

161:	learn: 0.5991838	total: 5.84s	remaining: 12.2s
162:	learn: 0.5988840	total: 5.87s	remaining: 12.1s
163:	learn: 0.5986683	total: 5.89s	remaining: 12.1s
164:	learn: 0.5983335	total: 5.92s	remaining: 12s
165:	learn: 0.5980387	total: 5.95s	remaining: 12s
166:	learn: 0.5978200	total: 5.97s	remaining: 11.9s
167:	learn: 0.5974979	total: 6.01s	remaining: 11.9s
168:	learn: 0.5972935	total: 6.09s	remaining: 11.9s
169:	learn: 0.5970096	total: 6.12s	remaining: 11.9s
170:	learn: 0.5967273	total: 6.14s	remaining: 11.8s
171:	learn: 0.5965435	total: 6.17s	remaining: 11.8s
172:	learn: 0.5963388	total: 6.2s	remaining: 11.7s
173:	learn: 0.5960929	total: 6.22s	remaining: 11.7s
174:	learn: 0.5958628	total: 6.25s	remaining: 11.6s
175:	learn: 0.5955929	total: 6.31s	remaining: 11.6s
176:	learn: 0.5953312	total: 6.34s	remaining: 11.6s
177:	learn: 0.5950710	total: 6.37s	remaining: 11.5s
178:	learn: 0.5947836	total: 6.39s	remaining: 11.5s
179:	learn: 0.5944985	total: 6.42s	remaining: 11.4s
180:	learn: 0.594

322:	learn: 0.5712023	total: 11.3s	remaining: 6.17s
323:	learn: 0.5710325	total: 11.3s	remaining: 6.14s
324:	learn: 0.5708751	total: 11.3s	remaining: 6.1s
325:	learn: 0.5707605	total: 11.3s	remaining: 6.06s
326:	learn: 0.5706410	total: 11.4s	remaining: 6.01s
327:	learn: 0.5705093	total: 11.4s	remaining: 5.97s
328:	learn: 0.5704368	total: 11.4s	remaining: 5.94s
329:	learn: 0.5703174	total: 11.5s	remaining: 5.92s
330:	learn: 0.5702111	total: 11.5s	remaining: 5.88s
331:	learn: 0.5701038	total: 11.6s	remaining: 5.84s
332:	learn: 0.5700095	total: 11.6s	remaining: 5.8s
333:	learn: 0.5698985	total: 11.6s	remaining: 5.76s
334:	learn: 0.5697392	total: 11.6s	remaining: 5.72s
335:	learn: 0.5696038	total: 11.7s	remaining: 5.69s
336:	learn: 0.5694593	total: 11.7s	remaining: 5.67s
337:	learn: 0.5693224	total: 11.8s	remaining: 5.63s
338:	learn: 0.5691653	total: 11.8s	remaining: 5.59s
339:	learn: 0.5690360	total: 11.8s	remaining: 5.56s
340:	learn: 0.5689457	total: 11.8s	remaining: 5.52s
341:	learn: 0.

483:	learn: 0.5567254	total: 16.8s	remaining: 555ms
484:	learn: 0.5566638	total: 16.8s	remaining: 520ms
485:	learn: 0.5565968	total: 16.8s	remaining: 485ms
486:	learn: 0.5565180	total: 16.9s	remaining: 450ms
487:	learn: 0.5564383	total: 16.9s	remaining: 415ms
488:	learn: 0.5563742	total: 16.9s	remaining: 381ms
489:	learn: 0.5562871	total: 16.9s	remaining: 346ms
490:	learn: 0.5562250	total: 17s	remaining: 312ms
491:	learn: 0.5561581	total: 17s	remaining: 277ms
492:	learn: 0.5561083	total: 17.1s	remaining: 242ms
493:	learn: 0.5560525	total: 17.1s	remaining: 208ms
494:	learn: 0.5560066	total: 17.2s	remaining: 173ms
495:	learn: 0.5559602	total: 17.2s	remaining: 139ms
496:	learn: 0.5558940	total: 17.2s	remaining: 104ms
497:	learn: 0.5558543	total: 17.3s	remaining: 69.6ms
498:	learn: 0.5557806	total: 17.4s	remaining: 34.8ms
499:	learn: 0.5557467	total: 17.4s	remaining: 0us


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6916782	total: 97.1ms	remaining: 48.4s
1:	learn: 0.6902345	total: 133ms	remaining: 33.1s
2:	learn: 0.6890242	total: 157ms	remaining: 25.9s
3:	learn: 0.6876798	total: 181ms	remaining: 22.4s
4:	learn: 0.6863088	total: 206ms	remaining: 20.4s
5:	learn: 0.6851644	total: 232ms	remaining: 19.1s
6:	learn: 0.6842087	total: 257ms	remaining: 18.1s
7:	learn: 0.6829075	total: 317ms	remaining: 19.5s
8:	learn: 0.6816271	total: 343ms	remaining: 18.7s
9:	learn: 0.6804220	total: 376ms	remaining: 18.4s
10:	learn: 0.6791779	total: 401ms	remaining: 17.8s
11:	learn: 0.6781225	total: 429ms	remaining: 17.4s
12:	learn: 0.6771597	total: 455ms	remaining: 17s
13:	learn: 0.6760667	total: 483ms	remaining: 16.8s
14:	learn: 0.6753478	total: 573ms	remaining: 18.5s
15:	learn: 0.6743856	total: 599ms	remaining: 18.1s
16:	learn: 0.6733653	total: 625ms	remaining: 17.7s
17:	learn: 0.6724495	total: 652ms	remaining: 17.5s
18:	learn: 0.6714765	total: 679ms	remaining: 17.2s
19:	learn: 0.6706703	total: 711ms	remainin

161:	learn: 0.6016352	total: 5.59s	remaining: 11.7s
162:	learn: 0.6014038	total: 5.62s	remaining: 11.6s
163:	learn: 0.6010951	total: 5.64s	remaining: 11.6s
164:	learn: 0.6008538	total: 5.67s	remaining: 11.5s
165:	learn: 0.6006824	total: 5.69s	remaining: 11.5s
166:	learn: 0.6004362	total: 5.73s	remaining: 11.4s
167:	learn: 0.6001784	total: 5.76s	remaining: 11.4s
168:	learn: 0.5999415	total: 5.83s	remaining: 11.4s
169:	learn: 0.5997088	total: 5.86s	remaining: 11.4s
170:	learn: 0.5994318	total: 5.88s	remaining: 11.3s
171:	learn: 0.5992325	total: 5.91s	remaining: 11.3s
172:	learn: 0.5989679	total: 5.93s	remaining: 11.2s
173:	learn: 0.5987550	total: 5.98s	remaining: 11.2s
174:	learn: 0.5985692	total: 6s	remaining: 11.2s
175:	learn: 0.5983696	total: 6.07s	remaining: 11.2s
176:	learn: 0.5981136	total: 6.09s	remaining: 11.1s
177:	learn: 0.5978864	total: 6.12s	remaining: 11.1s
178:	learn: 0.5976972	total: 6.15s	remaining: 11s
179:	learn: 0.5974229	total: 6.17s	remaining: 11s
180:	learn: 0.59723

322:	learn: 0.5745905	total: 11.4s	remaining: 6.26s
323:	learn: 0.5744868	total: 11.4s	remaining: 6.22s
324:	learn: 0.5743482	total: 11.5s	remaining: 6.18s
325:	learn: 0.5742759	total: 11.5s	remaining: 6.14s
326:	learn: 0.5741440	total: 11.5s	remaining: 6.1s
327:	learn: 0.5740335	total: 11.6s	remaining: 6.06s
328:	learn: 0.5739369	total: 11.6s	remaining: 6.02s
329:	learn: 0.5738561	total: 11.7s	remaining: 6.02s
330:	learn: 0.5737733	total: 11.7s	remaining: 5.97s
331:	learn: 0.5737088	total: 11.7s	remaining: 5.93s
332:	learn: 0.5735741	total: 11.8s	remaining: 5.89s
333:	learn: 0.5734619	total: 11.8s	remaining: 5.85s
334:	learn: 0.5733342	total: 11.8s	remaining: 5.81s
335:	learn: 0.5732580	total: 11.8s	remaining: 5.77s


In [None]:
def clean_train_data(df, remove_cols, data_type = "train", label_encoder_path = "le_dict.pickle", catboost_model = True):
    """Clean a raw data frame into model-ready features.

    Steps: drop `remove_cols`, turn 'x7' (percent strings) and 'x19' (dollar
    strings) into floats, normalise 'x3' through `helpers.clean_day_x3_col`,
    impute the numerical columns with the module-level `imp_mean` imputer, and
    optionally encode the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns 'x3', 'x7' and 'x19'.
        The input frame is not modified.
    remove_cols : list
        Column names to drop before any other processing.
    data_type : str, default "train"
        "train" fits the imputer (and the label encoder) on `df`; any other
        value re-uses the already fitted imputer and loads the label encoder
        from `label_encoder_path`.
    label_encoder_path : str, default "le_dict.pickle"
        Where the fitted LabelEncoder for 'x33' is saved to / loaded from.
    catboost_model : bool, default True
        If True, categorical columns are returned as-is. If False, they are
        one-hot encoded via `helpers.create_dummy_df`, except 'x33', which is
        label-encoded.

    Returns
    -------
    cat_cols : pandas.Index
        Names of the categorical (object-dtype) columns after cleaning.
    cleaned_data : pandas.DataFrame
        Imputed numerical features followed by the categorical features, plus
        the target 'y' when `df` contains it.
    """
    # Use the frame that was passed in (copied so the caller's data stays intact)
    # and honour `remove_cols` instead of module-level globals.
    df = df.copy()
    df = df.loc[:, ~df.columns.isin(remove_cols)]

    #### find categorical features
    cat_df = df.select_dtypes(include=['object'])

    #### clean col x7 and x19; regex=False because '$' is a regex anchor and the
    #### default of `regex` differs between pandas versions
    cat_df['x7'] = cat_df['x7'].str.replace('%', '', regex=False)
    cat_df['x19'] = cat_df['x19'].str.replace('$', '', regex=False)
    cat_df = cat_df.astype({'x7': 'float', 'x19': 'float'})

    #### clean x3 col
    cat_df['x3'] = cat_df['x3'].map(helpers.clean_day_x3_col)

    ### convert x7 and x19 back to numerical fts
    df['x7'] = cat_df['x7']
    df['x19'] = cat_df['x19']

    #### categorical cols (x7/x19 are numeric now and leave the categorical frame)
    cat_df = cat_df.drop(['x7', 'x19'], axis=1)
    cat_cols = cat_df.select_dtypes(include=['object']).columns

    #### clean numerical features
    numerical_df = df.loc[:, ~df.columns.isin(cat_cols)]
    num_features = numerical_df.loc[:, ~numerical_df.columns.isin(['y'])]

    ### imputing missing values: fit only on training data; other data must be
    ### transformed with the imputer fitted on the training set
    if data_type == "train":
        imputed_values = imp_mean.fit_transform(num_features)
    else:
        imputed_values = imp_mean.transform(num_features)
    # Keep column names and index so the concat below aligns row by row.
    num_df_trasformed = pd.DataFrame(
        imputed_values,
        columns=num_features.columns,
        index=num_features.index,
    )

    ### if not using catboost model: one-hot encode all categorical columns
    ### except 'x33', which gets a label encoder
    if not catboost_model:
        one_hot_cols = cat_df.columns[cat_df.columns != 'x33']
        cat_df_transformed = helpers.create_dummy_df(cat_df, list(one_hot_cols), dummy_na=False)

        if data_type == "train":
            ### fit the label encoder for col 'x33'
            le = LabelEncoder()
            cat_df_transformed['x33'] = le.fit_transform(cat_df_transformed['x33'])

            ### save labelEncoder for test set
            with open(label_encoder_path, 'wb') as l:
                pickle.dump(le, l, pickle.HIGHEST_PROTOCOL)
        else:
            # NOTE: pickle.load executes arbitrary code; only load encoder
            # files produced by this pipeline.
            with open(label_encoder_path, 'rb') as f:
                le = pickle.load(f)
            # Encode only the 'x33' column; the rest of the frame is kept.
            cat_df_transformed['x33'] = le.transform(cat_df_transformed['x33'])
    else:
        cat_df_transformed = cat_df

    cleaned_data = pd.concat([num_df_trasformed, cat_df_transformed], axis=1)
    # The target is only attached when the input actually has one.
    if 'y' in df.columns:
        cleaned_data["y"] = df['y']

    return cat_cols, cleaned_data
