In [50]:
### basic package for data science project
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import helpers

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set()

In [49]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
imp_mean = IterativeImputer(random_state=0)

1. Remove columns with missing value more than 75% 
2. Remove col 'x39' and 'x99' since those two columns don't bring additional information 
3. Impute NAN for missing values in numerical methods 
4. We can use Catboost or Imputing categorical features is to replace missing values with the most common class. 


Alternative (if time permitted):
    - Check each features to see if there exists outliers.

In [12]:
train_data = pd.read_csv('data/exercise_40_train.csv')
test_data = pd.read_csv('data/exercise_40_test.csv')

In [20]:
most_missing_cols = list(set(train_data.columns[train_data.isnull().mean() > 0.4]))
remove_cat_col = ['x39', 'x99']
remove_cols = most_missing_cols + remove_cat_col

In [62]:
df = train_data.copy()
df = df.loc[:, (~df.columns.isin(most_missing_cols + remove_cat_col))]

#### find catergorical features
cat_df = df.select_dtypes(include=['object'])

#### clean col x7 and x19
cat_df['x7'] = cat_df['x7'].str.replace('%','')
cat_df["x19"] = cat_df['x19'].str.replace('$','')
cat_df = cat_df.astype({'x7': 'float', 'x19': 'float'})

#### clean x3 col
cat_df['x3'] = cat_df.apply(lambda x: helpers.clean_day_x3_col(x['x3']), axis = 1)

### convert x7 and x19 back to numerical fts 
df['x7'] =  cat_df['x7']
df['x19'] = cat_df["x19"]

#### categorical cols 
cat_cols = cat_df.select_dtypes(include=['object']).columns
cat_df.drop(['x7','x19'], axis = 1, inplace = True)

#### clean numerical features 
numerical_df = df.loc[:, (~df.columns.isin(cat_cols))]

### imputing missing values 
num_df_trasformed = pd.DataFrame(imp_mean.fit_transform(numerical_df.loc[:, ~(numerical_df.columns.isin(["y"]))]))

### if not using catboost model, then imputing categorical features 
### by replacing missing values with the most common class
### use label encoder for col 'x33' and one-hot encoder for the rest
catboost_model = True
if catboost_model == False:
    one_hot_cols = cat_df.columns[cat_df.columns != 'x33']  
    cat_df_transformed = create_dummy_df(cat_df, list(one_hot_cols), dummy_na= False)
    
    ### user label encoder for col 'x33'
    le = LabelEncoder()
    cat_df_transformed['x33'] = le.fit_transform(cat_df_transformed['x33'])
    
    ### save labelEncoder for test set
    with open('le_dict.pickle', 'wb') as l:
        pickle.dump(le, l, pickle.HIGHEST_PROTOCOL)
else:
    cat_df_transformed =  cat_df
        


In [116]:
# cleaned_data = pd.concat([num_df_trasformed, cat_df_transformed], axis = 1)
cleaned_data["y"] = df['y']
cleaned_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,x65_geico,x65_progressive,x77_chevrolet,x77_ford,x77_mercedes,x77_nissan,x77_subaru,x77_toyota,x93_yes,y
0,0.165254,18.060003,1.07738,-1.339233,-1.584341,0.0062,0.220784,1.816481,1.171788,109.626841,...,0,0,0,0,1,0,0,0,0,0
1,2.441471,18.416307,1.482586,0.920817,-0.759931,0.0064,1.192441,3.51395,1.4199,84.079367,...,0,0,0,0,1,0,0,0,0,1
2,4.427278,19.188092,0.145652,0.366093,0.709962,-0.0008,0.952323,0.782974,-1.247022,95.375221,...,1,0,0,0,0,0,1,0,0,1
3,3.925235,19.901257,1.763602,-0.251926,-0.827461,-0.0057,-0.520756,1.825586,2.223038,96.420382,...,1,0,0,0,0,1,0,0,0,0
4,2.868802,22.202473,3.405119,0.083162,1.381504,0.0109,-0.732739,2.15199,-0.275406,90.769952,...,1,0,0,0,0,0,0,1,1,0


In [117]:
import numpy as np
from scipy.stats import randint
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

# target
y = cleaned_data['y']

# features
cleaned_data.drop(['y'], axis = 1, inplace = True)
X = np.array(cleaned_data)

#Instantiate CatBoostClassifier
cbc = CatBoostClassifier()

# Creating the hyperparameter grid
param_dist = { "learning_rate": np.linspace(0,0.2,5),
               "max_depth": randint(3, 10)}
               
#Instantiate RandomSearchCV object
rscv = RandomizedSearchCV(cbc , param_dist, scoring='accuracy', cv =5)

#Fit the model
rscv.fit(X,y)

# Print the tuned parameters and score
print(rscv.best_params_)
print(rscv.best_score_)

0:	learn: 0.5914223	total: 243ms	remaining: 4m 2s
1:	learn: 0.5254269	total: 276ms	remaining: 2m 17s
2:	learn: 0.4785233	total: 314ms	remaining: 1m 44s
3:	learn: 0.4453913	total: 348ms	remaining: 1m 26s
4:	learn: 0.4178965	total: 391ms	remaining: 1m 17s
5:	learn: 0.4036135	total: 441ms	remaining: 1m 13s
6:	learn: 0.3876468	total: 488ms	remaining: 1m 9s
7:	learn: 0.3770334	total: 519ms	remaining: 1m 4s
8:	learn: 0.3678717	total: 551ms	remaining: 1m
9:	learn: 0.3614766	total: 583ms	remaining: 57.7s
10:	learn: 0.3571263	total: 617ms	remaining: 55.4s
11:	learn: 0.3520285	total: 650ms	remaining: 53.5s
12:	learn: 0.3488597	total: 682ms	remaining: 51.8s
13:	learn: 0.3450000	total: 718ms	remaining: 50.5s
14:	learn: 0.3424745	total: 753ms	remaining: 49.5s
15:	learn: 0.3401973	total: 787ms	remaining: 48.4s
16:	learn: 0.3381940	total: 823ms	remaining: 47.6s
17:	learn: 0.3364835	total: 858ms	remaining: 46.8s
18:	learn: 0.3348802	total: 904ms	remaining: 46.7s
19:	learn: 0.3332099	total: 948ms	remai

162:	learn: 0.2163610	total: 5.95s	remaining: 30.5s
163:	learn: 0.2155528	total: 5.98s	remaining: 30.5s
164:	learn: 0.2148413	total: 6.01s	remaining: 30.4s
165:	learn: 0.2145576	total: 6.04s	remaining: 30.4s
166:	learn: 0.2140736	total: 6.08s	remaining: 30.3s
167:	learn: 0.2136060	total: 6.11s	remaining: 30.3s
168:	learn: 0.2129513	total: 6.14s	remaining: 30.2s
169:	learn: 0.2121336	total: 6.19s	remaining: 30.2s
170:	learn: 0.2112293	total: 6.24s	remaining: 30.2s
171:	learn: 0.2104098	total: 6.27s	remaining: 30.2s
172:	learn: 0.2097964	total: 6.3s	remaining: 30.1s
173:	learn: 0.2090795	total: 6.33s	remaining: 30.1s
174:	learn: 0.2081188	total: 6.37s	remaining: 30s
175:	learn: 0.2073741	total: 6.4s	remaining: 30s
176:	learn: 0.2063363	total: 6.43s	remaining: 29.9s
177:	learn: 0.2056926	total: 6.47s	remaining: 29.9s
178:	learn: 0.2051464	total: 6.5s	remaining: 29.8s
179:	learn: 0.2045428	total: 6.54s	remaining: 29.8s
180:	learn: 0.2037934	total: 6.57s	remaining: 29.7s
181:	learn: 0.20337

327:	learn: 0.1345187	total: 11.6s	remaining: 23.8s
328:	learn: 0.1340024	total: 11.6s	remaining: 23.7s
329:	learn: 0.1337302	total: 11.7s	remaining: 23.7s
330:	learn: 0.1333819	total: 11.7s	remaining: 23.6s
331:	learn: 0.1329903	total: 11.7s	remaining: 23.6s
332:	learn: 0.1326929	total: 11.8s	remaining: 23.5s
333:	learn: 0.1322148	total: 11.8s	remaining: 23.5s
334:	learn: 0.1317500	total: 11.8s	remaining: 23.5s
335:	learn: 0.1313612	total: 11.9s	remaining: 23.4s
336:	learn: 0.1311044	total: 11.9s	remaining: 23.4s
337:	learn: 0.1303929	total: 11.9s	remaining: 23.4s
338:	learn: 0.1300043	total: 12s	remaining: 23.3s
339:	learn: 0.1297585	total: 12s	remaining: 23.3s
340:	learn: 0.1294312	total: 12s	remaining: 23.3s
341:	learn: 0.1290616	total: 12.1s	remaining: 23.2s
342:	learn: 0.1284151	total: 12.1s	remaining: 23.2s
343:	learn: 0.1279897	total: 12.2s	remaining: 23.2s
344:	learn: 0.1275027	total: 12.2s	remaining: 23.1s
345:	learn: 0.1271688	total: 12.2s	remaining: 23.1s
346:	learn: 0.1270

487:	learn: 0.0827604	total: 17.2s	remaining: 18.1s
488:	learn: 0.0825526	total: 17.3s	remaining: 18s
489:	learn: 0.0823621	total: 17.3s	remaining: 18s
490:	learn: 0.0821300	total: 17.3s	remaining: 17.9s
491:	learn: 0.0818845	total: 17.3s	remaining: 17.9s
492:	learn: 0.0815154	total: 17.4s	remaining: 17.9s
493:	learn: 0.0811755	total: 17.4s	remaining: 17.8s
494:	learn: 0.0810252	total: 17.4s	remaining: 17.8s
495:	learn: 0.0809315	total: 17.5s	remaining: 17.8s
496:	learn: 0.0807153	total: 17.5s	remaining: 17.7s
497:	learn: 0.0806481	total: 17.5s	remaining: 17.7s
498:	learn: 0.0803967	total: 17.6s	remaining: 17.6s
499:	learn: 0.0801856	total: 17.6s	remaining: 17.6s
500:	learn: 0.0798584	total: 17.6s	remaining: 17.6s
501:	learn: 0.0795283	total: 17.7s	remaining: 17.5s
502:	learn: 0.0793197	total: 17.7s	remaining: 17.5s
503:	learn: 0.0790644	total: 17.8s	remaining: 17.5s
504:	learn: 0.0787948	total: 17.8s	remaining: 17.4s
505:	learn: 0.0785670	total: 17.8s	remaining: 17.4s
506:	learn: 0.07

649:	learn: 0.0541105	total: 22.9s	remaining: 12.3s
650:	learn: 0.0539937	total: 22.9s	remaining: 12.3s
651:	learn: 0.0537852	total: 23s	remaining: 12.3s
652:	learn: 0.0536808	total: 23s	remaining: 12.2s
653:	learn: 0.0535724	total: 23s	remaining: 12.2s
654:	learn: 0.0534254	total: 23.1s	remaining: 12.2s
655:	learn: 0.0533447	total: 23.1s	remaining: 12.1s
656:	learn: 0.0532381	total: 23.1s	remaining: 12.1s
657:	learn: 0.0531302	total: 23.2s	remaining: 12s
658:	learn: 0.0530515	total: 23.2s	remaining: 12s
659:	learn: 0.0529194	total: 23.2s	remaining: 12s
660:	learn: 0.0528499	total: 23.3s	remaining: 11.9s
661:	learn: 0.0527848	total: 23.3s	remaining: 11.9s
662:	learn: 0.0527004	total: 23.3s	remaining: 11.9s
663:	learn: 0.0525776	total: 23.4s	remaining: 11.8s
664:	learn: 0.0524774	total: 23.4s	remaining: 11.8s
665:	learn: 0.0523461	total: 23.4s	remaining: 11.8s
666:	learn: 0.0521401	total: 23.5s	remaining: 11.7s
667:	learn: 0.0519821	total: 23.5s	remaining: 11.7s
668:	learn: 0.0518188	to

812:	learn: 0.0361251	total: 28.6s	remaining: 6.58s
813:	learn: 0.0359901	total: 28.6s	remaining: 6.54s
814:	learn: 0.0358418	total: 28.7s	remaining: 6.5s
815:	learn: 0.0357395	total: 28.7s	remaining: 6.47s
816:	learn: 0.0356777	total: 28.7s	remaining: 6.43s
817:	learn: 0.0356066	total: 28.8s	remaining: 6.4s
818:	learn: 0.0354879	total: 28.8s	remaining: 6.36s
819:	learn: 0.0353523	total: 28.8s	remaining: 6.33s
820:	learn: 0.0352510	total: 28.9s	remaining: 6.29s
821:	learn: 0.0351882	total: 28.9s	remaining: 6.25s
822:	learn: 0.0351296	total: 28.9s	remaining: 6.22s
823:	learn: 0.0350353	total: 29s	remaining: 6.19s
824:	learn: 0.0349529	total: 29s	remaining: 6.15s
825:	learn: 0.0349063	total: 29s	remaining: 6.12s
826:	learn: 0.0348107	total: 29.1s	remaining: 6.08s
827:	learn: 0.0347265	total: 29.1s	remaining: 6.04s
828:	learn: 0.0346336	total: 29.1s	remaining: 6.01s
829:	learn: 0.0345840	total: 29.2s	remaining: 5.97s
830:	learn: 0.0345082	total: 29.2s	remaining: 5.94s
831:	learn: 0.034427

973:	learn: 0.0249561	total: 34.2s	remaining: 913ms
974:	learn: 0.0249046	total: 34.2s	remaining: 878ms
975:	learn: 0.0248393	total: 34.3s	remaining: 843ms
976:	learn: 0.0247864	total: 34.3s	remaining: 808ms
977:	learn: 0.0247276	total: 34.3s	remaining: 773ms
978:	learn: 0.0246550	total: 34.4s	remaining: 738ms
979:	learn: 0.0245959	total: 34.4s	remaining: 702ms
980:	learn: 0.0245161	total: 34.5s	remaining: 667ms
981:	learn: 0.0244965	total: 34.5s	remaining: 632ms
982:	learn: 0.0244863	total: 34.5s	remaining: 597ms
983:	learn: 0.0244443	total: 34.5s	remaining: 562ms
984:	learn: 0.0244162	total: 34.6s	remaining: 527ms
985:	learn: 0.0243252	total: 34.6s	remaining: 491ms
986:	learn: 0.0242732	total: 34.7s	remaining: 456ms
987:	learn: 0.0242092	total: 34.7s	remaining: 421ms
988:	learn: 0.0241482	total: 34.7s	remaining: 386ms
989:	learn: 0.0240939	total: 34.8s	remaining: 351ms
990:	learn: 0.0240313	total: 34.8s	remaining: 316ms
991:	learn: 0.0240037	total: 34.8s	remaining: 281ms
992:	learn: 

134:	learn: 0.2284113	total: 5.01s	remaining: 32.1s
135:	learn: 0.2277212	total: 5.04s	remaining: 32s
136:	learn: 0.2271651	total: 5.08s	remaining: 32s
137:	learn: 0.2266079	total: 5.12s	remaining: 32s
138:	learn: 0.2256187	total: 5.15s	remaining: 31.9s
139:	learn: 0.2251904	total: 5.18s	remaining: 31.9s
140:	learn: 0.2243698	total: 5.22s	remaining: 31.8s
141:	learn: 0.2237429	total: 5.26s	remaining: 31.8s
142:	learn: 0.2229464	total: 5.29s	remaining: 31.7s
143:	learn: 0.2224583	total: 5.33s	remaining: 31.7s
144:	learn: 0.2215395	total: 5.37s	remaining: 31.7s
145:	learn: 0.2211777	total: 5.4s	remaining: 31.6s
146:	learn: 0.2203749	total: 5.44s	remaining: 31.6s
147:	learn: 0.2198278	total: 5.47s	remaining: 31.5s
148:	learn: 0.2189041	total: 5.51s	remaining: 31.5s
149:	learn: 0.2186255	total: 5.54s	remaining: 31.4s
150:	learn: 0.2180825	total: 5.58s	remaining: 31.4s
151:	learn: 0.2176067	total: 5.61s	remaining: 31.3s
152:	learn: 0.2170347	total: 5.65s	remaining: 31.3s
153:	learn: 0.21637

KeyboardInterrupt: 

In [None]:
def clean_train_data(df, remove_cols, label_encoder_path = "", catboost_model = True):
    
    pass
    return


def clean_test_data(df):
    pass 
    return

In [None]:
def extract_features(df):