In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import re

## Loading the data

In [2]:
# Load Training data using Pandas
train = pd.read_csv('train.csv')
print(train.shape)
train.head()

(595212, 59)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [3]:
# Load Test data using Pandas
test = pd.read_csv('test.csv')
print(test.shape)
test.head()

(892816, 58)


Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,...,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,...,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,...,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,...,4,0,0,4,0,1,1,0,0,1


## Pre-processing

In [4]:
#-1 indicates a missing value per data description
#Make a copy of the training dataset and replace those values with with NaN
train_copy = train
train_copy = train_copy.replace(-1, np.NaN)

In [5]:
# Check for missing (null) values
print(train_copy.isnull().sum())

id                     0
target                 0
ps_ind_01              0
ps_ind_02_cat        216
ps_ind_03              0
ps_ind_04_cat         83
ps_ind_05_cat       5809
ps_ind_06_bin          0
ps_ind_07_bin          0
ps_ind_08_bin          0
ps_ind_09_bin          0
ps_ind_10_bin          0
ps_ind_11_bin          0
ps_ind_12_bin          0
ps_ind_13_bin          0
ps_ind_14              0
ps_ind_15              0
ps_ind_16_bin          0
ps_ind_17_bin          0
ps_ind_18_bin          0
ps_reg_01              0
ps_reg_02              0
ps_reg_03         107772
ps_car_01_cat        107
ps_car_02_cat          5
ps_car_03_cat     411231
ps_car_04_cat          0
ps_car_05_cat     266551
ps_car_06_cat          0
ps_car_07_cat      11489
ps_car_08_cat          0
ps_car_09_cat        569
ps_car_10_cat          0
ps_car_11_cat          0
ps_car_11              5
ps_car_12              1
ps_car_13              0
ps_car_14          42620
ps_car_15              0
ps_calc_01             0


In [6]:
#Do the same for the test data
test_copy = test
test_copy = test_copy.replace(-1, np.NaN)
print(test_copy.isnull().sum())

id                     0
ps_ind_01              0
ps_ind_02_cat        307
ps_ind_03              0
ps_ind_04_cat        145
ps_ind_05_cat       8710
ps_ind_06_bin          0
ps_ind_07_bin          0
ps_ind_08_bin          0
ps_ind_09_bin          0
ps_ind_10_bin          0
ps_ind_11_bin          0
ps_ind_12_bin          0
ps_ind_13_bin          0
ps_ind_14              0
ps_ind_15              0
ps_ind_16_bin          0
ps_ind_17_bin          0
ps_ind_18_bin          0
ps_reg_01              0
ps_reg_02              0
ps_reg_03         161684
ps_car_01_cat        160
ps_car_02_cat          5
ps_car_03_cat     616911
ps_car_04_cat          0
ps_car_05_cat     400359
ps_car_06_cat          0
ps_car_07_cat      17331
ps_car_08_cat          0
ps_car_09_cat        877
ps_car_10_cat          0
ps_car_11_cat          0
ps_car_11              1
ps_car_12              0
ps_car_13              0
ps_car_14          63805
ps_car_15              0
ps_calc_01             0
ps_calc_02             0


In [7]:
# Drop variables that are >40% missing
train_copy.drop('ps_car_03_cat', axis=1, inplace=True)
train_copy.drop('ps_car_05_cat', axis=1, inplace=True)

In [8]:
#Same for test set
test_copy.drop('ps_car_03_cat', axis=1, inplace=True)
test_copy.drop('ps_car_05_cat', axis=1, inplace=True)

In [9]:
#Confirm removed
'ps_car_03_cat'in train_copy.columns

False

In [10]:
'ps_car_05_cat'in train_copy.columns

False

In [11]:
'ps_car_03_cat'in test_copy.columns

False

In [12]:
'ps_car_05_cat'in test_copy.columns

False

In [13]:
#Impute missing values for remaining varaibles
#First make a copy
train_imp = train_copy.copy()
test_imp = test_copy.copy()

In [14]:
#Make new columns to indicate what was imputed
cols_with_missing = (col for col in train_imp.columns 
    if train_imp[col].isnull().any())
for col in cols_with_missing:
    train_imp[col + '_was_missing'] = train_imp[col].isnull()
    
cols_with_missing = (col for col in test_imp.columns 
    if test_imp[col].isnull().any())
for col in cols_with_missing:
    test_imp[col + '_was_missing'] = test_imp[col].isnull()

In [15]:
# Impute missing values
train_imputed = train_imp.interpolate()

test_imputed = test_imp.interpolate()

In [16]:
#Confirm imputed
print(train_imputed.isnull().sum())

id                           0
target                       0
ps_ind_01                    0
ps_ind_02_cat                0
ps_ind_03                    0
ps_ind_04_cat                0
ps_ind_05_cat                0
ps_ind_06_bin                0
ps_ind_07_bin                0
ps_ind_08_bin                0
ps_ind_09_bin                0
ps_ind_10_bin                0
ps_ind_11_bin                0
ps_ind_12_bin                0
ps_ind_13_bin                0
ps_ind_14                    0
ps_ind_15                    0
ps_ind_16_bin                0
ps_ind_17_bin                0
ps_ind_18_bin                0
ps_reg_01                    0
ps_reg_02                    0
ps_reg_03                    0
ps_car_01_cat                0
ps_car_02_cat                0
ps_car_04_cat                0
ps_car_06_cat                0
ps_car_07_cat                0
ps_car_08_cat                0
ps_car_09_cat                0
                            ..
ps_calc_02                   0
ps_calc_

In [17]:
print(test_imputed.isnull().sum())

id                           0
ps_ind_01                    0
ps_ind_02_cat                0
ps_ind_03                    0
ps_ind_04_cat                0
ps_ind_05_cat                0
ps_ind_06_bin                0
ps_ind_07_bin                0
ps_ind_08_bin                0
ps_ind_09_bin                0
ps_ind_10_bin                0
ps_ind_11_bin                0
ps_ind_12_bin                0
ps_ind_13_bin                0
ps_ind_14                    0
ps_ind_15                    0
ps_ind_16_bin                0
ps_ind_17_bin                0
ps_ind_18_bin                0
ps_reg_01                    0
ps_reg_02                    0
ps_reg_03                    0
ps_car_01_cat                0
ps_car_02_cat                0
ps_car_04_cat                0
ps_car_06_cat                0
ps_car_07_cat                0
ps_car_08_cat                0
ps_car_09_cat                0
ps_car_10_cat                0
                            ..
ps_calc_01                   0
ps_calc_

In [18]:
# Split target from features and create modelling sets
y, X_train = train_imputed['target'], train_imputed.iloc[:,2:57]

In [35]:
X_test, id = test_imputed.iloc[:,1:56], test_imputed['id']

In [20]:
X_train.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,2,2.0,5,1.0,0.0,0,1,0,0,0,...,9,1,5,8,0,1,1,0,0,1
1,1,1.0,7,0.0,0.0,0,0,1,0,0,...,3,1,1,9,0,1,1,0,1,0
2,5,4.0,9,1.0,0.0,0,0,1,0,0,...,4,2,7,7,0,1,1,0,1,0
3,0,1.0,2,0.0,0.0,1,0,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,0,2.0,0,1.0,0.0,1,0,0,0,0,...,3,1,1,3,0,0,0,1,1,0


## Modeling

In [77]:
#import packages needed to create classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.externals import joblib
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve

In [51]:
#Declare data preprocessing steps
pipeline = make_pipeline(preprocessing.StandardScaler(),
                        RandomForestClassifier(n_estimators=100, class_weight = "balanced"))

In [52]:
pipeline.get_params().keys()

dict_keys(['steps', 'standardscaler', 'randomforestclassifier', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'randomforestclassifier__bootstrap', 'randomforestclassifier__class_weight', 'randomforestclassifier__criterion', 'randomforestclassifier__max_depth', 'randomforestclassifier__max_features', 'randomforestclassifier__max_leaf_nodes', 'randomforestclassifier__min_impurity_split', 'randomforestclassifier__min_samples_leaf', 'randomforestclassifier__min_samples_split', 'randomforestclassifier__min_weight_fraction_leaf', 'randomforestclassifier__n_estimators', 'randomforestclassifier__n_jobs', 'randomforestclassifier__oob_score', 'randomforestclassifier__random_state', 'randomforestclassifier__verbose', 'randomforestclassifier__warm_start'])

In [63]:
#Declare hyperparameters to tune
hyperparameters = {'randomforestclassifier__max_features':['auto','sqrt','log2'],
                  'randomforestclassifier__max_depth':[None,5,3,1]}

In [54]:
#Tune model using cross-validation pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

In [55]:
#Fit the model
clf.fit(X_train,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2'], 'randomforestclassifier__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [56]:
#Evaluate model pipeline on test data
pred = clf.predict(X_test)

In [41]:
#print r2_score(y_test, pred)
#print mean_squared_error(y_test, pred)

In [57]:
#Save model for use in the future
joblib.dump(clf,'rf_regressor.pkl')
#To load
#clf2 = loblib.load('rf_regressor.pkl')

['rf_regressor.pkl']

In [58]:
Final = pd.DataFrame(data=id, index=None, columns=None, dtype=None, copy=False)

In [59]:
Final = Final.assign(target=pred)
print(Final.shape)
Final.head()

(892816, 2)


Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [60]:
Final.to_csv(path_or_buf='PS_Submission.csv', sep=',', na_rep='', float_format=None, columns=None, header=True, index=False, index_label=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.')