In [1]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle

from lightgbm import LGBMClassifier

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the matplotlib 'ggplot' style first, then seaborn's 'white' style on top of it.
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
# Default figure size as (width, height); set last so the style calls above cannot override it.
pylab.rcParams[ 'figure.figsize' ] = 8 , 6



In [2]:
# Read the train & test sample csv files into DataFrames
# (the frames hold the `target` / `ps_*` columns shown in the cells below).
train = pd.read_csv("../input/train_sample.csv")
test    = pd.read_csv("../input/test_sample.csv")

# Sanity check: (rows, columns) of each frame.
print (train.shape, test.shape)

((1000, 59), (500, 58))


In [3]:
# Peek at the first rows of the training frame.
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,772439,1,2,1,7,0,0,1,0,0,...,6,2,3,6,0,1,1,1,0,0
1,850639,0,0,1,6,0,0,1,0,0,...,2,1,2,7,0,0,1,1,1,0
2,460993,0,7,2,7,0,0,0,0,0,...,7,1,1,6,0,1,1,0,0,0
3,963471,0,0,2,1,1,0,1,0,0,...,7,2,3,9,0,1,0,1,0,1
4,670715,0,4,2,1,1,4,0,0,1,...,10,0,3,5,0,1,1,0,1,0


In [4]:
# Summary statistics for the columns of the training frame.
train.describe()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,755315.5,0.044,1.878,1.369,4.457,0.401,0.373,0.395,0.275,0.154,...,5.497,1.476,2.84,7.565,0.13,0.627,0.564,0.309,0.33,0.168
std,424903.5,0.205198,1.969516,0.674755,2.710627,0.490346,1.322732,0.489095,0.446738,0.361129,...,2.305495,1.182718,1.716201,2.700568,0.336472,0.483844,0.496135,0.462312,0.470448,0.374053
min,3624.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,373500.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,763798.5,0.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,...,5.0,1.0,3.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,1110637.0,0.0,3.0,2.0,7.0,1.0,0.0,1.0,1.0,0.0,...,7.0,2.0,4.0,9.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1487276.0,1.0,7.0,4.0,11.0,1.0,6.0,1.0,1.0,1.0,...,15.0,7.0,9.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0


## Checking for missing values

In [5]:

# Count NaN entries per column in each split and list only the columns that have any.
# NOTE(review): train.describe() shows a minimum of -1 for ps_ind_05_cat, so missing
# values may be encoded as -1 rather than NaN and would not be counted here -- confirm.
null_counts = [frame.isnull().sum() for frame in (train, test)]
NAs = pd.concat(null_counts, axis=1, keys=['Train', 'Test'])
NAs.loc[NAs.sum(axis=1) > 0]

Unnamed: 0,Train,Test


In [6]:
# Column dtypes of the training frame as loaded from csv.
train.dtypes

id                  int64
target              int64
ps_ind_01           int64
ps_ind_02_cat       int64
ps_ind_03           int64
ps_ind_04_cat       int64
ps_ind_05_cat       int64
ps_ind_06_bin       int64
ps_ind_07_bin       int64
ps_ind_08_bin       int64
ps_ind_09_bin       int64
ps_ind_10_bin       int64
ps_ind_11_bin       int64
ps_ind_12_bin       int64
ps_ind_13_bin       int64
ps_ind_14           int64
ps_ind_15           int64
ps_ind_16_bin       int64
ps_ind_17_bin       int64
ps_ind_18_bin       int64
ps_reg_01         float64
ps_reg_02         float64
ps_reg_03         float64
ps_car_01_cat       int64
ps_car_02_cat       int64
ps_car_03_cat       int64
ps_car_04_cat       int64
ps_car_05_cat       int64
ps_car_06_cat       int64
ps_car_07_cat       int64
ps_car_08_cat       int64
ps_car_09_cat       int64
ps_car_10_cat       int64
ps_car_11_cat       int64
ps_car_11           int64
ps_car_12         float64
ps_car_13         float64
ps_car_14         float64
ps_car_15   

## Concatenating train and test

In [7]:
# Keep the labels and the test ids aside WITHOUT mutating `train`: selecting the
# column (instead of `train.pop`) lets this cell be re-run on the same kernel,
# whereas popping would raise KeyError the second time.
train_labels = train['target']
test_id = test['id']

# Stack both splits under an outer index level ('train' / 'test') so they go
# through the same preprocessing and can be separated again with `.loc`.
features = pd.concat([train.drop('target', axis=1), test], keys=['train', 'test'])
features.shape

(1500, 58)

## Converting categorical variables' type to str

In [8]:
# Columns whose name ends in "cat" are cast to str so that they show up with
# object dtype.
categorical_columns = [name for name in features.columns if name.endswith("cat")]
for col in categorical_columns:
    features[col] = features[col].astype(str)

In [9]:
# Column dtypes of the combined frame; the *_cat columns should now be object.
features.dtypes

id                  int64
ps_ind_01           int64
ps_ind_02_cat      object
ps_ind_03           int64
ps_ind_04_cat      object
ps_ind_05_cat      object
ps_ind_06_bin       int64
ps_ind_07_bin       int64
ps_ind_08_bin       int64
ps_ind_09_bin       int64
ps_ind_10_bin       int64
ps_ind_11_bin       int64
ps_ind_12_bin       int64
ps_ind_13_bin       int64
ps_ind_14           int64
ps_ind_15           int64
ps_ind_16_bin       int64
ps_ind_17_bin       int64
ps_ind_18_bin       int64
ps_reg_01         float64
ps_reg_02         float64
ps_reg_03         float64
ps_car_01_cat      object
ps_car_02_cat      object
ps_car_03_cat      object
ps_car_04_cat      object
ps_car_05_cat      object
ps_car_06_cat      object
ps_car_07_cat      object
ps_car_08_cat      object
ps_car_09_cat      object
ps_car_10_cat      object
ps_car_11_cat      object
ps_car_11           int64
ps_car_12         float64
ps_car_13         float64
ps_car_14         float64
ps_car_15         float64
ps_calc_01  

## One Hot encoding of categorical variables

In [10]:
# Getting Dummies from all categorical vars
# All object columns are encoded in a single call instead of popping and
# re-concatenating the frame once per column.
object_columns = list(features.dtypes[features.dtypes == 'object'].index)
if object_columns:  # nothing to do when the cell is re-run on an encoded frame
    dummies = pd.get_dummies(features[object_columns], columns=object_columns)
    # Recent pandas versions return boolean indicator columns; cast them to
    # uint8 so a numeric-only column selection does not silently drop every
    # one-hot feature.
    dummies = dummies.astype(np.uint8)
    # Same layout as before: untouched columns first, then the indicator
    # columns of each categorical variable, prefixed with its name.
    features = pd.concat([features.drop(object_columns, axis=1), dummies], axis=1)

In [11]:
# (rows, columns) of the combined frame after one-hot encoding.
features.shape

(1500, 221)

## Splitting train and test variables

In [12]:
### Splitting features
def _numeric_matrix(split):
    """Return the numeric columns of one split ('train' or 'test') as an array, without `id`."""
    rows = features.loc[split]
    return rows.drop('id', axis=1).select_dtypes(include=[np.number]).values


train_features = _numeric_matrix('train')
test_features = _numeric_matrix('test')

## Stacking

In [27]:
class EnsembleStack(object):
    """Two-level stacking ensemble for binary classification.

    Each base model contributes one meta-feature: its predicted probability
    for the positive class (second column of ``predict_proba``). The stacker
    is trained on those meta-features.

    The training meta-features are *out-of-fold* predictions: every training
    row is scored by a copy of the base model that did not see that row while
    fitting. Scoring the training rows with a model fitted on those same rows
    would leak the target into the stacker's inputs and make them look far
    more reliable than the test meta-features, which always come from unseen
    rows.

    Parameters
    ----------
    stacker : estimator with ``fit(X, y)`` and ``predict_proba(X)``
        Second-level model.
    base_models : sequence of estimators with ``fit`` and ``predict_proba``
        First-level models. They are fitted in place.
    n_folds : int, default 5
        Number of folds used to build the out-of-fold meta-features. Each
        base model is fitted ``n_folds + 1`` times.
    seed : int, default 0
        Seed of the shuffling used when assigning rows to folds.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2.
    """

    def __init__(self, stacker, base_models, n_folds=5, seed=0):
        if n_folds < 2:
            raise ValueError("n_folds must be at least 2, got %r" % (n_folds,))
        self.stacker = stacker
        self.base_models = base_models
        self.n_folds = n_folds
        self.seed = seed

    def _assign_folds(self, y):
        """Return a fold id in [0, n_folds) per row, spread evenly within each class."""
        rng = np.random.RandomState(self.seed)
        fold_of = np.empty(y.shape[0], dtype=int)
        for label in np.unique(y):
            members = np.flatnonzero(y == label)
            rng.shuffle(members)
            # Round-robin over the shuffled members keeps the class ratio
            # roughly equal across folds, which matters for a rare positive class.
            fold_of[members] = np.arange(members.shape[0]) % self.n_folds
        return fold_of

    def fit_predict(self, train_features, train_target, test_features):
        """Fit base models and stacker, then return positive-class probabilities.

        Parameters
        ----------
        train_features : array-like, one row per training sample
        train_target : array-like of labels, one per training sample
        test_features : array-like, one row per test sample

        Returns
        -------
        numpy.ndarray
            Stacker probability of the positive class, one value per test row.
        """
        X = np.array(train_features)
        y = np.array(train_target)
        T = np.array(test_features)

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        fold_of = self._assign_folds(y)

        for i, clf in enumerate(self.base_models):
            # Out-of-fold meta-features for the training rows.
            for fold in range(self.n_folds):
                held_out = fold_of == fold
                if not held_out.any():
                    # More folds than rows in every class: nothing to score here.
                    continue
                clf.fit(X[~held_out], y[~held_out])
                S_train[held_out, i] = clf.predict_proba(X[held_out])[:, 1]

            # Refit on all training rows for the test meta-features; this also
            # leaves the base model fitted on the full training set.
            clf.fit(X, y)
            S_test[:, i] = clf.predict_proba(T)[:, 1]

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:, 1]
        return res
        

## Modelling

In [14]:
# LightGBM params
# Keyword arguments for the first LightGBM base model.
lgb_params = {
    'learning_rate': 0.02,
    'n_estimators': 650,
    'max_bin': 10,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 500,
    'seed': 99,
}

In [15]:
# Keyword arguments for the second LightGBM base model.
lgb_params2 = {
    'n_estimators': 1090,
    'learning_rate': 0.02,
    'colsample_bytree': 0.3,
    'subsample': 0.7,
    'subsample_freq': 2,
    'num_leaves': 16,
    'seed': 99,
}

In [16]:
# Keyword arguments for the third LightGBM base model.
lgb_params3 = {
    'n_estimators': 1100,
    'max_depth': 4,
    'learning_rate': 0.02,
    'seed': 99,
}

In [17]:
# One LightGBM classifier per parameter set, in the order the sets were defined.
lgb_model, lgb_model2, lgb_model3 = [
    LGBMClassifier(**params)
    for params in (lgb_params, lgb_params2, lgb_params3)
]

In [18]:
# Logistic regression with default settings; it is passed to EnsembleStack as the stacker.
log_model = LogisticRegression()

In [31]:
# Stack the three LightGBM models under the logistic-regression stacker.
stack = EnsembleStack(
    stacker=log_model,
    base_models=(lgb_model, lgb_model2, lgb_model3),
)

In [32]:
# Fit the whole stack on the training data and score the test rows.
test_y = stack.fit_predict(
    train_features=train_features,
    train_target=train_labels,
    test_features=test_features,
)

In [33]:
# Shape of the prediction vector returned by the stack.
test_y.shape

(500,)

## Submission

In [30]:
# Pair every test id with its predicted positive-class probability.
submission_columns = {'id': test_id, 'target': test_y}
test_submit = pd.DataFrame(submission_columns)
# Bare expressions: only the last expression of a cell is displayed, so these show nothing.
test_submit.shape
test_submit.head()
# Write the submission file without the DataFrame index column.
test_submit.to_csv('safe_driver_rf.csv', index=False)

## History

- Benchmark: RF with the stock data (no data manipulation). Gini score: 0.184
- Converted categorical variables to one-hot encoding. Gini score: 0.194
- Increased number of trees in RF to 2000. Gini score: 0.227

## Remarks

- Training takes a lot of time
- Probabilities for target = 1 are in the second column of the `predict_proba` output
- Train outside of Jupyter and add `del` statements to remove unnecessary data