# Porto Seguro’s Safe Driver Prediction

I used Felipe Antunes' code as a starter pack: https://github.com/felipeeeantunes/udacity_live

## Initializing

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import gc
from time import time
from multiprocessing import *

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import set_matplotlib_formats
# Notebook-wide display defaults: figure formats, two-decimal floats in
# pandas output, and one matplotlib rc dictionary handed to seaborn.
set_matplotlib_formats('pdf', 'png')
pd.options.display.float_format = '{:.2f}'.format

rc = {
    'savefig.dpi': 75,
    'figure.autolayout': False,
    'figure.figsize': [12, 8],
    'axes.labelsize': 18,
    'axes.titlesize': 18,
    'font.size': 18,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
}
sns.set(style='dark', rc=rc)

In [None]:
default_color = '#56B4E9'
colormap = plt.cm.cool

In [None]:
# Setting working directory
path = '../data/raw/'

## Loading Files

In [None]:
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

In [None]:
y = train['target']
del train['target']

In [None]:
y.head(5)

In [None]:
id_train = train['id'].values
id_test = test['id'].values

In [None]:
columns_original = list(train.columns)
columns_original

In [None]:
train.head(5)

### Transforming the -1 'null' placeholders into np.nan

In [None]:
# Turn the -1 placeholders into real missing values so that pandas and
# missingno can detect them.  `np.nan` is spelled in lower case because the
# `np.NaN` alias no longer exists in NumPy 2.0.
train = train.replace(-1, np.nan)
test = test.replace(-1, np.nan)

## Target Analysis

In [None]:
# Bar chart of the class counts in the target; every bar is labelled with its
# share of all rows as a percentage.
plt.figure(figsize=(5,5))

ax = sns.countplot(x=y, color=default_color)
for p in ax.patches:
    # The label sits 0.3 to the right of the bar's left edge and 10000 above its top.
    ax.annotate('{:.2f}%'.format(100*p.get_height()/len(y)), (p.get_x()+ 0.3, p.get_height()+10000))

## Data Analysis

In [None]:
def get_meta(train):
    """Build a per-column metadata table for a feature data frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are described.  Column names drive most of the
        classification (``'bin'``/``'cat'`` substrings, ``'id'``,
        ``'target'``, and the ``_ind_``/``_reg_``/``_car_``/``_calc_``
        group tags).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``varname`` with the columns ``role``, ``level``,
        ``keep``, ``dtype`` and ``source``.
    """
    rows = []
    for col in train.columns:
        col_dtype = train[col].dtype

        # Role of the variable in the modelling problem.
        if col in ('target', 'id'):
            role = col
        else:
            role = 'input'

        # Measurement level.  Name-based rules win; otherwise fall back to the
        # dtype.  Dtype *families* are tested (not exactly float64/int64) so
        # int32/float32 columns are classified as well, and every column gets
        # a level instead of inheriting the previous column's value.
        if 'bin' in col or col == 'target':
            level = 'binary'
        elif 'cat' in col or col == 'id':
            level = 'nominal'
        elif pd.api.types.is_float_dtype(col_dtype):
            level = 'interval'
        elif pd.api.types.is_integer_dtype(col_dtype):
            level = 'ordinal'
        elif pd.api.types.is_bool_dtype(col_dtype):
            level = 'binary'
        else:
            # Strings, objects, datetimes, ...: treat as unordered labels.
            level = 'nominal'

        # Every variable is kept except the row identifier.
        keep = col != 'id'

        # Feature group encoded in the column name.  The order mirrors the
        # original precedence: reg/car/calc override ind, and 'id' is the
        # fallback when no tag is present.
        if '_reg_' in col:
            source = 'reg'
        elif '_car_' in col:
            source = 'car'
        elif '_calc_' in col:
            source = 'calc'
        elif '_ind_' in col:
            source = 'ind'
        else:
            source = 'id'

        rows.append({
            'varname': col,
            'role'   : role,
            'level'  : level,
            'keep'   : keep,
            'dtype'  : col_dtype,
            'source' : source,
        })

    meta = pd.DataFrame(rows, columns=['varname', 'role', 'level', 'keep', 'dtype', 'source'])
    return meta.set_index('varname')
        

In [None]:
meta_data = get_meta(train)
meta_data

In [None]:
# Number of variables per (role, level) pair.  The count ends up in the
# 'dtype' column because that is the column being counted.
meta_counts = (
    meta_data
    .groupby(['role', 'level'])['dtype']
    .count()
    .reset_index()
)
meta_counts

In [None]:
fig,ax = plt.subplots()
fig.set_size_inches(7,5)
sns.barplot(data=meta_counts[(meta_counts.role != 'target') & (meta_counts.role != 'id') ],x="level",y="dtype",ax=ax,color=default_color)
ax.set(xlabel='Variable Type', ylabel='Count',title="Variables Count Across Datatype")

In [None]:
# Column names grouped by measurement level, restricted to variables flagged keep=True
# (the binary group additionally excludes the target role).
# NOTE(review): `col_internval` looks like a misspelling of "interval"; the name is left
# unchanged because the correlation heatmap cell reads exactly this identifier.
col_ordinal   = meta_data[(meta_data.level == 'ordinal') & (meta_data.keep)].index
col_nominal   = meta_data[(meta_data.level == 'nominal') & (meta_data.keep)].index
col_internval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)].index
col_binary    = meta_data[(meta_data.level == 'binary') & (meta_data.keep) & (meta_data.role != 'target')].index

In [None]:
msno.dendrogram(train[columns_original],figsize=(20,20))

### Missing Values

In [None]:
missingValueColumns = train.columns[train.isnull().any()].tolist()
df_null = train[missingValueColumns]

In [None]:
msno.bar(df_null,figsize=(20,8),color=default_color,fontsize=18,labels=True)

### Correlations between missing Values

In [None]:
msno.heatmap(df_null,figsize=(10,10),cmap=colormap)

In [None]:
msno.dendrogram(df_null,figsize=(10,8))

In [None]:
sorted_data = msno.nullity_sort(df_null, sort='descending') # or sort='ascending'
msno.matrix(sorted_data,figsize=(20,8),fontsize=14)

### Continuous Features Analysis

In [None]:
plt.figure(figsize=(18,16))
plt.title('Pearson correlation of continuous features', y=1.05, size=15)
sns.heatmap(train[col_internval].corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)

### Use -1 instead of NaN

In [None]:
train = train.fillna(-1)
test = test.fillna(-1)

### Simple Baseline RF Model and Feature Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
conf_rf_featimp = {
    'n_estimators': 200,
    'max_depth': 6,
    'min_samples_leaf': 10,
    'max_features': 0.2,
    'n_jobs': -1,
    'random_state': 0
}

In [None]:
rf_featimp = RandomForestClassifier(**conf_rf_featimp)

In [None]:
# Fit the baseline forest and report the wall-clock time.  An explicit
# "%.2f" conversion is required here: a bare "% " followed by "secs" is
# parsed as a "%s" conversion and prints "<number>ecs.".
t0 = time()
rf_featimp.fit(train, y)
print("----- Training Time: %.2f secs. -----" % (time()-t0))

In [None]:
features = columns_original

In [None]:
def get_feature_importance_df(feature_importances,
                              column_names,
                              top_n=25):
    """Return the highest-ranked features as a two-column data frame.

    Parameters
    ----------
    feature_importances : numpy ndarray
        Importance scores produced by an ensemble model such as a random
        forest or a boosting model.
    column_names : array-like
        Feature names, in the same order as ``feature_importances``.
    top_n : integer
        How many of the best features to keep.

    Returns
    -------
    df : a Pandas data frame
        Columns ``feature`` and ``importance``, sorted by descending
        importance.

    """
    # Name -> score lookup; a repeated name keeps its last score.
    score_by_name = dict(zip(column_names, feature_importances))

    # Stable descending sort, so ties stay in their original column order.
    ranked = sorted(score_by_name.items(),
                    key=lambda pair: pair[1],
                    reverse=True)[:top_n]

    names = [name for name, _ in ranked]
    scores = [score for _, score in ranked]
    return pd.DataFrame(data={'feature': names, 'importance': scores})

In [None]:
feature_importance = get_feature_importance_df(rf_featimp.feature_importances_, features)

In [None]:
feature_importance

In [None]:
# Bar chart of the ranked importances; tick labels are rotated by 45 degrees
# so the feature names do not overlap.
fig, ax = plt.subplots(figsize=(20, 10))
g = sns.barplot(
    data=feature_importance,
    x="feature",
    y="importance",
    ax=ax,
    color=default_color,
)
plt.setp(g.get_xticklabels(), rotation=45)
ax.set(xlabel='Variable name', ylabel='Importance', title="Variable importances")

## Cross Val function and Other Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [None]:
def cross_val_model(X, y, model, n_splits=3):
    """Estimate the out-of-fold ROC AUC of `model` with stratified K-fold CV.

    For each fold the model is fitted on the training part and scored on the
    held-out part.  The score is computed from the fitted model's predicted
    probabilities, so it measures the model that was actually trained on the
    fold rather than a fresh clone refitted on the hold-out rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted with ``np.array``.
    y : array-like of shape (n_samples,)
        Binary target; converted with ``np.array``.
    model : estimator
        Classifier exposing ``fit`` and ``predict_proba``.  It is refitted in
        place on every fold.
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    float
        Mean ROC AUC over the folds.  Each printed line also shows
        ``2 * AUC - 1`` in parentheses.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.metrics import roc_auc_score

    X = np.array(X)
    y = np.array(y)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    model_name = str(model).split('(')[0]

    fold_scores = []
    t0 = time()

    for j, (train_idx, test_idx) in enumerate(splitter.split(X, y)):
        print("Fit %s fold %d" % (model_name, j+1))
        model.fit(X[train_idx], y[train_idx])

        # Column 1 of predict_proba is the probability of the second class
        # (the positive class for a 0/1 target).
        holdout_proba = model.predict_proba(X[test_idx])[:, 1]
        cross_score = roc_auc_score(y[test_idx], holdout_proba)
        print("    [%10d secs elapsed]: cross_score: %.5f (%.5f)" % (time()-t0, cross_score, cross_score*2-1))
        fold_scores.append(cross_score)

    cross_score_mean = float(np.mean(fold_scores))
    print("cross_score_mean: %.5f (%.5f)" % (cross_score_mean, cross_score_mean*2-1))
    return cross_score_mean

#### Cross Val - Random Forest

In [None]:
# Random-forest settings used for the cross-validation runs.  The seed is
# fixed (same value as conf_rf_featimp) so that score differences between
# feature sets are not caused by the forest's own randomness.
conf_rf_model = {
    'n_estimators': 200,
    'max_depth': 6,
    'min_samples_split': 70,
    'min_samples_leaf': 30,
    'n_jobs': -1,
    'random_state': 0,
}

In [None]:
rf_model = RandomForestClassifier(**conf_rf_model)

In [None]:
cross_val_model(train, y, rf_model)

#### Cross Val - XGBoost

In [None]:
# parameters from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
# Keyword arguments for the XGBClassifier that is built from this dict.
conf_xgb_model = {
    'n_estimators': 200,
    'max_depth': 4,
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'nthread': 2,
    'min_child_weight': 100
}

In [None]:
xgb_model = XGBClassifier(**conf_xgb_model)

In [None]:
cross_val_model(train, y, xgb_model)

#### Cross Val - LGBM

In [None]:
# Keyword arguments for the LGBMClassifier that is built from this dict.
# 'feature_fraction' is given once: 'sub_feature' is another name for the
# same LightGBM parameter, and passing both is redundant.
conf_lgb_model = {
    'boosting_type': 'gbdt',
    'n_estimators': 200,
    'max_depth': 4,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 3,
}

In [None]:
lgb_model = LGBMClassifier(**conf_lgb_model)

In [None]:
cross_val_model(train, y, lgb_model)

## Feature Engineering & Selection

In [None]:
# Selected features from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
# The trailing comment on each entry is copied from that kernel; presumably it is the
# feature's importance score followed by the score of its "shadow" counterpart — confirm
# against the kernel before relying on the numbers.
selected_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
    "ps_car_11_cat" # Very nice spot from Tilii : https://www.kaggle.com/tilii7
]

In [None]:
cross_val_model(train[selected_features], y, rf_model)

In [None]:
cross_val_model(train[selected_features], y, xgb_model)

In [None]:
cross_val_model(train[selected_features], y, lgb_model)

### Adding Feature Combinations

In [None]:
train.head(5)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Pairwise feature combinations, taken from
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]
start = time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f' % (name1, n_c + 1, (time() - start) / 60), end='')
    print('\r' * 75, end='')
    # Join the two source values of every row into one string key.
    for frame in (train, test):
        frame[name1] = frame[f1].map(str) + "_" + frame[f2].map(str)
    # Learn the labels on train and test together so both frames share one
    # mapping, then overwrite the string keys with their integer codes.
    lbl = LabelEncoder()
    lbl.fit(train[name1].tolist() + test[name1].tolist())
    for frame in (train, test):
        frame[name1] = lbl.transform(frame[name1].tolist())

In [None]:
new_features = [f1 + '_plus_' + f2 for (f1, f2) in combs]
selected_features.extend(new_features)
new_features

In [None]:
train.head(5)

In [None]:
selected_features

In [None]:
cross_val_model(train[selected_features], y, rf_model)

In [None]:
cross_val_model(train[selected_features], y, xgb_model)

In [None]:
cross_val_model(train[selected_features], y, lgb_model)

### Reconstructing ps_reg_03

In [None]:
### from Pascal's (https://www.kaggle.com/pnagel/reconstruction-of-ps-reg-03)
def recon(reg):
    """Split a ``ps_reg_03`` value into the integer pair ``(A, M)``.

    The value is mapped to ``round((40 * reg) ** 2)`` and decomposed as
    ``31 * M + A`` with ``A`` in ``1..31`` (an exact multiple of 31 yields
    ``A == 31``, never 0).

    Returns
    -------
    tuple of (int, int)
        ``(A, M)``.
    """
    squared = int(np.round((40*reg)**2))
    # Residue modulo 31, shifted into 1..31.
    remainder = squared % 31
    A = remainder if remainder else 31
    M = (squared - A)//31
    return A, M

# Derive ps_reg_A / ps_reg_M from ps_reg_03 in both frames.
# Rows whose ps_reg_03 is missing (NaN or the -1 placeholder) get -1 in both
# new columns, with the same marker in train and test.  The mask is built
# from ps_reg_03 itself, so rows whose reconstruction legitimately equals the
# values produced by the placeholder (A == 19, M == 51) are left untouched.
for frame in (train, test):
    reg_values = frame['ps_reg_03']
    is_missing = reg_values.isna() | (reg_values == -1)
    # recon() runs once per row; missing rows are skipped because it cannot
    # convert NaN to an integer.
    pairs = [(-1, -1) if missing else recon(value)
             for value, missing in zip(reg_values, is_missing)]
    frame['ps_reg_A'] = [a for a, _ in pairs]
    frame['ps_reg_M'] = [m for _, m in pairs]

In [None]:
new_features = ['ps_reg_A', 'ps_reg_M']
selected_features.extend(new_features)
selected_features.remove('ps_reg_03')

In [None]:
selected_features

In [None]:
cross_val_model(train[selected_features], y, rf_model)

In [None]:
cross_val_model(train[selected_features], y, xgb_model)

In [None]:
cross_val_model(train[selected_features], y, lgb_model)

In [None]:
for x in ['ps_reg_A', 'ps_reg_M']: selected_features.remove(x)
selected_features.append('ps_reg_03')

In [None]:
selected_features

### One-Hot and Categorical Target Encoding

In [None]:
# Switch back from the -1 placeholder to real missing values before counting
# the distinct values per feature.  `np.nan` is spelled in lower case because
# the `np.NaN` alias no longer exists in NumPy 2.0.
train = train.replace(-1, np.nan)
test = test.replace(-1, np.nan)

In [None]:
# Distinct-value count per selected feature (NaN counts as one value),
# displayed from the highest to the lowest cardinality.
one_hot = {c: train[c].nunique(dropna=False) for c in selected_features}
sorted(((count, name) for name, count in one_hot.items()), reverse=True)

In [None]:
train.head(10)

In [None]:
one_hot_lt_than_5_unique = { k:v for k, v in one_hot.items() if v < 5 }
one_hot_me_than_5_unique = { k:v for k, v in one_hot.items() if v >= 5}
one_hot_me_than_5_unique_cat = { k:v for k, v in one_hot.items() if v >= 5 and 'cat' in k}
one_hot_lt_than_5_unique, one_hot_me_than_5_unique, one_hot_me_than_5_unique_cat

In [None]:
def OHE_by_unique(train, one_hot, limit):
    """One-hot encode the low-cardinality columns of `train`.

    Parameters
    ----------
    train : pandas.DataFrame
        Input frame; it is copied, never modified.
    one_hot : dict
        Maps a column name to the list of its distinct values.
    limit : int
        A column is encoded when it has more than 2 and at most `limit`
        distinct values.

    Returns
    -------
    pandas.DataFrame
        Copy of `train` with one extra 0/1 column named
        ``<column>_oh_<value>`` per distinct value of every encoded column.
        The name of each encoded column is printed.
    """
    df = train.copy()
    for c, values in one_hot.items():
        if not 2 < len(values) <= limit:
            continue
        for val in values:
            if pd.isnull(val):
                # NaN never compares equal to anything, so the missing
                # category has to be matched explicitly.
                indicator = df[c].isnull()
            else:
                indicator = df[c] == val
            # Builtin int: the np.int alias no longer exists in NumPy >= 1.24.
            df[c + '_oh_' + str(val)] = indicator.astype(int)
        print(c)
    return df

In [None]:
one_hot_values = {c: list(train[c].unique()) for c in selected_features}
list(one_hot_values.items())[:2]

In [313]:
one_hot.values()

dict_values([14, 5, 850, 3, 3, 100, 2, 2, 2, 8, 5, 2, 2, 18, 13, 5, 6, 10, 7, 12, 2, 19, 5013, 24, 184, 8, 10, 8, 15, 3, 2, 3, 2, 104, 2, 3, 70482])

In [None]:
train.head(5)

In [None]:
oh_train = OHE_by_unique(train, one_hot_values, 3)
oh_test = OHE_by_unique(test, one_hot_values, 3)

In [None]:
oh_onehotted_columns = ['ps_car_03_cat','ps_car_07_cat','ps_car_02_cat','ps_ind_04_cat','ps_car_05_cat','ps_car_03_cat','ps_car_07_cat','ps_car_02_cat','ps_ind_04_cat','ps_car_05_cat']
selected_features_oh = selected_features.copy()
selected_features_oh, oh_onehotted_columns

In [None]:
# Swap every one-hot-encoded source column for its generated
# '<column>_oh_<value>' indicator columns.  Only the indicator columns are
# appended: the remaining selected features are already in the list, so
# adding them again would duplicate them and bring the removed source
# columns back.
selected_features_oh = [c for c in selected_features_oh if c not in oh_onehotted_columns]
oh_columns = [c for c in oh_train.columns if '_oh_' in c]
print(oh_columns)
selected_features_oh.extend(c for c in oh_columns if c not in selected_features_oh)
print(selected_features_oh)

In [None]:
oh_train.head(5)

In [None]:
oh_train = oh_train.fillna(-1)
oh_test = oh_test.fillna(-1)

In [None]:
# Evaluate on the one-hot feature list; `selected_features` would leave out
# every '_oh_' indicator column.
cross_val_model(oh_train[selected_features_oh], y, rf_model)

In [None]:
# Evaluate on the one-hot feature list; `selected_features` would leave out
# every '_oh_' indicator column.
cross_val_model(oh_train[selected_features_oh], y, xgb_model)

In [None]:
# Evaluate on the one-hot feature list; `selected_features` would leave out
# every '_oh_' indicator column.
cross_val_model(oh_train[selected_features_oh], y, lgb_model)

Fit LGBMClassifier fold 1


In [None]:
cross_val_model(train[selected_features], y, rf_model)

In [None]:
cross_val_model(train[selected_features], y, xgb_model)

In [None]:
cross_val_model(train[selected_features], y, lgb_model)

In [None]:
# Names of all columns of `trn_df` that contain "_cat".
# NOTE(review): `trn_df` is not defined in any visible cell (the frames used above are
# `train` / `test`) — confirm where it is supposed to come from before running this.
f_cats = [f for f in trn_df.columns if "_cat" in f]

In [None]:
# For every categorical feature, store the two results of one target_encode call as a
# "<feature>_avg" column in `trn_df` and in `sub_df`.
# NOTE(review): `target_encode`, `trn_df`, `sub_df` and `target` are not defined in any
# visible cell; presumably they belong to the kernel this cell was adapted from — confirm
# and define them (or map them to `train` / `test` / `y`) before running.
for f in f_cats:
    trn_df[f + "_avg"], sub_df[f + "_avg"] = target_encode(trn_series=trn_df[f],
                                         tst_series=sub_df[f],
                                         target=target,
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)