In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load packages & data

import matplotlib.pyplot as plt
import seaborn as sns

# SimpleImputer replaces the previous sklearn.preprocessing.Imputer estimator which is now removed.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# Show up to 100 columns when a wide frame is displayed.
pd.options.display.max_columns = 100

# Read the competition's training and test sets.
train = pd.read_csv('../input/porto-seguro-safe-driver-prediction/train.csv')
test = pd.read_csv('../input/porto-seguro-safe-driver-prediction/test.csv')

1. Data overall

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Preview the last rows of the training set.
train.tail()

-binary variables

-categorical variables (integer)

-integer / float variables

* -1 은 missing value를 의미한다

In [None]:
# (number of rows, number of columns) of the training set.
train.shape

In [None]:
# drop_duplicates() returns a new frame instead of modifying `train`, so the
# result has to be assigned back; otherwise the shape shown below is simply the
# unchanged original shape and duplicates (if any) are still present.
train = train.drop_duplicates().reset_index(drop=True)
train.shape

In [None]:
# Column names, non-null counts and dtypes of the training set.
train.info()

2. Metadata

    column 에 대해서 metadata를 따로 저장

        role (ID, target, input)

        level (binary, nominal, interval, ordinal)

        keep (True, False)

        dtype (int, float, str)

In [None]:
# Build one metadata record per column of `train` (role, measurement level,
# keep-flag, dtype) and collect the records in the `meta` frame indexed by
# column name. Later cells select columns by filtering this table.
data = []
for c in train.columns:
    # dtype
    dtype = train[c].dtype

    # role
    if c == 'id':
        role = 'id'
    elif c == 'target':
        role = 'target'
    else:
        role = 'input'

    # level: the name-based rules take precedence over the dtype-based ones
    if 'bin' in c or c == 'target':
        level = 'binary'
    elif 'cat' in c or c == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(dtype):
        # Any float width counts, not only the platform default
        level = 'interval'
    elif pd.api.types.is_integer_dtype(dtype):
        # Any integer width counts (`dtype == int` is platform dependent)
        level = 'ordinal'
    else:
        # Without this branch the previous column's level would be reused silently
        raise ValueError(
            'Cannot infer level for column {!r} with dtype {}'.format(c, dtype)
        )

    # keep: every column except the id is kept
    keep = c != 'id'

    data.append({
        'varname': c,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype,
    })

meta = pd.DataFrame(
    data, columns=['varname', 'role', 'level', 'keep', 'dtype']
).set_index('varname')

In [None]:
# Display the column metadata table.
meta

In [None]:
# Example lookup: names of all nominal columns that are still flagged as kept.
meta[(meta.level == 'nominal')&(meta.keep)].index

3. Descriptive statistics

In [None]:
# Summary statistics of the kept interval-level columns.
x = meta[(meta.level=='interval') & (meta.keep)].index
train[x].describe()

missing value가 존재하는 column (min이 -1) 의 경우 그렇지 않은 column들과

범위, 평균 등에서 차이가 난다 -> scaling 필요!

In [None]:
# Summary statistics of the kept ordinal-level columns.
x = meta[(meta.level=='ordinal') & (meta.keep)].index
train[x].describe()

In [None]:
# Summary statistics of the kept binary columns (this includes `target`,
# which the metadata labels as binary).
v = meta[(meta.level == 'binary') & (meta.keep)].index
train[v].describe()

target의 mean이 0.0365인 것으로 보아, target value가 불균형적임을 알 수 있다.

(target=0인 record가 target=1인 record보다 훨씬 많음)

4. Handling Imbalanced classes

    - target=1 인 record를 oversampling
    
    - target=0 인 record undersampling
    
    - 등등...

In [None]:
# Large training set -> rebalance by undersampling the target=0 records until
# target=1 records make up `desired_apriori` of the resulting frame.

desired_apriori = 0.10

# Index labels of each class
idx_0 = train[train.target == 0].index
idx_1 = train[train.target == 1].index

# Count the labels directly; slicing the frame with .loc just to take its
# length would copy every row of each class.
nb_0 = len(idx_0)
nb_1 = len(idx_1)

# Calculate the undersampling rate and resulting number of records with target=0
undersampling_rate = ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori)
# Never ask for more target=0 rows than exist: shuffle() samples without
# replacement and raises when n_samples exceeds the population.
undersampled_nb_0 = min(int(undersampling_rate*nb_0), nb_0)
print('Rate to undersample records with target=0: {}'.format(undersampling_rate))
print('Number of records with target=0 after undersampling: {}'.format(undersampled_nb_0))

# Randomly select records with target=0 to get at the desired a priori
undersampled_idx = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)

# Selected target=0 labels followed by every target=1 label
idx_list = list(undersampled_idx) + list(idx_1)

# Keep only those rows and renumber them 0..n-1
train = train.loc[idx_list].reset_index(drop=True)

5. Data quality Checks

    5.1 Check missing values
    
    5.2 Check cardinality of categorical variables
    
        cardinality = number of different values in a variable
        
        categorical column은 dummy variable로 바꿀 예정
        
        check whether there are many distinct values, as they would result in many dummy variables.

In [None]:
# Report every column that contains the value -1 (the missing-value marker)
# together with the number and share of affected rows.
cols_with_missing = []
n_rows = train.shape[0]

for col in train.columns:
    n_missing = (train[col] == -1).sum()
    if n_missing == 0:
        continue

    cols_with_missing.append(col)
    share_missing = n_missing / n_rows
    print('Column {} has {} records ({:.2%}) with missing values'.format(col, n_missing, share_missing))

print('In total, {} columns with missing values'.format(len(cols_with_missing)))

ps_car_03_cat, ps_car_05_cat은 missing value의 비율이 크므로 열 제거

그 외의 경우, -1을 mean, mode 등으로 대체하는 방법을 사용할 수 있다.

In [None]:
# Remove the two columns with the largest share of missing values, both from
# the data and from the metadata. errors='ignore' keeps the cell re-runnable
# after the columns are already gone. Without the drop, the columns would stay
# in `train` with their raw -1 values and flow into feature selection.
cols_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train = train.drop(columns=cols_to_drop, errors='ignore')
meta.loc[cols_to_drop, 'keep'] = False

# Replace the remaining -1 markers: mean for the continuous columns,
# most frequent value for ps_car_11.
mean_imp = SimpleImputer(missing_values=-1, strategy='mean')
mode_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')
for col in ['ps_reg_03', 'ps_car_12', 'ps_car_14']:
    # fit_transform refits the imputer on each column, so every column is
    # filled with its own mean
    train[col] = mean_imp.fit_transform(train[[col]]).ravel()
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']]).ravel()

In [None]:
# Print the number of distinct values of every kept nominal column.
v = meta[(meta.level=='nominal')&(meta.keep)].index
for col, n_distinct in train[v].nunique().items():
    print('Column {} has {} distinct values'.format(col, n_distinct))

In [None]:
# Script by https://www.kaggle.com/ogrellier
# Code: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    """Scale each element of `series` by (1 + noise_level * z).

    `z` is one standard-normal draw per element, taken from NumPy's global
    random state. With noise_level == 0 the values come back unchanged.
    """
    jitter = np.random.randn(len(series))
    return series * (1 + noise_level * jitter)

def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Encode a categorical feature by a smoothed per-category mean of the target.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series (same length as trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : passed to add_noise() for both returned series

    Returns a tuple (encoded train series, encoded test series). Both are named
    '<feature>_mean' and carry the index of their respective input series.
    Categories without a computed average are filled with the overall target mean.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    feature = trn_series.name
    # Overall target mean, used as the prior and as the fill value
    prior = target.mean()

    # Per-category target mean and number of observations
    stats = (
        pd.concat([trn_series, target], axis=1)
        .groupby(by=feature)[target.name]
        .agg(["mean", "count"])
    )

    # Sigmoid weight: the bigger the count, the less the prior is taken into account
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    stats.drop(["mean", "count"], axis=1, inplace=True)

    # Lookup table with the category values and their blended 'average'
    lookup = stats.reset_index().rename(columns={'index': target.name, target.name: 'average'})

    def _encode(series):
        # Left-join the lookup onto the feature values
        encoded = pd.merge(
            series.to_frame(series.name),
            lookup,
            on=series.name,
            how='left')['average']
        encoded = encoded.rename(feature + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
# Target-encode ps_car_11_cat for both train and test, using statistics
# computed from the train rows.
train_encoded, test_encoded = target_encode(
    trn_series=train["ps_car_11_cat"],
    tst_series=test["ps_car_11_cat"],
    target=train.target,
    min_samples_leaf=100,
    smoothing=10,
    noise_level=0.01,
)

# Swap the raw categorical column for its encoded counterpart in both frames
train = train.assign(ps_car_11_cat_te=train_encoded).drop(columns='ps_car_11_cat')
test = test.assign(ps_car_11_cat_te=test_encoded).drop(columns='ps_car_11_cat')

# Updating the meta: the raw column no longer exists
meta.loc['ps_car_11_cat', 'keep'] = False

6. EDA

In [None]:
# Categorical variables: one bar chart per kept nominal column showing the
# share of target=1 in each category, highest share first.

v = meta[(meta.level == 'nominal')&(meta.keep)].index
for c in v:
    # plt.subplots() already creates the figure; an extra plt.figure() call
    # would leave an empty figure behind on every iteration.
    fig, ax = plt.subplots(figsize=(20,10))

    # calculate the percentage of target=1 per category
    cat_perc = train[[c, 'target']].groupby([c], as_index=False).mean()
    cat_perc = cat_perc.sort_values(by='target', ascending=False)

    # Bar plot
    sns.barplot(ax=ax, x=c, y='target', data=cat_perc, order=cat_perc[c])
    ax.set_ylabel('% target', fontsize=18)
    ax.set_xlabel(c, fontsize=18)
    ax.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

몇몇 categorical column에서는 missing value(-1)인 row의 보험 청구 확률 (target=1의 비율)이 높게 나타난다

->  It is a good idea to keep the missing values as a separate category value, instead of replacing them by the mode for instance

In [None]:
# Correlations between Interval variables

def corr_heatmap(v):
    """Draw an annotated correlation heatmap for the columns `v` of the global `train` frame.

    v : column labels (anything accepted by `train[v]`); their pairwise
        correlations are computed with DataFrame.corr() and plotted.
    """
    corr_matrix = train[v].corr()

    # Diverging colour map between two hues, centred on zero correlation below
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(10,10))
    sns.heatmap(
        corr_matrix,
        ax=ax,
        cmap=palette,
        center=0,
        vmax=1.0,
        annot=True,
        fmt='.2f',
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .75},
    )
    plt.show()
    
# Heatmap for the kept interval-level columns.
v = meta[(meta.level == 'interval') & (meta.keep)].index
corr_heatmap(v)

There are strong correlations between the following variables:

    ps_reg_02 and ps_reg_03 (0.7)

    ps_car_12 and ps_car_13 (0.67)

    ps_car_12 and ps_car_14 (0.58)

    ps_car_13 and ps_car_15 (0.67)

In [None]:
# Look at the correlations on a 10% sample of train instead of the full frame.
# A fixed random_state makes the sampled rows, and therefore the plots that
# use `s`, identical on every run.
s = train.sample(frac=0.1, random_state=37)

In [None]:
# ps_reg_02 vs ps_reg_03 on the sampled rows, one regression line per target class
sns.lmplot(
    data=s,
    x='ps_reg_02',
    y='ps_reg_03',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()

In [None]:
# ps_car_12 vs ps_car_13 on the sampled rows, one regression line per target class
sns.lmplot(
    data=s,
    x='ps_car_12',
    y='ps_car_13',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()

In [None]:
# ps_car_12 vs ps_car_14 on the sampled rows, one regression line per target class
sns.lmplot(
    data=s,
    x='ps_car_12',
    y='ps_car_14',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()

In [None]:
# ps_car_15 vs ps_car_13 on the sampled rows, one regression line per target class
sns.lmplot(
    data=s,
    x='ps_car_15',
    y='ps_car_13',
    hue='target',
    palette='Set1',
    scatter_kws={'alpha': 0.3},
)
plt.show()


In [None]:
# Correlation between ordinal variables: heatmap for the kept ordinal-level columns

v = meta[(meta.level == 'ordinal')&(meta.keep)].index
corr_heatmap(v)

7. Feature Engineering

    - Create dummy variables
    - Create interaction variables

In [None]:
# One-hot encode the kept nominal columns; drop_first=True omits the first
# level of each column. The column count is reported before and after.
nominal_mask = (meta.level == 'nominal') & meta.keep
v = meta.loc[nominal_mask].index
print('Before dummification : {} variables'.format(len(train.columns)))
train = pd.get_dummies(train, columns=v, drop_first=True)
print('After dummification : {} variables'.format(len(train.columns)))

In [None]:
# Add degree-2 polynomial terms (squares and pairwise products) of the kept
# interval-level columns to train.
v = meta[(meta.level == 'interval') & (meta.keep)].index
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly_values = poly.fit_transform(train[v])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(poly, 'get_feature_names_out'):
    poly_names = poly.get_feature_names_out(list(v))
else:
    poly_names = poly.get_feature_names(list(v))

# Reuse train's index so the column-wise concat below lines rows up even if
# train's index is not a plain 0..n-1 range.
interactions = pd.DataFrame(data=poly_values, columns=poly_names, index=train.index)
interactions = interactions.drop(columns=v)  # Remove the original columns

# Concat the interaction variables to the train data
print('Before creating interactions we have {} variables in train'.format(train.shape[1]))
train = pd.concat([train, interactions], axis=1)
print('After creating interactions we have {} variables in train'.format(train.shape[1]))

8. Feature selection

    - Removing features with zero or low variance
      
      sklearn "VarianceThreshold"

In [None]:
# Fit a variance filter on all feature columns (everything except id and
# target) and list the columns whose variance is below the threshold.
features = train.drop(['id', 'target'], axis=1)
selector = VarianceThreshold(threshold=.01)
selector.fit(features)

# get_support() is a boolean mask of the retained columns; inverting it with ~
# selects the rejected ones without a Python-level vectorised lambda.
v = features.columns[~selector.get_support()]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))

Selecting features with a Random Forest and sklearn "SelectFromModel"

In [None]:
# Split train into the feature matrix and the target vector
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']

feat_labels = X_train.columns

# Use feature importance of a random forest
rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
importances = rf.feature_importances_

# Column positions ordered from the most to the least important feature
indices = np.argsort(importances)[::-1]

# Print rank, feature name (padded to 30 characters) and importance
for rank, col_pos in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[col_pos], importances[col_pos]))

In [None]:
# Set threshold on the level of feature importance: with threshold='median'
# the already fitted forest keeps the features whose importance reaches the
# median, i.e. roughly the top 50%.
sfm = SelectFromModel(rf, threshold='median', prefit=True)

print('Before selection: {} features'.format(X_train.shape[1]))
n_features = sfm.transform(X_train).shape[1]
print('After selection: {} features'.format(n_features))

# Names of the selected feature columns
selected_vars = feat_labels[sfm.get_support()].tolist()

9. Feature scaling

In [None]:
# Standardise the feature columns. `id` is excluded together with `target`,
# consistent with the feature-selection cells: it is a row identifier, not a
# feature. The scaled array is kept in a variable so it can be reused rather
# than only displayed.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train.drop(['id', 'target'], axis=1))
train_scaled