In this notebook:

    -- Balance the dataset
    -- SelecKBest on numeric and categorical features
    -- Model (SVC)

Install & import packages

In [3]:
!pip install --quiet tqdm

In [4]:
!pip install --upgrade pandas

Requirement already up-to-date: pandas in /opt/conda/lib/python3.6/site-packages
Requirement already up-to-date: pytz>=2011k in /opt/conda/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: numpy>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: python-dateutil>=2 in /opt/conda/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2->pandas)


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import skewtest, boxcox
from sklearn.base import TransformerMixin, BaseEstimator
from tqdm import tqdm

# Load the data

In [6]:
# raw data
train_raw = pd.read_csv('./train.csv')

In [7]:
train_raw.shape

(595212, 59)

In [8]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
id                595212 non-null int64
target            595212 non-null int64
ps_ind_01         595212 non-null int64
ps_ind_02_cat     595212 non-null int64
ps_ind_03         595212 non-null int64
ps_ind_04_cat     595212 non-null int64
ps_ind_05_cat     595212 non-null int64
ps_ind_06_bin     595212 non-null int64
ps_ind_07_bin     595212 non-null int64
ps_ind_08_bin     595212 non-null int64
ps_ind_09_bin     595212 non-null int64
ps_ind_10_bin     595212 non-null int64
ps_ind_11_bin     595212 non-null int64
ps_ind_12_bin     595212 non-null int64
ps_ind_13_bin     595212 non-null int64
ps_ind_14         595212 non-null int64
ps_ind_15         595212 non-null int64
ps_ind_16_bin     595212 non-null int64
ps_ind_17_bin     595212 non-null int64
ps_ind_18_bin     595212 non-null int64
ps_reg_01         595212 non-null float64
ps_reg_02         595212 non-null float64
ps_re

In [9]:
train = pd.read_pickle('./train.pkl')
# test = pd.read_pickle('./test.pkl')

In [10]:
train.shape

(595212, 246)

In [11]:
print('Train dataset has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
# print('Test dataset has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

Train dataset has 595212 rows and 246 columns


Create a combined dataset to deskew, then split out again

In [12]:
target = train['target']
train.drop('target', axis=1, inplace=True)
train.drop('id', axis=1, inplace=True)

In [13]:
print('Train dataset has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
# print('Test dataset has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

Train dataset has 595212 rows and 244 columns


In [14]:
numeric_cols = []
for col in train.columns:
    if '_bin' in col:
        pass
    elif '_cat' in col:
        pass
    else:
        numeric_cols.append(col)
    
        

In [15]:
len(numeric_cols)

26

# Feature Selection from model

In [21]:
from sklearn.feature_selection import SelectKBest, SelectFromModel, chi2
from sklearn.linear_model import RidgeClassifierCV

In [17]:
train.shape


(595212, 244)

In [18]:
train[numeric_cols].shape

(595212, 26)

In [19]:
# f-classif for continuous variables
skb_num = SelectKBest(k = 26)
train_num_skb = skb_num.fit_transform(train[numeric_cols], target)

# chi-2 for categorical vars
skb_chi = SelectKBest(chi2, k = 218)
train_class_skb = skb_chi.fit_transform(train[[col
                                              for col in train.columns
                                              if col not in numeric_cols]], target)


In [125]:
num_to_remove = []
for i, value in enumerate(skb_num.scores_):
    if value < 1.0:
        num_to_remove.append(i)

cat_to_remove = []
for i, value in enumerate(skb_chi.scores_):
    if value < 1.0:
        cat_to_remove.append(i)

In [126]:
len(num_to_remove)

11

In [127]:
len(cat_to_remove)

50

In [128]:
num_sub = pd.DataFrame(train_num_skb).drop(num_to_remove, axis = 1)

In [129]:
cat_sub = pd.DataFrame(train_class_skb).drop(cat_to_remove, axis = 1)

In [130]:
train_skb = pd.concat([num_sub, cat_sub], axis=1)

In [132]:
train.shape

(595212, 244)

In [131]:
train_skb.shape

(595212, 183)

# Balance and Split Data

In [153]:
print("The number of positvie classes is: {}".format(sum(target)))
print("The number of negative classes is: {}".format(len(target) - sum(target)))
print("The balance percent: {}".format(sum(target) / len(target)))

The number of positvie classes is: 21694
The number of negative classes is: 573518
The balance percent: 0.036447517859182946


In [154]:
def rebalance(train, target, pos_perc = 0.5, random_state = 42):
    """
    Return a balanced dataset where the positive class is minority
    
    train: 
    target:
    pos_perc: 
    random_state: 
    
    """
    # combine train and target
    combined = pd.concat([train, target], axis=1)
    
    # Minority class
    min_class_sample = combined[combined['target'] == 1]
    
    # Sample equal number 
    maj_class_sample = combined[combined['target']==0].sample(min_class_sample.shape[0])
    
    # Combined balanced into one dataframe
    balanced = pd.concat([maj_class_sample, min_class_sample], axis=0)
    
    # Shuffle the data
    balanced = balanced.sample(frac=1).reset_index(drop=True)
    
    return balanced


In [155]:
balanced = rebalance(train_skb, target)

In [156]:
# verify target is balanced
balanced.shape[0]/2 == sum(balanced['target'])

True

In [157]:
target = balanced['target']
balanced.drop('target', axis=1, inplace=True)

In [158]:
print('Train dataset has {} rows and {} columns'.format(balanced.shape[0], balanced.shape[1]))
# print('Test dataset has {} rows and {} columns'.format(test.shape[0], test.shape[1]))

Train dataset has 43388 rows and 183 columns


In [183]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
     balanced, target, test_size=0.33, random_state=42)


# Models

## SVC

In [161]:
import time

In [162]:
from sklearn.svm import SVC
svc = SVC(probability=True, verbose=True)
start = time.time()
svc.fit(X_train, y_train)
end = time.time()
print("Fit took: {} mins".format((end-start) / 60))

[LibSVM]Fit took: 19.88777727683385 mins


In [168]:
svc.score(X_test,y_test)

0.59354703540750053

## XGBoost

In [177]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-0.6a2.tar.gz (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 574kB/s eta 0:00:01
Building wheels for collected packages: xgboost
  Running setup.py bdist_wheel for xgboost ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/5e/c1/d6/522af54e5cc001fad4dd855117f8bf61b11d56443e06672e26
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.6a2


In [178]:
import xgboost as xgb



In [233]:
mdl = xgb.XGBClassifier(n_estimators=400, reg_alpha=1.0, reg_lambda=0.0)
start = time.time()
mdl.fit(X_train.values, y_train.values)
end = time.time()
print("Fit took: {} mins".format((end-start) / 60))

Fit took: 0.08701473077138265 mins


In [235]:
mdl.score(X_train.values, y_train.values)

0.65003956104441152

In [234]:
mdl.score(X_test.values, y_test.values)

0.60234653257909077

# Metrics

In [236]:
from sklearn.metrics import (classification_report, auc, confusion_matrix, recall_score, 
                             precision_score, roc_curve, roc_auc_score)

from sklearn.metrics import precision_recall_curve

In [237]:
# svc_preds = svc.predict_proba(X_test)
# svc_p = svc.predict(X_test)
xgb_preds = mdl.predict_proba(X_test.values)
xgb_p = mdl.predict(X_test.values)

In [238]:
precision_recall_curve(y_test, svc_p)

(array([ 0.5020602 ,  0.61212121,  1.        ]),
 array([ 1.        ,  0.51982195,  0.        ]),
 array([0, 1]))

In [239]:
precision_recall_curve(y_test, xgb_p)

(array([ 0.5020602 ,  0.61379205,  1.        ]),
 array([ 1.        ,  0.56085686,  0.        ]),
 array([0, 1]))

In [240]:
2*roc_auc_score(y_test, svc_p) - 1

0.18770413813998599

In [241]:
2*roc_auc_score(y_test, xgb_p) - 1

0.20503638779598776

In [242]:
print(classification_report(y_test, svc_p))

             precision    recall  f1-score   support

          0       0.58      0.67      0.62      7130
          1       0.61      0.52      0.56      7189

avg / total       0.60      0.59      0.59     14319



In [243]:
print(classification_report(y_test, xgb_p))

             precision    recall  f1-score   support

          0       0.59      0.64      0.62      7130
          1       0.61      0.56      0.59      7189

avg / total       0.60      0.60      0.60     14319

