In [2]:
import pandas as pd
import sklearn

In [3]:
sklearn.__version__

'0.19.1'

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)
 
def test_gini():
    def fequ(a,b):
        return abs( a -b) < 1e-6
    def T(a, p, g, n):
        assert( fequ(gini(a,p), g) )
        assert( fequ(gini_normalized(a,p), n) )
    T([1, 2, 3], [10, 20, 30], 0.111111, 1)
    T([1, 2, 3], [30, 20, 10], -0.111111, -1)
    T([1, 2, 3], [0, 0, 0], -0.111111, -1)
    T([3, 2, 1], [0, 0, 0], 0.111111, 1)
    T([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8)
    T([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1)
    T([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0)
    T([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428,
      0.6)
    T([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1)
    T([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666,
      -0.333333)
    
test_gini()

In [6]:
df_train = pd.read_csv('../../train.csv', index_col=0)

In [7]:
df_train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [8]:
df_train.columns

Index([u'target', u'ps_ind_01', u'ps_ind_02_cat', u'ps_ind_03',
       u'ps_ind_04_cat', u'ps_ind_05_cat', u'ps_ind_06_bin', u'ps_ind_07_bin',
       u'ps_ind_08_bin', u'ps_ind_09_bin', u'ps_ind_10_bin', u'ps_ind_11_bin',
       u'ps_ind_12_bin', u'ps_ind_13_bin', u'ps_ind_14', u'ps_ind_15',
       u'ps_ind_16_bin', u'ps_ind_17_bin', u'ps_ind_18_bin', u'ps_reg_01',
       u'ps_reg_02', u'ps_reg_03', u'ps_car_01_cat', u'ps_car_02_cat',
       u'ps_car_03_cat', u'ps_car_04_cat', u'ps_car_05_cat', u'ps_car_06_cat',
       u'ps_car_07_cat', u'ps_car_08_cat', u'ps_car_09_cat', u'ps_car_10_cat',
       u'ps_car_11_cat', u'ps_car_11', u'ps_car_12', u'ps_car_13',
       u'ps_car_14', u'ps_car_15', u'ps_calc_01', u'ps_calc_02', u'ps_calc_03',
       u'ps_calc_04', u'ps_calc_05', u'ps_calc_06', u'ps_calc_07',
       u'ps_calc_08', u'ps_calc_09', u'ps_calc_10', u'ps_calc_11',
       u'ps_calc_12', u'ps_calc_13', u'ps_calc_14', u'ps_calc_15_bin',
       u'ps_calc_16_bin', u'ps_calc_17_bin', u'ps_c

** Default handling nan **

For now, just use columns distribution to fill in empty cells

In [9]:
df_train.replace(-1, np.nan, inplace=True)
null_vals = df_train.isnull().sum(axis=0)[df_train.isnull().sum(axis=0)>0]
value_count_columns = {i: df_train[i].value_counts(1) for i in df_train.columns}    
df_train_copy = df_train.copy()
for c in df_train.columns:
    total_num = df_train[c].isnull().sum()
    if(total_num == 0):
        continue
    print(c)
    
    random_vals = np.random.choice(list(value_count_columns[c].index), total_num, list(value_count_columns[c].values))
    
    df_train_copy[c].loc[df_train[c].isnull()] = random_vals


ps_ind_02_cat
ps_ind_04_cat
ps_ind_05_cat


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


ps_reg_03
ps_car_01_cat
ps_car_02_cat
ps_car_03_cat
ps_car_05_cat
ps_car_07_cat
ps_car_09_cat
ps_car_11
ps_car_12
ps_car_14


In [12]:
print('null values')
df_train_copy.isnull().sum(axis=0)

null values


target            0
ps_ind_01         0
ps_ind_02_cat     0
ps_ind_03         0
ps_ind_04_cat     0
ps_ind_05_cat     0
ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_14         0
ps_ind_15         0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_reg_01         0
ps_reg_02         0
ps_reg_03         0
ps_car_01_cat     0
ps_car_02_cat     0
ps_car_03_cat     0
ps_car_04_cat     0
ps_car_05_cat     0
ps_car_06_cat     0
ps_car_07_cat     0
ps_car_08_cat     0
ps_car_09_cat     0
ps_car_10_cat     0
ps_car_11_cat     0
ps_car_11         0
ps_car_12         0
ps_car_13         0
ps_car_14         0
ps_car_15         0
ps_calc_01        0
ps_calc_02        0
ps_calc_03        0
ps_calc_04        0
ps_calc_05        0
ps_calc_06        0
ps_calc_07        0
ps_calc_08        0
ps_calc_09        0
ps_calc_10        0
ps_calc_11        0
ps_calc_12        0


** Test full dataset without subsampling **

In [13]:
df_train_copy.shape

(595212, 58)

In [17]:
print('Current dist of 0/1')
df_train_copy['target'].value_counts()

Current dist of 0/1


0    573518
1     21694
Name: target, dtype: int64

In [23]:
from datetime import datetime
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [20]:
with open('potentially_unnecessary_features.txt') as r:
    features_to_remove = [l.strip() for l in r]

cols_for_random_forest = list(set(df_train_copy.columns) - set(features_to_remove + ['target']))

cols_for_lr = [
    'ps_ind_16_bin', 'ps_ind_09_bin', 'ps_ind_01', 'ps_ind_08_bin', 'ps_ind_06_bin', 
    'ps_car_12', 'ps_ind_17_bin', 'ps_calc_02', 'ps_calc_03', 'ps_calc_01', 'ps_reg_02',
    'ps_reg_03', 'ps_reg_01', 'ps_ind_04_cat', 'ps_car_07_cat', 'ps_car_02_cat', 'ps_car_11',
    'ps_car_13', 'ps_car_15', 'ps_car_14', 'ps_ind_07_bin', 'ps_ind_02_cat', 'ps_ind_15', 'ps_car_09_cat', 
    'ps_car_08_cat', 'ps_ind_05_cat', 'ps_ind_18_bin'
]

cols_for_bc = [
    u'ps_ind_09_bin', u'ps_ind_01', u'ps_ind_03', u'ps_ind_08_bin',
    u'ps_ind_06_bin', u'ps_car_12', u'ps_ind_17_bin', u'ps_calc_03',
    u'ps_reg_02', u'ps_reg_03', u'ps_calc_04', u'ps_reg_01',
    u'ps_ind_04_cat', u'ps_calc_09', u'ps_car_07_cat', u'ps_car_02_cat',
    u'ps_car_13', u'ps_ind_07_bin', u'ps_ind_02_cat', u'ps_ind_15',
    u'ps_car_09_cat', u'ps_calc_07', u'ps_calc_13', u'ps_calc_14',
    u'ps_car_06_cat', u'ps_calc_05', u'ps_car_08_cat', u'ps_ind_05_cat',
    u'ps_ind_18_bin'
]

In [117]:
from sklearn.model_selection import train_test_split


In [None]:
from sklearn.base.BaseEstimator

In [108]:
class ExtractFeatures(sklearn.base.BaseEstimator):
    """
    simply remove the sets of features we want to use for a specific model
    """
    def __init__(self, cols=[]):
        """
        define which columns to extract from dataframe
        """
        self.use_cols = cols
    
    #def get_params(self, deep=True):
    #    return {
    #        'cols': getattr(self, 'use_cols', None)
    #    }
    
    def dummy_extract(self, dfX):
        """
        extract cols and return as np.array
        """
        return dfX[self.use_cols].values
    def transform(self, dfX):
        return self.dummy_extract(dfX)
    def fit(self, dfX, _y):
        self.dummy_extract(dfX)
        return self

def TestExtract():
    a = pd.DataFrame({
        'a': ['a', 'b', 'c'],
        'b': ['d', 'e', 'f'],
        'c': ['g', 'h', 'i'],
    })
    col = ['a', 'c']
    assert ((a[col].values != ExtractFeatures(col).transform(a)).sum()==0)
    col = ['a']
    assert ((a[col].values != ExtractFeatures(col).transform(a)).sum()==0)
    col = ['c']
    assert ((a[col].values != ExtractFeatures(col).transform(a)).sum()==0)

TestExtract()

In [96]:
## LINE OF CODE TO EXTRACT THE CODE FROM A MODULE
# import inspect
# lines = inspect.getsourcelines(StandardScaler)
# print('\n'.join(lines[0]))

In [132]:
from sklearn.pipeline import Pipeline

pipe_rf = Pipeline(
    [
        ('extract', ExtractFeatures(cols_for_random_forest)),
        ('rfc', RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4))
    ]
)

pipe_lr = Pipeline(
    [
        ('extract', ExtractFeatures(cols_for_lr)),
        ('scl', StandardScaler()),
        ('lr', LogisticRegression(n_jobs=-1))
    ]
)

pipe_bc = Pipeline(
    [
        ('extract', ExtractFeatures(cols_for_bc)),        
        ('bc', BaggingClassifier(
                base_estimator=DecisionTreeClassifier(criterion='entropy', max_depth=100),
                n_estimators=100,
                max_features=1.0,
                bootstrap=True,
                bootstrap_features=True,
                n_jobs=8
            )
        )
    ]
)

Naively use train-test-split for now because easier to compare with the 'randomized' subsampling method below

In [140]:
acc = []

for i in range(10):
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    pipe_rf.fit(X_train, y_train)
    rf_vals = pipe_rf.predict(X_test)
    pipe_lr.fit(X_train, y_train)
    lr_vals = pipe_lr.predict_proba(X_test)[:,1]# .max(axis=1)
    pipe_bc.fit(X_train, y_train)
    bc_vals = pipe_bc.predict(X_test)
    acc.append({
        'rf_gini': gini_normalized(y_test, rf_vals),
        'lr_gini': gini_normalized(y_test, lr_vals),
        'bc_gini': gini_normalized(y_test, bc_vals),
        'averaged': gini_normalized(y_test, pd.DataFrame([rf_vals, lr_vals, bc_vals]).mean(axis=0))   
    })
    

0
1
2
3
4
5
6
7
8
9


In [142]:
pd.DataFrame(acc)

Unnamed: 0,averaged,bc_gini,lr_gini,rf_gini
0,-0.231304,0.00514,-0.231304,0.00514
1,-0.255207,-0.000453,-0.255207,-0.000453
2,-0.22825,-0.006096,-0.22825,-0.006096
3,-0.23307,0.00354,-0.23307,0.00354
4,-0.231312,0.001384,-0.231312,0.001384
5,-0.240506,0.001478,-0.240506,0.001478
6,-0.246159,-0.023071,-0.246159,-0.023071
7,-0.25152,-0.000194,-0.25152,-0.000194
8,-0.252621,0.001935,-0.252621,0.001935
9,-0.231274,-0.01051,-0.231274,-0.01051


In [145]:
acc_without_sub_sampling = pd.DataFrame(acc)
acc_without_sub_sampling.to_pickle('acc_no_sub_sample.pkl')

**Try to train on subsampled data**

1. use train_test_split to create a training dataset and a testing dataset
2. in the trainining dataset, subsample 10000 elements multiple times
    * for each subset training data, fit to models
3. for the testing set, predict using all models created from subsets
4. evaluate gini coeff

In [201]:
max_values = 10000
acc_with_sub_samp = []

for _ in range(10):
    print('a', _)
    X_train_for_sub, X_test_for_sub, y_train_for_sub, y_test_for_sub = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )

    all_subsets = []
    all_models = []
    for __ in range(10):
        print('b', __)
        # create series of subsampled datasets
        assert ((X_train_for_sub.index != y_train_for_sub.index).sum() == 0)
        sampled_indexes = np.hstack(list(pd.DataFrame(y_train_for_sub).groupby(by='target').apply(lambda x: x.sample(min(max_values, x.shape[0])).index)))

        new_x_train = X_train_for_sub.loc[sampled_indexes]
        new_y_train = y_train_for_sub.loc[sampled_indexes]
        all_models.extend(
            [
                # train each subsampled model
                pipe_rf.fit(new_x_train, new_y_train),
                pipe_lr.fit(new_x_train, new_y_train),
                pipe_bc.fit(new_x_train, new_y_train)
            ]
        )

    # fit data using all subsamples
    all_predicted_vals_rf = pd.DataFrame([all_models[i].predict(X_test_for_sub) for i in range(0, len(all_models), 3)]).mean(axis=0)
    # or choose argmax....??
    all_predicted_vals_lr = pd.DataFrame([all_models[i].predict_proba(X_test_for_sub)[:,1] for i in range(1, len(all_models), 3)]).mean(axis=0)
    all_predicted_vals_bc = pd.DataFrame([all_models[i].predict(X_test_for_sub) for i in range(2, len(all_models), 3)]).mean(axis=0)
    all_values_predicted = pd.concat([all_predicted_vals_rf,all_predicted_vals_lr, all_predicted_vals_bc], axis=1).mean(axis=1)
    acc_with_sub_samp.append({
        'rf_gini': gini_normalized(y_test_for_sub, all_predicted_vals_rf),
        'lr_gini': gini_normalized(y_test_for_sub, all_predicted_vals_lr),
        'bc_gini': gini_normalized(y_test_for_sub, all_predicted_vals_bc),
        'averaged': gini_normalized(y_test_for_sub,all_values_predicted)
    })
    

('a', 0)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 1)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 2)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 3)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 4)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 5)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 6)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 7)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 8)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('a', 9)
('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)


In [203]:
pd.DataFrame(acc_with_sub_samp)

Unnamed: 0,averaged,bc_gini,lr_gini,rf_gini
0,0.255151,0.157132,0.252415,0.156841
1,0.249068,0.151016,0.252121,0.155272
2,0.23051,0.1477,0.229635,0.16373
3,0.235002,0.155562,0.228198,0.14079
4,0.238146,0.140436,0.240538,0.14944
5,0.243857,0.144852,0.243409,0.159457
6,0.239681,0.156499,0.236141,0.163166
7,0.239429,0.153807,0.231204,0.16051
8,0.254107,0.159629,0.256301,0.164851
9,0.24282,0.150408,0.247106,0.158329


** Evaluate test dataset **

Stick with method above except this time evaluate how well it does on the real test data

In [205]:
all_models = []
max_values = 10000
for __ in range(20):
    print('b', __)
    # create series of subsampled datasets
    assert ((df_train_copy.index != df_train_copy.index).sum() == 0)
    sampled_indexes = np.hstack(list(pd.DataFrame(df_train_copy['target']).groupby(by='target').apply(lambda x: x.sample(min(max_values, x.shape[0])).index)))

    new_x_train = df_train_copy.loc[sampled_indexes]
    new_y_train = df_train_copy.loc[sampled_indexes]['target']
    
    all_models.extend(
        [
            # train each subsampled model
            pipe_rf.fit(new_x_train, new_y_train),
            pipe_lr.fit(new_x_train, new_y_train),
            pipe_bc.fit(new_x_train, new_y_train)
        ]
    )


('b', 0)
('b', 1)
('b', 2)
('b', 3)
('b', 4)
('b', 5)
('b', 6)
('b', 7)
('b', 8)
('b', 9)
('b', 10)
('b', 11)
('b', 12)
('b', 13)
('b', 14)
('b', 15)
('b', 16)
('b', 17)
('b', 18)
('b', 19)


In [210]:
df_test = pd.read_csv('../../test.csv', index_col=0)
df_test.replace(-1, np.nan, inplace=True)
df_test_copy = df_test.copy()
for c in df_test.columns:
    total_num = df_test[c].isnull().sum()
    if(total_num == 0):
        continue
    print(c)    
    random_vals = np.random.choice(list(value_count_columns[c].index), total_num, list(value_count_columns[c].values))
    df_test_copy[c].loc[df_test[c].isnull()] = random_vals

ps_ind_02_cat
ps_ind_04_cat
ps_ind_05_cat
ps_reg_03
ps_car_01_cat
ps_car_02_cat
ps_car_03_cat
ps_car_05_cat
ps_car_07_cat
ps_car_09_cat
ps_car_11
ps_car_14


In [214]:
 # fit data using all subsamples
df_test_all_predicted_vals_rf = pd.DataFrame([all_models[i].predict(df_test_copy) for i in range(0, len(all_models), 3)]).mean(axis=0)
# or choose argmax....??
df_test_all_predicted_vals_lr = pd.DataFrame([all_models[i].predict_proba(df_test_copy)[:,1] for i in range(1, len(all_models), 3)]).mean(axis=0)
df_test_all_predicted_vals_bc = pd.DataFrame([all_models[i].predict(df_test_copy) for i in range(2, len(all_models), 3)]).mean(axis=0)

df_test_all_values_predicted = pd.concat(
    [df_test_all_predicted_vals_rf,df_test_all_predicted_vals_lr, df_test_all_predicted_vals_bc], axis=1
).mean(axis=1)

In [257]:
df_test_all_values_predicted

0         0.141361
1         0.150017
2         0.480311
3         0.115642
4         0.498543
5         0.159062
6         0.161345
7         0.463788
8         0.878808
9         0.879075
10        0.148392
11        0.128527
12        0.199561
13        0.870162
14        0.524938
15        0.144294
16        0.169487
17        0.813746
18        0.101367
19        0.879613
20        0.481535
21        0.512359
22        0.847146
23        0.108962
24        0.160101
25        0.130096
26        0.908189
27        0.499367
28        0.490589
29        0.133205
            ...   
892786    0.131552
892787    0.156015
892788    0.836491
892789    0.157008
892790    0.502249
892791    0.122191
892792    0.127930
892793    0.506299
892794    0.486426
892795    0.147162
892796    0.179320
892797    0.551693
892798    0.536903
892799    0.834557
892800    0.832582
892801    0.469978
892802    0.145995
892803    0.811452
892804    0.122176
892805    0.159644
892806    0.136277
892807    0.

In [263]:
pd.DataFrame(df_test_all_values_predicted.values, index=df_test.index, columns=['target']).to_csv('submission_test.csv', sep=',')

*Final prediction is 0.234 with test data which is slightly similar to what we were predicting from train_test_split*

Basic point, simply using undersampling to rebalance data shows boost in gini accuracy based on simple train_test_split

** Next steps **

Need to proceed and try on the **imblearn library**