In [2]:
import pandas as pd
import sklearn

In [3]:
sklearn.__version__

'0.19.1'

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)
 
def test_gini():
    def fequ(a,b):
        return abs( a -b) < 1e-6
    def T(a, p, g, n):
        assert( fequ(gini(a,p), g) )
        assert( fequ(gini_normalized(a,p), n) )
    T([1, 2, 3], [10, 20, 30], 0.111111, 1)
    T([1, 2, 3], [30, 20, 10], -0.111111, -1)
    T([1, 2, 3], [0, 0, 0], -0.111111, -1)
    T([3, 2, 1], [0, 0, 0], 0.111111, 1)
    T([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8)
    T([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1)
    T([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0)
    T([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428,
      0.6)
    T([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1)
    T([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666,
      -0.333333)
    
test_gini()

In [6]:
df_train = pd.read_csv('../../train.csv', index_col=0)

In [7]:
df_train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [8]:
df_train.columns

Index([u'target', u'ps_ind_01', u'ps_ind_02_cat', u'ps_ind_03',
       u'ps_ind_04_cat', u'ps_ind_05_cat', u'ps_ind_06_bin', u'ps_ind_07_bin',
       u'ps_ind_08_bin', u'ps_ind_09_bin', u'ps_ind_10_bin', u'ps_ind_11_bin',
       u'ps_ind_12_bin', u'ps_ind_13_bin', u'ps_ind_14', u'ps_ind_15',
       u'ps_ind_16_bin', u'ps_ind_17_bin', u'ps_ind_18_bin', u'ps_reg_01',
       u'ps_reg_02', u'ps_reg_03', u'ps_car_01_cat', u'ps_car_02_cat',
       u'ps_car_03_cat', u'ps_car_04_cat', u'ps_car_05_cat', u'ps_car_06_cat',
       u'ps_car_07_cat', u'ps_car_08_cat', u'ps_car_09_cat', u'ps_car_10_cat',
       u'ps_car_11_cat', u'ps_car_11', u'ps_car_12', u'ps_car_13',
       u'ps_car_14', u'ps_car_15', u'ps_calc_01', u'ps_calc_02', u'ps_calc_03',
       u'ps_calc_04', u'ps_calc_05', u'ps_calc_06', u'ps_calc_07',
       u'ps_calc_08', u'ps_calc_09', u'ps_calc_10', u'ps_calc_11',
       u'ps_calc_12', u'ps_calc_13', u'ps_calc_14', u'ps_calc_15_bin',
       u'ps_calc_16_bin', u'ps_calc_17_bin', u'ps_c

** Default handling nan **

For now, just use columns distribution to fill in empty cells

In [9]:
df_train.replace(-1, np.nan, inplace=True)
null_vals = df_train.isnull().sum(axis=0)[df_train.isnull().sum(axis=0)>0]
value_count_columns = {i: df_train[i].value_counts(1) for i in df_train.columns}    
df_train_copy = df_train.copy()
for c in df_train.columns:
    total_num = df_train[c].isnull().sum()
    if(total_num == 0):
        continue
    print(c)
    
    random_vals = np.random.choice(list(value_count_columns[c].index), total_num, list(value_count_columns[c].values))
    
    df_train_copy[c].loc[df_train[c].isnull()] = random_vals


ps_ind_02_cat
ps_ind_04_cat
ps_ind_05_cat


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


ps_reg_03
ps_car_01_cat
ps_car_02_cat
ps_car_03_cat
ps_car_05_cat
ps_car_07_cat
ps_car_09_cat
ps_car_11
ps_car_12
ps_car_14


** Test full dataset without subsampling **

In [10]:
df_train_copy.shape

(595212, 58)

In [11]:
print('Current dist of 0/1')
df_train_copy['target'].value_counts()

Current dist of 0/1


0    573518
1     21694
Name: target, dtype: int64

In [10]:
from datetime import datetime
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [73]:
with open('potentially_unnecessary_features.txt') as r:
    features_to_remove = [l.strip() for l in r]

cols_for_random_forest = list(set(df_train_copy.columns) - set(features_to_remove + ['target']))

cols_for_lr = [
    'ps_ind_16_bin', 'ps_ind_09_bin', 'ps_ind_01', 'ps_ind_08_bin', 'ps_ind_06_bin', 
    'ps_car_12', 'ps_ind_17_bin', 'ps_calc_02', 'ps_calc_03', 'ps_calc_01', 'ps_reg_02',
    'ps_reg_03', 'ps_reg_01', 'ps_ind_04_cat', 'ps_car_07_cat', 'ps_car_02_cat', 'ps_car_11',
    'ps_car_13', 'ps_car_15', 'ps_car_14', 'ps_ind_07_bin', 'ps_ind_02_cat', 'ps_ind_15', 'ps_car_09_cat', 
    'ps_car_08_cat', 'ps_ind_05_cat', 'ps_ind_18_bin'
]

cols_for_bc = [
    u'ps_ind_09_bin', u'ps_ind_01', u'ps_ind_03', u'ps_ind_08_bin',
    u'ps_ind_06_bin', u'ps_car_12', u'ps_ind_17_bin', u'ps_calc_03',
    u'ps_reg_02', u'ps_reg_03', u'ps_calc_04', u'ps_reg_01',
    u'ps_ind_04_cat', u'ps_calc_09', u'ps_car_07_cat', u'ps_car_02_cat',
    u'ps_car_13', u'ps_ind_07_bin', u'ps_ind_02_cat', u'ps_ind_15',
    u'ps_car_09_cat', u'ps_calc_07', u'ps_calc_13', u'ps_calc_14',
    u'ps_car_06_cat', u'ps_calc_05', u'ps_car_08_cat', u'ps_ind_05_cat',
    u'ps_ind_18_bin'
]

all_cols = list(set(df_train_copy.columns) - set(['target']))

In [12]:
from sklearn.model_selection import train_test_split


In [13]:
class ExtractFeatures(sklearn.base.BaseEstimator):
    """
    simply remove the sets of features we want to use for a specific model
    """
    def __init__(self, cols=[], df_col_names=None):
        """
        define which columns to extract from dataframe
        """        
        self.use_idx_vals = False
        self.use_cols = cols
        if not (df_col_names is None):
            self.use_cols = [i for i, c in enumerate(df_col_names) if c in cols]                
            self.use_idx_vals = True
    
    #def get_params(self, deep=True):
    #    return {
    #        'cols': getattr(self, 'use_cols', None)
    #    }
    
    def dummy_extract(self, dfX):
        """
        extract cols and return as np.array
        """
        if self.use_idx_vals:
            if isinstance(dfX, pd.DataFrame):
                return dfX.values[:, self.use_cols]
            else:
                return dfX[:, self.use_cols]
        else:
            return dfX[self.use_cols].values
    def transform(self, dfX):
        return self.dummy_extract(dfX)
    def fit(self, dfX, _y):
        self.dummy_extract(dfX)
        return self

def TestExtract():
    a = pd.DataFrame({
        'a': ['a', 'b', 'c'],
        'b': ['d', 'e', 'f'],
        'c': ['g', 'h', 'i'],
    })
    col = ['a', 'c']
    assert ((a[col].values != ExtractFeatures(col).transform(a)).sum()==0)
    col = ['a']
    assert ((a[col].values != ExtractFeatures(col).transform(a)).sum()==0)
    col = ['c']
    assert ((a[col].values != ExtractFeatures(col).transform(a)).sum()==0)    
    assert ((a[col].values != ExtractFeatures(col, a.columns).transform(a)).sum()==0)
    assert ((a[col].values != ExtractFeatures(col, a.columns).transform(a.values)).sum()==0)

TestExtract()

In [16]:
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.pipeline import make_pipeline


In [19]:
p1 = make_pipeline(
    SMOTE(kind='borderline1'),
    ExtractFeatures(cols_for_lr, df_train_copy.columns),
    StandardScaler(),
    # ExtractFeatures(cols_for_random_forest, X_train.columns),
    # RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4)
    LogisticRegression(n_jobs=-1)
)

acc = []

for i in range(10):
    print(i, 'SMOTE')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    p1.fit(X_train, y_train)
    acc.append(gini_normalized(y_test, p1.predict_proba(X_test)[:,1]))
    
    

(0, 'SMOTE')


  " = {}.".format(self.n_jobs))


(1, 'SMOTE')
(2, 'SMOTE')
(3, 'SMOTE')
(4, 'SMOTE')
(5, 'SMOTE')
(6, 'SMOTE')
(7, 'SMOTE')
(8, 'SMOTE')
(9, 'SMOTE')


In [22]:
acc_smot = np.array(acc)

In [30]:
acc_smot.mean()

0.23488851786018133

In [23]:
p1 = make_pipeline(
    RandomOverSampler(),
    ExtractFeatures(cols_for_lr, df_train_copy.columns),
    StandardScaler(),
    # ExtractFeatures(cols_for_random_forest, X_train.columns),
    # RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4)
    LogisticRegression(n_jobs=-1)
)

acc = []

for i in range(10):
    print(i, 'RandomOS')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    p1.fit(X_train, y_train)
    acc.append(gini_normalized(y_test, p1.predict_proba(X_test)[:,1]))
    
    

(0, 'SMOTE')
(1, 'SMOTE')
(2, 'SMOTE')
(3, 'SMOTE')
(4, 'SMOTE')
(5, 'SMOTE')
(6, 'SMOTE')
(7, 'SMOTE')
(8, 'SMOTE')
(9, 'SMOTE')


In [24]:
acc_random_os = np.array(acc)

In [29]:
acc_random_os.mean()

0.2433658426353956

In [26]:
p1 = make_pipeline(
    ADASYN(),
    ExtractFeatures(cols_for_lr, df_train_copy.columns),
    StandardScaler(),
    # ExtractFeatures(cols_for_random_forest, X_train.columns),
    # RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4)
    LogisticRegression(n_jobs=-1)
)

acc = []

for i in range(10):
    print(i, 'adasyn')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    p1.fit(X_train, y_train)
    acc.append(gini_normalized(y_test, p1.predict_proba(X_test)[:,1]))
    
    

(0, 'adasyn')
(1, 'adasyn')
(2, 'adasyn')
(3, 'adasyn')
(4, 'adasyn')
(5, 'adasyn')
(6, 'adasyn')
(7, 'adasyn')
(8, 'adasyn')
(9, 'adasyn')


In [27]:
acc_random_ada = np.array(acc)

In [28]:
acc_random_ada.mean()

0.24147230463576355

In [31]:
from imblearn.under_sampling import RandomUnderSampler    


In [32]:
p1 = make_pipeline(
    RandomUnderSampler(),
    ExtractFeatures(cols_for_lr, df_train_copy.columns),
    StandardScaler(),
    # ExtractFeatures(cols_for_random_forest, X_train.columns),
    # RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4)
    LogisticRegression(n_jobs=-1)
)

acc = []

for i in range(10):
    print(i, 'randomus')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    p1.fit(X_train, y_train)
    acc.append(gini_normalized(y_test, p1.predict_proba(X_test)[:,1]))
    
    

(0, 'randomus')
(1, 'randomus')
(2, 'randomus')
(3, 'randomus')
(4, 'randomus')
(5, 'randomus')
(6, 'randomus')
(7, 'randomus')
(8, 'randomus')
(9, 'randomus')


In [33]:
acc_random_us = np.array(acc)

In [34]:
acc_random_us.mean()

0.24620052221528463

In [36]:
from imblearn.combine import SMOTEENN

In [37]:
p1 = make_pipeline(
    SMOTEENN(),
    ExtractFeatures(cols_for_lr, df_train_copy.columns),
    StandardScaler(),
    # ExtractFeatures(cols_for_random_forest, X_train.columns),
    # RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4)
    LogisticRegression(n_jobs=-1)
)

acc = []

for i in range(10):
    print(i, 'smotten')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    p1.fit(X_train, y_train)
    acc.append(gini_normalized(y_test, p1.predict_proba(X_test)[:,1]))
    
    

(0, 'randomus')
(1, 'randomus')
(2, 'randomus')
(3, 'randomus')
(4, 'randomus')
(5, 'randomus')
(6, 'randomus')
(7, 'randomus')
(8, 'randomus')
(9, 'randomus')


In [39]:
acc_smotten = np.array(acc)

In [42]:
acc_smotten.mean()

0.23725155204523718

In [43]:
pd.concat([
    pd.Series(acc_random_ada),
    pd.Series(acc_random_os),
    pd.Series(acc_random_us),
    pd.Series(acc_smot),
    pd.Series(acc_smotten)
], axis=1, keys=['ada', 'oversamp', 'undersample', 'smote', 'smotten'])

Unnamed: 0,ada,oversamp,undersample,smote,smotten
0,0.253428,0.237684,0.25183,0.235577,0.244569
1,0.240331,0.26189,0.240529,0.244412,0.239521
2,0.23715,0.2448,0.250779,0.22679,0.241326
3,0.235944,0.245608,0.245333,0.236361,0.237616
4,0.236073,0.224721,0.258767,0.242088,0.237371
5,0.238398,0.250222,0.24058,0.217712,0.236523
6,0.235966,0.242819,0.242224,0.242894,0.241098
7,0.252922,0.245038,0.235973,0.240498,0.23318
8,0.237245,0.231596,0.253828,0.234168,0.247017
9,0.247267,0.249281,0.242164,0.228385,0.214295


In [203]:
pd.DataFrame(acc_with_sub_samp)

Unnamed: 0,averaged,bc_gini,lr_gini,rf_gini
0,0.255151,0.157132,0.252415,0.156841
1,0.249068,0.151016,0.252121,0.155272
2,0.23051,0.1477,0.229635,0.16373
3,0.235002,0.155562,0.228198,0.14079
4,0.238146,0.140436,0.240538,0.14944
5,0.243857,0.144852,0.243409,0.159457
6,0.239681,0.156499,0.236141,0.163166
7,0.239429,0.153807,0.231204,0.16051
8,0.254107,0.159629,0.256301,0.164851
9,0.24282,0.150408,0.247106,0.158329


In [107]:
from imblearn.ensemble import EasyEnsemble
acc = []
for i in range(10):
    print(i, 'random undersampling')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    ee = EasyEnsemble(n_subsets=20)
    X_resampled, y_resampled = ee.fit_sample(X_train, y_train)
    p1 = make_pipeline(    
        ExtractFeatures(cols_for_lr, df_train_copy.columns),
        StandardScaler(),
        # ExtractFeatures(cols_for_random_forest, X_train.columns),
        # RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4)
        LogisticRegression(n_jobs=-1)
    )
    m = []
    for j in range(X_resampled.shape[0]):
        m.append(p1.fit(X_resampled[j], y_resampled[j]))

    r = []
    for j in m:
        r.append(j.predict(X_test))
    ans = np.array(r).mean(axis=0)    
    acc.append(gini_normalized(y_test, ans))

(0, 'random undersampling')
(1, 'random undersampling')
(2, 'random undersampling')
(3, 'random undersampling')
(4, 'random undersampling')
(5, 'random undersampling')
(6, 'random undersampling')
(7, 'random undersampling')
(8, 'random undersampling')
(9, 'random undersampling')


In [109]:
acc

[0.16472553786600316,
 0.19146966415267491,
 0.1772546486143953,
 0.17172927882928207,
 0.16447852716697131,
 0.17018646254723979,
 0.18827503245692345,
 0.17236914391368621,
 0.18193425571658708,
 0.17623813149013859]

In [None]:
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

p1 = make_pipeline(
    ExtractFeatures(all_cols, df_train_copy.columns),
    BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
        ratio='auto',
        n_estimators=100,
        replacement=False,
    ) 
)

acc = []

for i in range(10):
    print(i, 'bagging balanced')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    p1.fit(X_train, y_train)
    acc.append(gini_normalized(y_test, p1.predict(X_test)))

In [75]:
acc

[0.1693896728459835,
 0.16618195722301302,
 0.18371921123840745,
 0.16490023213808078,
 0.17476471469413715,
 0.16081644773843329,
 0.16542185049146743,
 0.17106598879042093,
 0.16310255002402649,
 0.17469009764631929]

In [78]:
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

p1 = make_pipeline(
    ExtractFeatures(cols_for_lr, df_train_copy.columns),
    BalancedBaggingClassifier(base_estimator=LogisticRegression(),
        ratio='auto',
        n_estimators=100,
        replacement=False,
    ) 
)

acc = []

for i in range(10):
    print(i, 'bagging balanced')
    X_train, X_test, y_train, y_test = train_test_split(
        df_train_copy, df_train_copy['target'], test_size=0.2
    )
    p1.fit(X_train, y_train)
    acc.append(gini_normalized(y_test, p1.predict(X_test)))

(0, 'bagging balanced')
(1, 'bagging balanced')
(2, 'bagging balanced')
(3, 'bagging balanced')
(4, 'bagging balanced')
(5, 'bagging balanced')
(6, 'bagging balanced')
(7, 'bagging balanced')
(8, 'bagging balanced')
(9, 'bagging balanced')


In [79]:
acc

[0.17421833087875949,
 0.1657204548527598,
 0.17218184557625865,
 0.17476176883100758,
 0.18068998999732605,
 0.18658202836245627,
 0.1758742111837176,
 0.1707358348557296,
 0.174572490215683,
 0.18025772703885518]