In [2]:
import pandas as pd
import sklearn

In [3]:
sklearn.__version__

'0.19.1'

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)
 
def test_gini():
    def fequ(a,b):
        return abs( a -b) < 1e-6
    def T(a, p, g, n):
        assert( fequ(gini(a,p), g) )
        assert( fequ(gini_normalized(a,p), n) )
    T([1, 2, 3], [10, 20, 30], 0.111111, 1)
    T([1, 2, 3], [30, 20, 10], -0.111111, -1)
    T([1, 2, 3], [0, 0, 0], -0.111111, -1)
    T([3, 2, 1], [0, 0, 0], 0.111111, 1)
    T([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8)
    T([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1)
    T([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0)
    T([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428,
      0.6)
    T([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1)
    T([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666,
      -0.333333)
    
test_gini()

In [6]:
df_train = pd.read_csv('../../train.csv', index_col=0)

In [10]:
df_train['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

In [11]:
df_train.columns

Index([u'target', u'ps_ind_01', u'ps_ind_02_cat', u'ps_ind_03',
       u'ps_ind_04_cat', u'ps_ind_05_cat', u'ps_ind_06_bin', u'ps_ind_07_bin',
       u'ps_ind_08_bin', u'ps_ind_09_bin', u'ps_ind_10_bin', u'ps_ind_11_bin',
       u'ps_ind_12_bin', u'ps_ind_13_bin', u'ps_ind_14', u'ps_ind_15',
       u'ps_ind_16_bin', u'ps_ind_17_bin', u'ps_ind_18_bin', u'ps_reg_01',
       u'ps_reg_02', u'ps_reg_03', u'ps_car_01_cat', u'ps_car_02_cat',
       u'ps_car_03_cat', u'ps_car_04_cat', u'ps_car_05_cat', u'ps_car_06_cat',
       u'ps_car_07_cat', u'ps_car_08_cat', u'ps_car_09_cat', u'ps_car_10_cat',
       u'ps_car_11_cat', u'ps_car_11', u'ps_car_12', u'ps_car_13',
       u'ps_car_14', u'ps_car_15', u'ps_calc_01', u'ps_calc_02', u'ps_calc_03',
       u'ps_calc_04', u'ps_calc_05', u'ps_calc_06', u'ps_calc_07',
       u'ps_calc_08', u'ps_calc_09', u'ps_calc_10', u'ps_calc_11',
       u'ps_calc_12', u'ps_calc_13', u'ps_calc_14', u'ps_calc_15_bin',
       u'ps_calc_16_bin', u'ps_calc_17_bin', u'ps_c

In [7]:
df_train.replace(-1, np.nan, inplace=True)

In [8]:
null_vals = df_train.isnull().sum(axis=0)[df_train.isnull().sum(axis=0)>0]

In [9]:
value_count_columns = {i: df_train[i].value_counts(1) for i in df_train.columns}    

In [10]:
df_train_copy = df_train.copy()
for c in df_train.columns:
    total_num = df_train[c].isnull().sum()
    if(total_num == 0):
        continue
    print(c)
    
    random_vals = np.random.choice(list(value_count_columns[c].index), total_num, list(value_count_columns[c].values))
    
    df_train_copy[c].loc[df_train[c].isnull()] = random_vals


ps_ind_02_cat
ps_ind_04_cat
ps_ind_05_cat


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


ps_reg_03
ps_car_01_cat
ps_car_02_cat
ps_car_03_cat
ps_car_05_cat
ps_car_07_cat
ps_car_09_cat
ps_car_11
ps_car_12
ps_car_14


In [11]:
df_train.columns

Index([u'target', u'ps_ind_01', u'ps_ind_02_cat', u'ps_ind_03',
       u'ps_ind_04_cat', u'ps_ind_05_cat', u'ps_ind_06_bin', u'ps_ind_07_bin',
       u'ps_ind_08_bin', u'ps_ind_09_bin', u'ps_ind_10_bin', u'ps_ind_11_bin',
       u'ps_ind_12_bin', u'ps_ind_13_bin', u'ps_ind_14', u'ps_ind_15',
       u'ps_ind_16_bin', u'ps_ind_17_bin', u'ps_ind_18_bin', u'ps_reg_01',
       u'ps_reg_02', u'ps_reg_03', u'ps_car_01_cat', u'ps_car_02_cat',
       u'ps_car_03_cat', u'ps_car_04_cat', u'ps_car_05_cat', u'ps_car_06_cat',
       u'ps_car_07_cat', u'ps_car_08_cat', u'ps_car_09_cat', u'ps_car_10_cat',
       u'ps_car_11_cat', u'ps_car_11', u'ps_car_12', u'ps_car_13',
       u'ps_car_14', u'ps_car_15', u'ps_calc_01', u'ps_calc_02', u'ps_calc_03',
       u'ps_calc_04', u'ps_calc_05', u'ps_calc_06', u'ps_calc_07',
       u'ps_calc_08', u'ps_calc_09', u'ps_calc_10', u'ps_calc_11',
       u'ps_calc_12', u'ps_calc_13', u'ps_calc_14', u'ps_calc_15_bin',
       u'ps_calc_16_bin', u'ps_calc_17_bin', u'ps_c

In [12]:
df_train.shape

(595212, 58)

In [13]:
df_train.target.value_counts()

0    573518
1     21694
Name: target, dtype: int64

** Feature Importance **

Identify the most important features for training the data

Using a random forest classifier:
* fit all data (excluding the removed binary features above)
* After fit report feature_importances_ which produces weighted list of how much each feature contributed to classification

In [14]:
from datetime import datetime
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [15]:
with open('potentially_unnecessary_features.txt') as r:
    features_to_remove = [l.strip() for l in r]


In [26]:
df_features = df_train_copy.drop('target', axis=1)
# df_features_trimmed = df_features[list(set(df_features.columns) - set(features_to_remove) - set(['target']))]

In [60]:
test = pd.read_csv('../../test.csv', dtype={'id': np.int32}, index_col=0)

In [25]:
test.replace(-1, np.nan, inplace=True)

In [32]:
df_test_copy = test.copy()
for c in test.columns:
    total_num = test[c].isnull().sum()
    if(total_num == 0):
        continue
    print(c)
    
    random_vals = np.random.choice(list(value_count_columns[c].index), total_num, list(value_count_columns[c].values))
    
    df_test_copy[c].loc[test[c].isnull()] = random_vals
    
    

ps_ind_02_cat
ps_ind_04_cat
ps_ind_05_cat
ps_reg_03
ps_car_01_cat
ps_car_02_cat
ps_car_03_cat
ps_car_05_cat
ps_car_07_cat
ps_car_09_cat
ps_car_11
ps_car_14


In [35]:
test.columns

Index([u'ps_ind_01', u'ps_ind_02_cat', u'ps_ind_03', u'ps_ind_04_cat',
       u'ps_ind_05_cat', u'ps_ind_06_bin', u'ps_ind_07_bin', u'ps_ind_08_bin',
       u'ps_ind_09_bin', u'ps_ind_10_bin', u'ps_ind_11_bin', u'ps_ind_12_bin',
       u'ps_ind_13_bin', u'ps_ind_14', u'ps_ind_15', u'ps_ind_16_bin',
       u'ps_ind_17_bin', u'ps_ind_18_bin', u'ps_reg_01', u'ps_reg_02',
       u'ps_reg_03', u'ps_car_01_cat', u'ps_car_02_cat', u'ps_car_03_cat',
       u'ps_car_04_cat', u'ps_car_05_cat', u'ps_car_06_cat', u'ps_car_07_cat',
       u'ps_car_08_cat', u'ps_car_09_cat', u'ps_car_10_cat', u'ps_car_11_cat',
       u'ps_car_11', u'ps_car_12', u'ps_car_13', u'ps_car_14', u'ps_car_15',
       u'ps_calc_01', u'ps_calc_02', u'ps_calc_03', u'ps_calc_04',
       u'ps_calc_05', u'ps_calc_06', u'ps_calc_07', u'ps_calc_08',
       u'ps_calc_09', u'ps_calc_10', u'ps_calc_11', u'ps_calc_12',
       u'ps_calc_13', u'ps_calc_14', u'ps_calc_15_bin', u'ps_calc_16_bin',
       u'ps_calc_17_bin', u'ps_calc_18_bin'

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_features.values, df_train_copy['target'].values, test_size=0.2
)

In [28]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([458725,  17444]))

In [36]:
X_test_pd = pd.DataFrame(X_test, columns=df_features.columns)

In [33]:
training_df = pd.concat([pd.DataFrame(X_train, columns=df_features.columns), pd.Series(y_train, name='target')], axis=1)

In [41]:
all_subsets = []
for i in range(10):
    all_subsets.append(training_df.groupby(by=['target']).apply(lambda x: x.sample(min(10000, x.shape[0]))))
    

In [42]:
models = []
for s in all_subsets:    
    df_features = s.drop('target', axis=1, errors='ignore')
    df_features_trimmed = df_features[list(set(s.columns) - set(['target'] + features_to_remove))]
    df_response = s[['target']]
    
    models.append({})
    models[-1]['rfc'] = RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=None, n_jobs=4)
        
    X = df_features_trimmed.values
    y = df_response.values.ravel()    
    models[-1]['rfc'].fit(X,y)
    
    
    df_features_trimmed2 = df_features[[
        'ps_ind_16_bin', 'ps_ind_09_bin', 'ps_ind_01', 'ps_ind_08_bin', 'ps_ind_06_bin', 
        'ps_car_12', 'ps_ind_17_bin', 'ps_calc_02', 'ps_calc_03', 'ps_calc_01', 'ps_reg_02',
        'ps_reg_03', 'ps_reg_01', 'ps_ind_04_cat', 'ps_car_07_cat', 'ps_car_02_cat', 'ps_car_11',
        'ps_car_13', 'ps_car_15', 'ps_car_14', 'ps_ind_07_bin', 'ps_ind_02_cat', 'ps_ind_15', 'ps_car_09_cat', 
        'ps_car_08_cat', 'ps_ind_05_cat', 'ps_ind_18_bin'
    ]]
    X = df_features_trimmed2.values        
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    models[-1]['scaler'] = StandardScaler()
    scaled_data = models[-1]['scaler'].fit_transform(X)
    models[-1]['lr'] = LogisticRegression(n_jobs=-1)
    models[-1]['lr'].fit(scaled_data, y)
    
    df_features_trimmed = df_features[[u'ps_ind_09_bin', u'ps_ind_01', u'ps_ind_03', u'ps_ind_08_bin',
       u'ps_ind_06_bin', u'ps_car_12', u'ps_ind_17_bin', u'ps_calc_03',
       u'ps_reg_02', u'ps_reg_03', u'ps_calc_04', u'ps_reg_01',
       u'ps_ind_04_cat', u'ps_calc_09', u'ps_car_07_cat', u'ps_car_02_cat',
       u'ps_car_13', u'ps_ind_07_bin', u'ps_ind_02_cat', u'ps_ind_15',
       u'ps_car_09_cat', u'ps_calc_07', u'ps_calc_13', u'ps_calc_14',
       u'ps_car_06_cat', u'ps_calc_05', u'ps_car_08_cat', u'ps_ind_05_cat',
       u'ps_ind_18_bin']]
    X = df_features_trimmed.values        
    
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.base import BaseEstimator
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import RandomForestClassifier
        
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=100)

    models[-1]['bg'] =BaggingClassifier(
        base_estimator=tree,
        n_estimators=100,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=True,
        n_jobs=8
    )
    models[-1]['bg'].fit(X,y)



In [43]:
final_scores = []
for i,s in enumerate(models):
    Xt = X_test_pd[list(set(X_test_pd.columns) - set(['target'] + features_to_remove))]
    final_scores.append(s['rfc'].predict(Xt))
    Xt_trans = Xt[[
       'ps_ind_16_bin', 'ps_ind_09_bin', 'ps_ind_01', 'ps_ind_08_bin', 'ps_ind_06_bin', 
        'ps_car_12', 'ps_ind_17_bin', 'ps_calc_02', 'ps_calc_03', 'ps_calc_01', 'ps_reg_02',
        'ps_reg_03', 'ps_reg_01', 'ps_ind_04_cat', 'ps_car_07_cat', 'ps_car_02_cat', 'ps_car_11',
        'ps_car_13', 'ps_car_15', 'ps_car_14', 'ps_ind_07_bin', 'ps_ind_02_cat', 'ps_ind_15', 'ps_car_09_cat', 
        'ps_car_08_cat', 'ps_ind_05_cat', 'ps_ind_18_bin'
    ]]
    Xt_trans = s['scaler'].transform(Xt_trans)
    final_scores.append(s['lr'].predict(Xt_trans))
    Xt_trans_3 = Xt[[u'ps_ind_09_bin', u'ps_ind_01', u'ps_ind_03', u'ps_ind_08_bin',
       u'ps_ind_06_bin', u'ps_car_12', u'ps_ind_17_bin', u'ps_calc_03',
       u'ps_reg_02', u'ps_reg_03', u'ps_calc_04', u'ps_reg_01',
       u'ps_ind_04_cat', u'ps_calc_09', u'ps_car_07_cat', u'ps_car_02_cat',
       u'ps_car_13', u'ps_ind_07_bin', u'ps_ind_02_cat', u'ps_ind_15',
       u'ps_car_09_cat', u'ps_calc_07', u'ps_calc_13', u'ps_calc_14',
       u'ps_car_06_cat', u'ps_calc_05', u'ps_car_08_cat', u'ps_ind_05_cat',
       u'ps_ind_18_bin'        
    ]]
    final_scores.append(s['bg'].predict(Xt_trans_3))

In [44]:
all_counts = pd.DataFrame(final_scores).T

In [45]:
r = all_counts.mean(axis=1)

In [46]:
gini_normalized(y_test, r)

0.2513329189471995

In [108]:
all_subsets = []
for i in range(5):
    all_subsets.append(df_train_copy.groupby(by=['target']).apply(lambda x: x.sample(min(10000, x.shape[0]))))

(595212, 58)

In [107]:
df_test_copy

Unnamed: 0_level_0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1.0,8,1.0,0.0,0,1,0,0,0,...,1,1,1,12,0,1,1,0,0,1
1,4,2.0,5,1.0,0.0,0,0,0,1,0,...,2,0,3,10,0,0,1,1,0,1
2,5,1.0,3,0.0,0.0,0,0,0,1,0,...,4,0,2,4,0,0,0,0,0,0
3,0,1.0,6,0.0,0.0,1,0,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,5,1.0,7,0.0,0.0,0,0,0,1,0,...,4,0,0,4,0,1,1,0,0,1
5,0,1.0,6,0.0,0.0,1,0,0,0,0,...,8,1,4,9,1,0,1,0,1,0
6,0,1.0,3,0.0,0.0,0,1,0,0,0,...,2,0,4,6,1,1,0,0,0,0
8,0,1.0,0,0.0,0.0,1,0,0,0,0,...,3,1,4,9,0,1,0,0,0,0
10,0,1.0,7,0.0,0.0,0,1,0,0,0,...,5,1,4,6,0,0,1,0,0,0
11,1,1.0,6,0.0,0.0,0,0,0,1,0,...,6,1,6,10,0,1,1,0,0,0
