**Objectives**

* How to load a large file into memory using Pandas?
* How to take a representative sample from a population?
    * Stratified sample
* Feature Selection

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.metrics import roc_auc_score

from scipy import stats

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/NumPy/scikit-learn.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
np.random.seed(1)

# Project root ('~' is expanded to the current user's home directory).
basepath = os.path.expanduser('~/Desktop/src/Loan_Default_Prediction/')
# Add <basepath>/src to the module search path.
sys.path.append(os.path.join(basepath, 'src'))

**Stratified Sample**

In [2]:
def get_stratified_sample(X, y, train_size, random_state=10):
    """
    Split the row positions of ``X`` into a stratified train/test pair.

    Parameters
    ----------
    X : sized collection
        Only ``len(X)`` is used, to enumerate row positions.
    y : array-like
        Labels passed to ``train_test_split`` as ``stratify``.
    train_size : float or int
        Forwarded unchanged to ``train_test_split``.
    random_state : int, default 10
        Seed forwarded to ``train_test_split`` for reproducibility.

    Returns
    -------
    tuple
        ``(itrain, itest)``: the row positions assigned to each side.
    """
    row_positions = range(len(X))
    itrain, itest = train_test_split(
        row_positions,
        stratify=y,
        train_size=train_size,
        random_state=random_state,
    )
    return itrain, itest

In [3]:
# Open the raw training file as an iterator of DataFrame chunks
# (10,000 rows each) instead of parsing it in a single call.
chunksize = 10000

train_chunks = pd.read_table(
    os.path.join(basepath, 'data/raw/train_v2.csv'),
    sep=',',
    index_col='id',
    chunksize=chunksize,
)

In [4]:
# Consume the chunk iterator and stack all chunks into a single DataFrame.
train = pd.concat(train_chunks)

In [5]:
# Create a binary variable based on the target: 1 when loss > 0, otherwise 0.
# The builtin `int` replaces `np.int`, an alias for the same type that was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
train['is_default'] = (train.loss > 0).astype(int)

In [6]:
# Keep a sample (train_size=0.4) of the rows, stratified on the default flag.
itrain, itest = get_stratified_sample(train, train.is_default, train_size=0.4)
train_sample = train.iloc[itrain]

# Drop the name bound to the full frame; only the sample is used from here on.
del train

In [7]:
# (rows, columns) of the stratified sample
print('Shape of the sample: ', train_sample.shape)

Shape of the sample:  (42188, 771)


In [8]:
# Model inputs: every column except the raw target ('loss') and the binary
# label derived from it ('is_default').
features = train_sample.columns.drop(['is_default', 'loss'])

**Histogram of features (training set)**

In [None]:
# Half-open positional range [start_index, end_index) of columns to plot.
start_index = 760
end_index   = 770

# `.ix` was removed in pandas 1.0. The column labels are strings, so the
# integer slice was already resolved by position; `.iloc` states that explicitly.
train_sample.iloc[:, start_index:end_index].hist(figsize=(16, 12), bins=50)

# Save the figure under reports/figures, tagged with the plotted column range.
plt.savefig(os.path.join(basepath, 'reports/figures/feat_%s-%s'%(start_index, end_index)))

**Save the histograms to disk so that we can observe the distributions.**

In [9]:
# Split the sample again (train_size=0.7), stratified on the default flag,
# with a different seed than the one used to draw the sample.
itrain, itest = get_stratified_sample(
    train_sample,
    train_sample.is_default,
    train_size=0.7,
    random_state=11,
)

# Feature matrices for each side of the split.
X_train, X_test = (train_sample.iloc[rows][features] for rows in (itrain, itest))

# Matching binary targets.
y_train, y_test = (train_sample.is_default.iloc[rows] for rows in (itrain, itest))

In [10]:
class GoldenFeature(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that appends the column ``'f528-f527'``
    (``f528`` minus ``f527``) to a DataFrame.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the difference column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'f528'`` and ``'f527'``.

        Returns
        -------
        pandas.DataFrame
            A new frame with all columns of ``X`` plus ``'f528-f527'``.
        """
        # assign() builds a new frame, so the caller's DataFrame is not
        # modified as a side effect of fitting or predicting.
        return X.assign(**{'f528-f527': X['f528'] - X['f527']})

**Feature Selection**

In [11]:
class TreeBasedSelection(object):
    """
    Keep the columns an estimator ranks highest by ``feature_importances_``.

    Parameters
    ----------
    estimator : object
        Must provide ``fit(X, y)`` and expose ``feature_importances_`` afterwards.
    target : array-like, optional
        Labels used only when ``fit`` is called without ``y``.
    n_features_to_select : int, optional
        Number of top-ranked columns to keep; ``None`` keeps all columns,
        reordered by decreasing importance.
    """

    def __init__(self, estimator, target=None, n_features_to_select=None):
        self.estimator            =  estimator
        self.n_features_to_select =  n_features_to_select
        self.target               =  target

    def fit(self, X, y=None):
        """
        Fit the estimator and rank columns by decreasing importance.

        The labels passed as ``y`` take precedence, so the selector is fitted
        on the labels that belong to ``X``. ``self.target`` is used only when
        ``y`` is not supplied.

        Raises
        ------
        ValueError
            If neither ``y`` nor ``target`` is available.
        """
        labels = self.target if y is None else y
        if labels is None:
            raise ValueError('No labels available: pass `y` to fit() or `target` to the constructor.')

        self.estimator.fit(X, labels)

        self.importances = self.estimator.feature_importances_
        # argsort is ascending; reverse it so the most important column comes first
        self.indices     = np.argsort(self.importances)[::-1]

        return self

    def transform(self, X):
        """Return the selected columns of the 2-D array ``X``, most important first."""
        return X[:, self.indices[:self.n_features_to_select]]

In [14]:
# Steps run in order: add the engineered column, impute missing values,
# scale the features, keep the 20 top-ranked columns, then fit the forest.
pipeline = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[('golden_feature', GoldenFeature())])),
    ('imputer', Imputer()),
    ('scaler', MinMaxScaler()),
    ('select', TreeBasedSelection(ExtraTreesClassifier(), y_train, n_features_to_select=20)),
    ('model', RandomForestClassifier(n_estimators=50, n_jobs=2, random_state=5)),
])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('union', FeatureUnion(n_jobs=1, transformer_list=[('golden_feature', GoldenFeature())],
       transformer_weights=None)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('select', <__main_...estimators=50, n_jobs=2,
            oob_score=False, random_state=5, verbose=0, warm_start=False))])

In [15]:
# Column 1 of predict_proba is used as the score for the held-out rows.
preds = pipeline.predict_proba(X_test)[:, 1]
print('AUC score on unseen examples %f' % roc_auc_score(y_test, preds))

AUC score on unseen examples 0.924790


In [None]:
class FeatureExtractor:
    """
    Round a train/test pair of DataFrames to one decimal place and add
    binary indicator columns derived from f1, f9, f10, f14 and f6.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Each must contain the columns 'f1', 'f9', 'f10', 'f14' and 'f6'.
    """

    def __init__(self, train, test):
        self.train = train
        self.test  = test

    def extract(self):
        """Round both frames, add the indicator columns, and return ``(train, test)``."""
        self.round_values()
        self.create_features()

        return self.get_train(), self.get_test()

    def round_values(self):
        """Replace both frames with copies rounded to one decimal place."""
        self.train = np.around(self.train, decimals=1)
        self.test  = np.around(self.test, decimals=1)

    def create_features(self):
        """
        Add the indicator columns to the frames currently held.

        Columns are assigned in place, so calling this before
        ``round_values`` modifies the frames passed to the constructor.
        """
        self._add_indicator_columns(self.train)
        self._add_indicator_columns(self.test)

    @staticmethod
    def _add_indicator_columns(frame):
        """Add the five 0/1 indicator columns to ``frame`` in place."""
        # The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24.

        # features based on f1, f9 and f10: value below 140
        frame['f1_cat']  = (frame['f1'] < 140).astype(int)
        frame['f9_cat']  = (frame['f9'] < 140).astype(int)
        frame['f10_cat'] = (frame['f10'] < 140).astype(int)

        # feature based on f14: value equal to zero
        frame['f14_cat'] = (frame['f14'] == 0.0).astype(int)

        # feature based on f6: value below 20,000
        frame['f6_cat'] = (frame['f6'] < 2e4).astype(int)

    def get_train(self):
        """Return the current training frame."""
        return self.train

    def get_test(self):
        """Return the current test frame."""
        return self.test

In [None]:
# Build the rounded frames and indicator columns from the first 12 columns
# of each input frame.
# NOTE(review): `train` is removed by `del train` in an earlier cell and `test`
# is not defined anywhere in the visible notebook, so this cell fails under
# Restart & Run All - confirm which frames were intended here.
feat = FeatureExtractor(train[train.columns[:12]], test[test.columns[:12]])
train_sub, test_sub = feat.extract()

In [None]:
# Write the processed subsets under data/processed, without the index column.
train_sub.to_csv(os.path.join(basepath, 'data/processed/train_sub.csv'), index=False)
test_sub.to_csv(os.path.join(basepath, 'data/processed/test_sub.csv'), index=False)

# Write the raw target column ('loss') to its own file.
# NOTE(review): `train` is removed by `del train` in an earlier cell, so it must
# be reloaded before this line can run - confirm the intended source frame.
train[['loss']].to_csv(os.path.join(basepath, 'data/processed/target.csv'), index=False)