In [37]:
import pandas as pd
import numpy as np
import os, sys

from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso

import warnings
warnings.filterwarnings('ignore')

np.random.seed(3)

basepath = os.path.expanduser('~/Desktop/src/Loan_Default_Prediction/')
sys.path.append(os.path.join(basepath, 'src'))

In [2]:
train  = pd.read_csv(os.path.join(basepath, 'data/processed/train_sub.csv'))
test   = pd.read_csv(os.path.join(basepath, 'data/processed/test_sub.csv'))
target = pd.read_csv(os.path.join(basepath, 'data/processed/target.csv'))

** Training examples are in the chronological order but not in the test set. **

In [3]:
train = train.iloc[np.random.permutation(len(train))] # shuffle training set.

** Fill missing values. **

In [4]:
train = train.fillna(-999) # fill missing values
test  = test.fillna(-999)

** Split dataset **

In [5]:
itrain, itest = train_test_split(range(len(train)), test_size=0.75, random_state=10)

X_train = train.iloc[itrain]
X_test  = train.iloc[itest]

y_train = target['loss'].iloc[itrain]
y_test  = target['loss'].iloc[itest]

In [27]:
def cv(estimators, X, y):
    skf = StratifiedKFold(y, n_folds=3, shuffle=True, random_state=12)
    best_estimators = [] # list of best estimators 
    scores = [] # mae score for every fold
    
    for itrain, itest in skf:
        Xtr = X[itrain]
        Xte = X[itest]
        
        ytr = y[itrain]
        yte = y[itest]
        
        min_score = np.Infinity
        
        for est in estimators:
            est.fit(Xtr, ytr)
            pred = est.predict(Xte)
            score = mean_absolute_error(yte, pred)
            
            if score < min_score:
                min_score      = score
                best_estimator = est
        
        best_estimators.append(best_estimator)
        scores.append(min_score)
    
    return best_estimators, scores

In [39]:
pipeline_rf = Pipeline([
        ('scale', StandardScaler()),
        ('model', RandomForestRegressor(n_estimators=50, max_depth=3, random_state=11, n_jobs=3))
    ])

pipeline_linear = Pipeline([
        ('scale', StandardScaler()),
        ('model', Lasso())
    ])

pipeline_gbr = Pipeline([
        ('model', GradientBoostingRegressor(n_estimators=50))
    ])

In [40]:
best_ests, scores = cv([pipeline_linear, pipeline_rf, pipeline_gbr], X_train.values, y_train.values)

In [41]:
best_ests

[Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=3, oob_score=False, random_state=11,
            verbose=0, warm_start=False))]),
 Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False))]),
 Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False)

In [42]:
scores

[1.4759880409888966, 1.4798356951500189, 1.4687342162506365]

In [43]:
# test on the held out examples
pipeline_rf.fit(X_train, y_train)
print('MAE Random Forest Regressor ', mean_absolute_error(y_test, pipeline_rf.predict(X_test)))

MAE Random Forest Regressor  1.46366763205


In [45]:
pipeline_linear.fit(X_train, y_train)
print('MAE Lasso Regression ', mean_absolute_error(y_test, pipeline_linear.predict(X_test)))

MAE Lasso Regression  1.45967873808


In [46]:
# train on the full dataset
pipeline_linear.fit(train, target)
loss = pipeline_linear.predict(test)

In [47]:
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sampleSubmission.csv'))

In [50]:
sample_sub['loss'] = loss
sample_sub.to_csv(os.path.join(basepath, 'submissions/baseline.csv'), index=False)