# Credit Scoring Prediction -Model 1

## 1. Goal and Objective

## 2. Model Description

XGBoost, stands for eXtreme Gradient Boosting and is a relatively new algorithm that implements  gradient boosting decision tree algorithm. Boosting is a technique that uses an ensemble( a collection) of models. New models are added to correct the errors of previous models. Gradient Boosting uses the gradient descent algorithm to minimize the residuals(loss) of prior models. Unlike other boosting algoritms, XGBoost is particularly fast and accurate. For more about the algorithm, please see: https://arxiv.org/pdf/1603.02754.pdf

## 3. Data Description


### 1. Load Data

In [62]:
%pylab inline

#Importations
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from typing import Tuple
import pandas as pd
import json
from sklearn.externals import joblib
import pickle

#Variables
random_state = 42
seed = 7
test_size = 0.33
reg_lambda = 2 #XG Boost's L2 regularization term on weights, increasing it makes the model more conservative.default=1


# load data and the features to be used in the classification
data = pd.read_csv('../data/fraud_data.csv')
features  = ['gender', 'Age_at_joindate','loans_completed','primary_credit_score','education', 'employed']

#Define the input matrix and output/target 
X = data.filter(items = features)
y= data['fraudulent']

#Split X and Y into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Train the XGBoost Model

In [56]:
#Code excerpts from:  https://github.com/berkeley-biosense

#Function that creates a new classifier
def fresh_clf () -> XGBClassifier:
    return XGBClassifier(
        objective= 'binary:logistic',#'binary:logitraw',
        seed=seed,
        reg_lambda=reg_lambda
    )

#Function that returns the classifier and the resulting dataframe
def xgb_cross_validate (
    X: np.array,
    y: np.array,
    nfold: int=7
) -> Tuple[XGBClassifier, pd.DataFrame]:
    # eval_metrics:
    # http://xgboost.readthedocs.io/en/latest//parameter.html
    metrics = ['error@0.1', 'auc']
#     metrics = [ 'auc' ]
    # we use the @ syntax to override the default of 0.5 as the threshold for 0 / 1 classification
    # the intent here to to minimize FAR at the expense of FRR
    alg = fresh_clf()
    xgtrain = xgb.DMatrix(X,y)
    param = alg.get_xgb_params()
    cvresults = xgb.cv(param,
                      xgtrain,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=nfold,
                      metrics=metrics,
                      early_stopping_rounds=100
                      )
    alg.set_params(n_estimators=cvresults.shape[0])
    alg.fit(X,y,eval_metric=metrics)
    return alg, cvresults

In [57]:

X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, 
    test_size=0.33, 
    random_state=42)

clf, cvres = xgb_cross_validate(X_train, y_train)
#clf.score(X_test, y_test)
m = clf.predict(X_test)
c = clf.score(X_test, y_test)

In [58]:
c, m

(1.0,
 array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
        0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
        1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
        1, 0, 0, 0, 1]))

In [52]:
# fit model no training data
model = XGBClassifier(objective = "binary:logitraw", seed =27)
model.fit(X_train, y_train)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logitraw', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=1)

In [53]:

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [value for value in y_pred]

In [54]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 100.00%


In [55]:
y_pred

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1])

Model Persistence 

To save the model for later, we can use serialization(pickling) or we can use JobLib

In [65]:
joblib.dump(model, 'XGBoost_model.pkl')

['XGBoost_model.pkl']

In [69]:
loaded_clf = joblib.load('XGBoost_model.pkl')
real_world_X = [0,43,0,527,1,1] #gender, age_at_joindate, loans_completed, primary_credit_score, education, employed
new_case = loaded_clf.predict(real_world_X)
new_case_prob = loaded_clf.predict_proba(real_world_X)
print('For this new case we predicted ', new_case[0],' with the probability estimates: ',new_case_prob)

ValueError: feature_names mismatch: ['gender', 'Age_at_joindate', 'loans_completed', 'primary_credit_score', 'education', 'employed'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5']
expected employed, Age_at_joindate, primary_credit_score, loans_completed, education, gender in input data
training data did not have the following fields: f2, f1, f5, f3, f0, f4

In [66]:
# save the model to disk
filename = '.XGBoost_model.sav'
pickle.dump(model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

NameError: name 'Y_test' is not defined