<h1><center>Credit Risk Analysis using IBM Snap ML</center></h1>

# Imports

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score
from scipy.stats import chi2_contingency,ttest_ind
from sklearn.utils import shuffle
import time

import warnings
warnings.filterwarnings('ignore')


# Dataset Visualization

In [2]:
# Read the complete customer history, then keep at most `rows` leading records
rows = 1000000
cust_pd_full = pd.read_csv('./credit_customer_data.csv')
cust_pd = cust_pd_full.head(rows)

# Report the size of the working frame: record count first, then column count
n_observations, n_variables = cust_pd.shape
print("There are " + str(n_observations) + " observations in the customer history dataset.")
print("There are " + str(n_variables) + " variables in the dataset.")

# Preview the first few records
cust_pd.head()

There are 1000000 observations in the customer history dataset.
There are 19 variables in the dataset.


Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL,IS_DEFAULT
0,77,EXISTING CREDITS PAID BACK,EDUCATION,27630,UNKNOWN/NONE,above 7 YRS,CT,NO,YES,YES,YES,NO,YES,NO,0,4,NO,NO,No
1,119,EXISTING CREDITS PAID BACK,ELECTRONICS,31314,above 1000 K USD,4 to 7 YRS,CT,YES,YES,YES,YES,NO,NO,YES,0,3,YES,YES,No
2,84,EXISTING CREDITS PAID BACK,FURNITURE,27630,above 1000 K USD,4 to 7 YRS,PA,NO,NO,YES,YES,YES,NO,YES,0,3,YES,YES,No
3,119,DELAY IN PAST,FURNITURE,33156,above 1000 K USD,up to 1 YR,PA,YES,NO,YES,NO,NO,NO,YES,0,3,NO,NO,Yes
4,105,DELAY IN PAST,FURNITURE,23946,above 1000 K USD,up to 1 YR,CT,NO,YES,YES,YES,YES,YES,NO,0,3,YES,YES,No


# Data Preprocessing

In [3]:
# Split dataframe into Features (X) and Labels (y).
# The label frame is taken as an explicit copy: its IS_DEFAULT column is
# overwritten in place further down the notebook, and assigning into a bare
# selection of `cust_pd` is the pattern pandas reports as SettingWithCopyWarning
# (which the notebook-wide warnings filter would otherwise hide).
cust_pd_Y = cust_pd[['IS_DEFAULT']].copy()
cust_pd_X = cust_pd.drop(['IS_DEFAULT'], axis=1)

print('cust_pd_X.shape=', cust_pd_X.shape, 'cust_pd_Y.shape=', cust_pd_Y.shape)

cust_pd_X.shape= (1000000, 18) cust_pd_Y.shape= (1000000, 1)


# Transform Labels (y)

In [4]:
# Preview the first rows of the label frame (IS_DEFAULT still holds its raw text values here)
cust_pd_Y.head()

Unnamed: 0,IS_DEFAULT
0,No
1,No
2,No
3,Yes
4,No


In [5]:
# Swap the textual IS_DEFAULT values for the integer codes produced by LabelEncoder
le = LabelEncoder()
encoded_default = le.fit_transform(cust_pd_Y['IS_DEFAULT'])
cust_pd_Y["IS_DEFAULT"] = encoded_default

# Preview the encoded labels
cust_pd_Y.head()

Unnamed: 0,IS_DEFAULT
0,0
1,0
2,0
3,1
4,0


# Transform Features (X)

In [6]:
# Report the dimensions of the feature frame, then preview its first rows
print('features X dataframe shape = ', cust_pd_X.shape)
cust_pd_X.head()

features X dataframe shape =  (1000000, 18)


Unnamed: 0,EMI_TENURE,CREDIT_HISTORY,TRANSACTION_CATEGORY,TRANSACTION_AMOUNT,ACCOUNT_TYPE,ACCOUNT_AGE,STATE,IS_URBAN,IS_STATE_BORDER,HAS_CO_APPLICANT,HAS_GUARANTOR,OWN_REAL_ESTATE,OTHER_INSTALMENT_PLAN,OWN_RESIDENCE,NUMBER_CREDITS,RFM_SCORE,OWN_CAR,SHIP_INTERNATIONAL
0,77,EXISTING CREDITS PAID BACK,EDUCATION,27630,UNKNOWN/NONE,above 7 YRS,CT,NO,YES,YES,YES,NO,YES,NO,0,4,NO,NO
1,119,EXISTING CREDITS PAID BACK,ELECTRONICS,31314,above 1000 K USD,4 to 7 YRS,CT,YES,YES,YES,YES,NO,NO,YES,0,3,YES,YES
2,84,EXISTING CREDITS PAID BACK,FURNITURE,27630,above 1000 K USD,4 to 7 YRS,PA,NO,NO,YES,YES,YES,NO,YES,0,3,YES,YES
3,119,DELAY IN PAST,FURNITURE,33156,above 1000 K USD,up to 1 YR,PA,YES,NO,YES,NO,NO,NO,YES,0,3,NO,NO
4,105,DELAY IN PAST,FURNITURE,23946,above 1000 K USD,up to 1 YR,CT,NO,YES,YES,YES,YES,YES,NO,0,3,YES,YES


# One-Hot Encoding of Categorical Features

In [7]:
# Columns treated as categorical; each one is expanded into one indicator
# column per distinct value. Columns not listed here are passed through unchanged.
categoricalColumns = [
    'CREDIT_HISTORY', 'TRANSACTION_CATEGORY', 'ACCOUNT_TYPE', 'ACCOUNT_AGE',
    'STATE', 'IS_URBAN', 'IS_STATE_BORDER', 'HAS_CO_APPLICANT', 'HAS_GUARANTOR',
    'OWN_REAL_ESTATE', 'OTHER_INSTALMENT_PLAN',
    'OWN_RESIDENCE', 'RFM_SCORE', 'OWN_CAR', 'SHIP_INTERNATIONAL',
]
cust_pd_X = pd.get_dummies(cust_pd_X, columns=categoricalColumns)

# Only the shape is reported: a mid-cell `.head()` expression is never
# displayed by Jupyter, so none is evaluated here.
print('features X dataframe shape = ', cust_pd_X.shape)

features X dataframe shape =  (1000000, 51)


# Normalize Features

In [8]:
# Min-max scale every column, then L1-normalize each row (axis=1).
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed further down, so held-out rows influence the
# scaling parameters — confirm this is acceptable for the reported accuracies.
min_max_scaler = MinMaxScaler()
features = normalize(min_max_scaler.fit_transform(cust_pd_X), norm='l1', axis=1)

# Wrap the resulting array back into a DataFrame under the same column labels
cust_pd_X = pd.DataFrame(data=features, columns=cust_pd_X.columns)
cust_pd_X.head()

Unnamed: 0,EMI_TENURE,TRANSACTION_AMOUNT,NUMBER_CREDITS,CREDIT_HISTORY_ALL CREDITS PAID BACK,CREDIT_HISTORY_CRITICAL ACCOUNT,CREDIT_HISTORY_DELAY IN PAST,CREDIT_HISTORY_EXISTING CREDITS PAID BACK,CREDIT_HISTORY_NONE TAKEN,TRANSACTION_CATEGORY_EDUCATION,TRANSACTION_CATEGORY_ELECTRONICS,TRANSACTION_CATEGORY_FURNITURE,TRANSACTION_CATEGORY_NEW CAR,TRANSACTION_CATEGORY_OTHER,TRANSACTION_CATEGORY_RETRAINING,TRANSACTION_CATEGORY_USED CAR,ACCOUNT_TYPE_100 to 500 K USD,ACCOUNT_TYPE_500 to 1000 K USD,ACCOUNT_TYPE_UNKNOWN/NONE,ACCOUNT_TYPE_above 1000 K USD,ACCOUNT_TYPE_up to 100 K USD,ACCOUNT_AGE_1 to 4 YRS,ACCOUNT_AGE_4 to 7 YRS,ACCOUNT_AGE_TBD,ACCOUNT_AGE_above 7 YRS,ACCOUNT_AGE_up to 1 YR,STATE_CT,STATE_NJ,STATE_NY,STATE_PA,IS_URBAN_NO,IS_URBAN_YES,IS_STATE_BORDER_NO,IS_STATE_BORDER_YES,HAS_CO_APPLICANT_NO,HAS_CO_APPLICANT_YES,HAS_GUARANTOR_NO,HAS_GUARANTOR_YES,OWN_REAL_ESTATE_NO,OWN_REAL_ESTATE_YES,OTHER_INSTALMENT_PLAN_NO,OTHER_INSTALMENT_PLAN_YES,OWN_RESIDENCE_NO,OWN_RESIDENCE_YES,RFM_SCORE_1,RFM_SCORE_2,RFM_SCORE_3,RFM_SCORE_4,OWN_CAR_NO,OWN_CAR_YES,SHIP_INTERNATIONAL_NO,SHIP_INTERNATIONAL_YES
0,0.027248,0.032698,0.0,0.0,0.0,0.0,0.06267,0.0,0.06267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06267,0.0,0.0,0.0,0.0,0.0,0.06267,0.0,0.06267,0.0,0.0,0.0,0.06267,0.0,0.0,0.06267,0.0,0.06267,0.0,0.06267,0.06267,0.0,0.0,0.06267,0.06267,0.0,0.0,0.0,0.0,0.06267,0.06267,0.0,0.06267,0.0
1,0.042667,0.037333,0.0,0.0,0.0,0.0,0.061333,0.0,0.0,0.061333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061333,0.0,0.0,0.061333,0.0,0.0,0.0,0.061333,0.0,0.0,0.0,0.0,0.061333,0.0,0.061333,0.0,0.061333,0.0,0.061333,0.061333,0.0,0.061333,0.0,0.0,0.061333,0.0,0.0,0.061333,0.0,0.0,0.061333,0.0,0.061333
2,0.029891,0.032609,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0625,0.0,0.0,0.0625,0.0,0.0625,0.0,0.0625,0.0625,0.0,0.0,0.0625,0.0,0.0,0.0625,0.0,0.0,0.0625,0.0,0.0625
3,0.042553,0.039894,0.0,0.0,0.0,0.06117,0.0,0.0,0.0,0.0,0.06117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06117,0.0,0.0,0.0,0.0,0.0,0.06117,0.0,0.0,0.0,0.06117,0.0,0.06117,0.06117,0.0,0.0,0.06117,0.06117,0.0,0.06117,0.0,0.06117,0.0,0.0,0.06117,0.0,0.0,0.06117,0.0,0.06117,0.0,0.06117,0.0
4,0.03794,0.0271,0.0,0.0,0.0,0.062331,0.0,0.0,0.0,0.0,0.062331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062331,0.0,0.0,0.0,0.0,0.0,0.062331,0.062331,0.0,0.0,0.0,0.062331,0.0,0.0,0.062331,0.0,0.062331,0.0,0.062331,0.0,0.062331,0.0,0.062331,0.062331,0.0,0.0,0.0,0.062331,0.0,0.0,0.062331,0.0,0.062331


# Generate Train and Test Datasets

In [9]:
# Extract plain NumPy arrays from the feature and label frames
features = cust_pd_X.values
labels = cust_pd_Y.values.reshape(-1, 1)  # labels stay a single-column 2-D array

# Hold out 30% of the rows for testing. The split is stratified on the labels
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

print('X_train.shape=', X_train.shape, 'Y_train.shape=', y_train.shape)
print('X_test.shape=', X_test.shape, 'Y_test.shape=', y_test.shape)

X_train.shape= (700000, 51) Y_train.shape= (700000, 1)
X_test.shape= (300000, 51) Y_test.shape= (300000, 1)


# Train a Logistic Regression Model using Scikit-Learn

In [10]:
# Baseline model. The estimator class comes from the Snap ML (pai4sk) package,
# but solver='liblinear' selects the Scikit-learn 'liblinear' solver.
# As the original authors note, the class can be imported from Scikit-learn
# itself instead if you want to verify that.
from pai4sk.linear_model import LogisticRegression

sklearn_lr = LogisticRegression(solver='liblinear')
print(sklearn_lr)

LogisticRegression(C=1.0, batch_size=100, class_weight=None, device_ids=[],
                   dual=False, eta=0.3, fit_intercept=True, grad_clip=1,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, num_threads=1, penalty='l2',
                   privacy=False, privacy_epsilon=10, random_state=None,
                   return_training_history=None, solver='liblinear', tol=0.0001,
                   use_gpu=True, verbose=0, warm_start=False)


In [11]:
# Fit the liblinear baseline and record the elapsed wall-clock time of the fit
fit_start = time.time()
sklearn_lr.fit(X_train, y_train)
sklearn_time = time.time() - fit_start
print("[sklearn] Training time (s):  {0:.2f}".format(sklearn_time))

# Score the fitted model on the held-out test rows
sklearn_pred = sklearn_lr.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, sklearn_pred)
print('[sklearn] Accuracy score : {0:.6f}'.format(sklearn_accuracy))

[sklearn] Training time (s):  5.29
[sklearn] Accuracy score : 0.957530


# Train a Logistic Regression Model using Snap ML

In [12]:
# Snap ML's own logistic regression, configured for GPU training on devices 0 and 1.
# This import rebinds the name LogisticRegression, which an earlier cell
# imported from pai4sk.linear_model.
from pai4sk import LogisticRegression

snapml_lr = LogisticRegression(
    use_gpu=True,
    device_ids=[0, 1],
)
print(snapml_lr.get_params())

[Info] If set num_threads should be a multiple of 32. GPU training will run with num_threads=256.
{'max_iter': 1000, 'regularizer': 1.0, 'use_gpu': True, 'device_ids': array([0, 1], dtype=uint32), 'class_weight': None, 'dual': True, 'verbose': False, 'num_threads': 256, 'penalty': 'l2', 'tol': 0.001, 'return_training_history': None, 'privacy': False, 'eta': 0.3, 'batch_size': 100, 'grad_clip': 1, 'privacy_epsilon': 10, 'fit_intercept': False, 'intercept_scaling': 1.0}


In [13]:
# Fit the Snap ML logistic regression and record the elapsed wall-clock time
fit_start = time.time()
model = snapml_lr.fit(X_train, y_train)
snapml_time = time.time() - fit_start
print("[Snap ML] Training time (s):  {0:.2f}".format(snapml_time))

# Score on the held-out test rows, then compare training times.
# `sklearn_time` must still hold the logistic-regression baseline timing here;
# the Random Forest baseline cell further down reassigns the same variable.
snapml_pred = snapml_lr.predict(X_test)
snapml_accuracy = accuracy_score(y_test, snapml_pred)
lr_speedup = sklearn_time / snapml_time
print('[Snap ML] Accuracy score : {0:.6f}'.format(snapml_accuracy))
print('[Logistic Regression] Snap ML vs. sklearn speedup : {0:.2f}x '.format(lr_speedup))

[Snap ML] Training time (s):  0.62
[Snap ML] Accuracy score : 0.957513
[Logistic Regression] Snap ML vs. sklearn speedup : 8.50x 


# Train a Random Forest Model using Scikit-Learn

In [14]:
# Import the Random Forest model from scikit-learn (not pai4sk): this is the baseline.
# Importing here rebinds RandomForestClassifier to the scikit-learn class, which
# matters if the pai4sk class of the same name (imported in a later cell) was
# already loaded in this session.
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): n_jobs=160 presumably matches the hardware thread count of the original machine — confirm for yours
sklearn_rf = RandomForestClassifier(n_estimators=160, n_jobs=160, random_state=0)

In [15]:
# Fit the scikit-learn random forest and record the elapsed wall-clock time.
# Note that this reuses (overwrites) the `sklearn_time` variable set by the
# logistic-regression baseline.
fit_start = time.time()
sklearn_rf.fit(X_train, y_train)
sklearn_time = time.time() - fit_start
print("[sklearn] Training time (s):  {0:.5f}".format(sklearn_time))

# Score the fitted forest on the held-out test rows
sklearn_pred = sklearn_rf.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, sklearn_pred)
print('[sklearn] Accuracy score : ', sklearn_accuracy)

[sklearn] Training time (s):  17.29556
[sklearn] Accuracy score :  0.9774766666666667


# Train a Random Forest Model using Snap ML

In [16]:
# Snap ML's random forest, built with the same hyper-parameters as the
# scikit-learn baseline. This import rebinds the name RandomForestClassifier,
# replacing the scikit-learn class imported earlier.
from pai4sk import RandomForestClassifier

snapml_rf = RandomForestClassifier(
    n_estimators=160,
    n_jobs=160,
    random_state=0,
)

In [17]:
# Fit the Snap ML random forest and record the elapsed wall-clock time
fit_start = time.time()
snapml_rf.fit(X_train, y_train)
snapml_time = time.time() - fit_start
print("[Snap ML] Training time (s):  {0:.5f}".format(snapml_time))

# Score on the held-out test rows, then compare against `sklearn_time`,
# which is expected to hold the random-forest baseline timing at this point
snapml_pred = snapml_rf.predict(X_test, num_threads=160)
snapml_accuracy = accuracy_score(y_test, snapml_pred)
rf_speedup = sklearn_time / snapml_time
print('[Snap ML] Accuracy score : {0:.6f}'.format(snapml_accuracy))
print('[Random Forest] Snap ML vs. sklearn speedup : {0:.2f}x '.format(rf_speedup))

[Snap ML] Training time (s):  14.31760
[Snap ML] Accuracy score : 0.979097
[Random Forest] Snap ML vs. sklearn speedup : 1.21x 


&copy; Copyright IBM Corporation 2019