# Import Libraries

In [3]:
# imports
# import numpy pandas scipy and matplotlib
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt

# estimators 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier

# model metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# cross validation
from sklearn.model_selection import train_test_split

# Reading Data Set

In [4]:
#data
rawData = pd.read_csv("C:/Users/kylie/Desktop/Data Science/C5T3/Resources/default of credit card clients.csv", header=1)
rawData.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          30000 non-null  int64
 1   LIMIT_BAL                   30000 non-null  int64
 2   SEX                         30000 non-null  int64
 3   EDUCATION                   30000 non-null  int64
 4   MARRIAGE                    30000 non-null  int64
 5   AGE                         30000 non-null  int64
 6   PAY_0                       30000 non-null  int64
 7   PAY_2                       30000 non-null  int64
 8   PAY_3                       30000 non-null  int64
 9   PAY_4                       30000 non-null  int64
 10  PAY_5                       30000 non-null  int64
 11  PAY_6                       30000 non-null  int64
 12  BILL_AMT1                   30000 non-null  int64
 13  BILL_AMT2                   30000 non-null  int64
 14  BILL_A

# Feature Reduction

### Data Pre-Process 
### Correlation Matrix : Remove BILL_AMT2 - BILL_AMT6; Remove PAY_2 - PAY_6.


In [6]:
#features
features = rawData.iloc[:,[1,2,3,4,5,6,12,18,19,20,21,22,23]]
print('Summary of feature sample')
features.head()

Summary of feature sample


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,BILL_AMT1,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,1,24,2,3913,0,689,0,0,0,0
1,120000,2,2,2,26,-1,2682,0,1000,1000,1000,0,2000
2,90000,2,2,2,34,0,29239,1518,1500,1000,1000,1000,5000
3,50000,2,2,1,37,0,46990,2000,2019,1200,1100,1069,1000
4,50000,1,2,1,57,-1,8617,2000,36681,10000,9000,689,679


### Data Engineering
### Feature Selection: Recursive Feature Elimination.

In [7]:
# Train Three Models
modelSVR = SVR()
modelRF = RandomForestRegressor()
modelLR = LinearRegression()

In [8]:
# dependent variable
depVar = rawData['default payment next month']

In [9]:
# Build Training set and Testing set
X_train, X_test, y_train, y_test = train_test_split(features, depVar, test_size = .30, random_state = 123)

In [10]:
# Feature selection
from sklearn.feature_selection import RFE
selector = RFE(modelRF, 8, step=1)
selector = selector.fit(X_train, y_train)
selector

RFE(estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                    criterion='mse', max_depth=None,
                                    max_features='auto', max_leaf_nodes=None,
                                    max_samples=None, min_impurity_decrease=0.0,
                                    min_impurity_split=None, min_samples_leaf=1,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0,
                                    n_estimators=100, n_jobs=None,
                                    oob_score=False, random_state=None,
                                    verbose=0, warm_start=False),
    n_features_to_select=8, step=1, verbose=0)

In [11]:
selector.support_

array([ True, False, False, False,  True,  True,  True,  True,  True,
        True, False,  True, False])

### Six variables are used to train and test model

In [12]:
#features
features_select = features.iloc[:,[0,4,5,6,7,8]]
print('Summary of Selected Feature Sample')
features_select.head()

Summary of Selected Feature Sample


Unnamed: 0,LIMIT_BAL,AGE,PAY_0,BILL_AMT1,PAY_AMT1,PAY_AMT2
0,20000,24,2,3913,0,689
1,120000,26,-1,2682,0,1000
2,90000,34,0,29239,1518,1500
3,50000,37,0,46990,2000,2019
4,50000,57,-1,8617,2000,36681


# Dividing Data

In [13]:
# dependent variable
depVar = rawData['default payment next month']

In [14]:
# Build Training set and Testing set
X_train, X_test, y_train, y_test = train_test_split(features_select, depVar, test_size = .30, random_state = 123)

In [15]:
# Check X varialbes
X_train.head()

Unnamed: 0,LIMIT_BAL,AGE,PAY_0,BILL_AMT1,PAY_AMT1,PAY_AMT2
16395,320000,36,0,29026,5000,5018
21448,440000,30,-1,777,612,87426
20034,160000,44,-2,-18,0,0
25755,120000,30,0,101596,3706,5502
1438,50000,54,1,48153,0,1400


In [16]:
# Check Y variable
depVar.head()

0    1
1    1
2    0
3    0
4    0
Name: default payment next month, dtype: int64

In [17]:
# Check y_train variable
y_train.head()

16395    0
21448    0
20034    0
25755    0
1438     0
Name: default payment next month, dtype: int64

# Train Models

In [18]:
# Train Three Models
modelSVR = SVR()
modelRF = RandomForestRegressor()
modelLR = LinearRegression()

## Train Random Forest

In [23]:
# Model Fitting: Random Forest
modelRF.fit(X_train,y_train)
print(cross_val_score(modelRF, X_train, y_train))
modelRF.score(X_train,y_train)

[0.11532994 0.12596013 0.1529117  0.14405758 0.10842413]


0.8677062717418731

In [18]:
# Get Random Forest Parameters
modelRF.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [24]:
# Tune RF parameters
modelRF_1 = RandomForestRegressor(max_depth = 70, n_estimators = 400)
# Model Fitting: Random Forest
modelRF_1.fit(X_train,y_train)
print(cross_val_score(modelRF_1, X_train, y_train))
modelRF_1.score(X_train,y_train)

[0.1240066  0.13089    0.15693061 0.15430181 0.11678945]


0.8715808689334226

#### Tune RF improved

# Train SVM

In [20]:
# Model Fitting: SVM
modelSVR.fit(X_train,y_train)
print(cross_val_score(modelSVR, X_train, y_train)) 
modelSVR.score(X_train,y_train)

[-0.08494018 -0.08096492 -0.09337933 -0.08696574 -0.09168185]


-0.08721474928716955

In [21]:
# Get parameters for SVM
modelSVR.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [None]:
# Tune parameters for SVM
# modelSVR_1 = SVR(C=1, kernel='linear')
# modelSVR_1.fit(X_train,y_train)
# print(cross_val_score(modelSVR_1, X_train, y_train)) 
# modelSVR_1.score(X_train,y_train)

# Train Linear Regression

In [19]:
# Model fitting: Linear Regression
modelLR.fit(X_train,y_train)
print(cross_val_score(modelLR, X_train, y_train)) 
modelLR.score(X_train,y_train)

[0.09976121 0.11108866 0.12309312 0.12807362 0.11290039]


0.11590645284128953

In [20]:
# Get Linear Regression Parameters
modelLR.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [21]:
# Tune Parameters: linear Regression
modelLR_1 = LinearRegression(fit_intercept= False, n_jobs=10)
# Model fitting: Linear Regression
modelLR_1.fit(X_train,y_train)
print(cross_val_score(modelLR_1, X_train, y_train)) 
modelLR_1.score(X_train,y_train)

[0.08182093 0.10207497 0.10780225 0.10710621 0.09349677]


0.09917222539557646

#### Tune Linear Regression model did not improve the results.

# Make Predictions

In [25]:
# Make Predictions
# Random Forest Model is chosen
predictions = modelRF_1.predict(X_test)
predRsquared = r2_score(y_test,predictions)
rmse = sqrt(mean_squared_error(y_test, predictions))
print('R Squared: %.3f' % predRsquared)
print('RMSE: %.3f' % rmse)

R Squared: 0.136
RMSE: 0.383


In [26]:
# Predicitons Statistics
from scipy import stats
stats.describe(predictions)

DescribeResult(nobs=9000, minmax=(0.0, 1.0), mean=0.23942967038362875, variance=0.04828935671784289, skewness=1.36138779304049, kurtosis=1.0961254303293542)

In [None]:
# Ground Truth Statistics
stats.describe(y_test)

In [None]:
# Plot Ground Truth and Prediction
plt.scatter(y_test, predictions, alpha = 0.5)
plt.xlabel('Ground Truth')
plt.ylabel('Predictions')
plt.show();

# Train some additional models: MLP and Logistic

In [None]:
# # MLP 
# modelMLP = MLPClassifier()

In [None]:
# # Model Fitting: MLP
# modelMLP.fit(X_train,y_train)
# print(cross_val_score(modelMLP, X_train, y_train)) 
# modelMLP.score(X_train,y_train)

In [None]:
# Make Predictions
# predictions_MLP = modelMLP.predict_proba(X_test)[:,1]
# predRsquared = r2_score(y_test,predictions_MLP)
# rmse = sqrt(mean_squared_error(y_test, predictions_MLP))
# print('R Squared: %.3f' % predRsquared)
# print('RMSE: %.3f' % rmse)

In [None]:
# predictions_MLP

In [None]:
# Logit Model
# from sklearn.linear_model import LogisticRegression
# modelLG = LogisticRegression()
# modelLG.fit(X_train,y_train)
# print(cross_val_score(modelLG, X_train, y_train)) 
# modelLG.score(X_train,y_train)

In [None]:
# # Make Predictions
# predictions_LG = modelLG.predict_proba(X_test)[:,1]
# predRsquared = r2_score(y_test,predictions_LG)
# rmse = sqrt(mean_squared_error(y_test, predictions_LG))
# print('R Squared: %.3f' % predRsquared)
# print('RMSE: %.3f' % rmse)