In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Set the matplotlib stylesheet.
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installations.
_WHITE_STYLE = 'seaborn-v0_8-white'
plt.style.use(_WHITE_STYLE if _WHITE_STYLE in plt.style.available else 'seaborn-white')

In [None]:
# Read the credit-card default workbook into a DataFrame.
DATA_PATH = "/content/default.xlsx"
ccdef = pd.read_excel(DATA_PATH)

In [None]:
# Peek at the first five rows of the raw frame
ccdef.head(n=5)

Unnamed: 0.1,Unnamed: 0,default,student,balance,income
0,1,No,No,729.526495,44361.625074
1,2,No,Yes,817.180407,12106.1347
2,3,No,No,1073.549164,31767.138947
3,4,No,No,529.250605,35704.493935
4,5,No,No,785.655883,38463.495879


In [None]:
# Dimensions of the raw frame as (rows, columns)
ccdef.shape

(10000, 5)

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage
ccdef.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  10000 non-null  int64  
 1   default     10000 non-null  object 
 2   student     10000 non-null  object 
 3   balance     10000 non-null  float64
 4   income      10000 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 390.8+ KB


In [None]:
# Number of missing entries in each column
ccdef.isna().sum()

Unnamed: 0    0
default       0
student       0
balance       0
income        0
dtype: int64

In [None]:
# Statistical analysis
# Display floats with two decimals. The fully qualified key is required:
# the bare 'precision' pattern also matches 'styler.format.precision' in
# recent pandas releases and raises OptionError because it is ambiguous.
pd.set_option('display.precision', 2)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
ccdef.describe()

Unnamed: 0.1,Unnamed: 0,balance,income
count,10000.0,10000.0,10000.0
mean,5000.5,835.37,33516.98
std,2886.9,483.71,13336.64
min,1.0,0.0,771.97
25%,2500.75,481.73,21340.46
50%,5000.5,823.64,34552.64
75%,7500.25,1166.31,43807.73
max,10000.0,2654.32,73554.23


In [None]:
# How many rows carry a balance of exactly zero?
ccdef['balance'].eq(0).sum()

499

In [None]:
# Frequency of each level of the categorical predictor `student`
ccdef['student'].value_counts()

No     7056
Yes    2944
Name: student, dtype: int64

In [None]:
# Frequency of each level of the response `default`
ccdef['default'].value_counts()

No     9667
Yes     333
Name: default, dtype: int64

In [None]:
# Encode the categorical variables as integer codes.
# factorize() numbers the labels in order of first appearance; the first row
# is No/No, so both columns map No -> 0 and Yes -> 1.
ccdef['default2'] = ccdef.default.factorize()[0]
# Fixed: student2 must be encoded from the `student` column. It was encoded
# from `default`, which made student2 an exact copy of the response.
ccdef['student2'] = ccdef.student.factorize()[0]
ccdef.head(3)

Unnamed: 0.1,Unnamed: 0,default,student,balance,income,default2,student2
0,1,No,No,729.53,44361.63,0,0
1,2,No,Yes,817.18,12106.13,0,0
2,3,No,No,1073.55,31767.14,0,0


In [None]:
# Graphical representation: keep every defaulter but only a 15 % random
# sample of the non-defaulters, so the scatter plot is not swamped by the
# majority class. random_state pins the sample so a re-run draws the same
# figure.
ccdef_dfno = ccdef[ccdef.default2 == 0].sample(frac=0.15, random_state=0)
ccdef_dfyes = ccdef[ccdef.default2 == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
ccdef_df = pd.concat([ccdef_dfno, ccdef_dfyes])

In [None]:
%matplotlib

Using matplotlib backend: agg


In [None]:
# Overview figure.
#   left  : income vs. balance on the down-sampled frame, split by default
#   right : box plots of balance and of income by default status (full data)
fig = plt.figure(figsize=(12, 5))
gs = mpl.gridspec.GridSpec(1, 4)
ax1 = plt.subplot(gs[0, :2])   # scatter spans the first two grid columns
ax2 = plt.subplot(gs[0, 2:3])
ax3 = plt.subplot(gs[0, 3:4])

defaulted = ccdef_df[ccdef_df.default == 'Yes']
not_defaulted = ccdef_df[ccdef_df.default == 'No']
ax1.scatter(defaulted.balance, defaulted.income, s=40, c='orange', marker='+', linewidths=1)
# linewidths must be numeric (it was the string '1').
ax1.scatter(not_defaulted.balance, not_defaulted.income, s=40, marker='o', linewidths=1,
            edgecolors='lightblue', facecolors='white', alpha=.6)
ax1.set_ylim(ymin=0)
ax1.set_ylabel('Income')
ax1.set_xlim(xmin=-100)
ax1.set_xlabel('Balance')

c_palette = {'No': 'lightblue', 'Yes': 'orange'}
# seaborn >= 0.12 accepts the plotted variables only as keywords (x=, y=).
sns.boxplot(x='default', y='balance', data=ccdef, orient='v', ax=ax2, palette=c_palette)
sns.boxplot(x='default', y='income', data=ccdef, orient='v', ax=ax3, palette=c_palette)
gs.tight_layout(fig)



In [None]:
# Inputs for the single-predictor model: the encoded response, balance as an
# (n, 1) column, and a step-1 grid over the observed balance range on which
# the fitted curve is evaluated.
y = ccdef['default2']
X_train = ccdef['balance'].to_numpy().reshape(-1, 1)
X_test = np.arange(ccdef['balance'].min(), ccdef['balance'].max())[:, np.newaxis]

In [None]:
# Logistic regression of default on balance with scikit-learn
from sklearn import linear_model as skl_lm

# fit() returns the estimator itself, so construction and fitting can be chained.
clf = skl_lm.LogisticRegression(solver='newton-cg').fit(X_train, y)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Compare a straight-line fit with the logistic fit of default on balance.
#   left  : least-squares line through the 0/1 default indicator
#   right : logistic model, probability of class 1 evaluated on the balance grid
prob = clf.predict_proba(X_test)  # the duplicated second call was removed

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# seaborn >= 0.12 accepts the plotted variables only as keywords (x=, y=).
sns.regplot(x=ccdef.balance, y=ccdef.default2, order=1, ci=None,
            scatter_kws={'color': 'orange'},
            line_kws={'color': 'lightblue', 'lw': 2}, ax=ax1)
ax2.scatter(X_train, y, color='orange')
ax2.plot(X_test, prob[:, 1], color='lightblue')

for ax in fig.axes:
    # Dashed guides at probability 1 and 0 across the plotted data range.
    x_lo, x_hi = ax.xaxis.get_data_interval()
    for level in (1, 0):
        ax.hlines(level, xmin=x_lo, xmax=x_hi, linestyles='dashed', lw=1)
    ax.set_ylabel('Probability of default')
    ax.set_xlabel('Balance')
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.])
    ax.set_xlim(xmin=-100)



In [None]:
# Report the fitted estimator followed by its learned parameters
print(clf)
for caption, value in (
    ('classes: ', clf.classes_),
    ('coefficients: ', clf.coef_),
    ('intercept :', clf.intercept_),
):
    print(caption, value)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)
classes:  [0 1]
coefficients:  [[0.00549892]]
intercept : [-10.65132973]


In [None]:
# Logistic regression of default on balance with statsmodels
import statsmodels.api as sm
import statsmodels.discrete.discrete_model as sms

# Fully qualified option key: the bare 'precision' pattern is ambiguous in
# recent pandas releases and raises OptionError.
pd.set_option('display.precision', 6)
# add_constant prepends the intercept column (reported as `const`).
X_train = sm.add_constant(ccdef.balance)
# Series.ravel() is deprecated in pandas >= 2.2; to_numpy() yields the same
# 1-D array of responses.
est = sm.Logit(y.to_numpy(), X_train).fit()

  import pandas.util.testing as tm


Optimization terminated successfully.
         Current function value: 0.079823
         Iterations 10


In [None]:
# Coefficient table (estimate, std. error, z, p-value, CI) of the current fit
coef_table = est.summary2().tables[1]
coef_table

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-10.651331,0.361169,-29.491287,3.723665e-191,-11.359208,-9.943453
balance,0.005499,0.00022,24.952404,2.010855e-137,0.005067,0.005931


In [None]:
# Preview the design matrix (intercept + student dummy).
# It is built inline because `x_train` is only assigned in a later cell, so
# `x_train.head()` raised NameError on a fresh Restart & Run All.
sm.add_constant(ccdef.student2).head()

Unnamed: 0,const,student2
0,1.0,0
1,1.0,0
2,1.0,0
3,1.0,0
4,1.0,0


In [None]:
# Logistic regression of default on the student dummy variable (statsmodels).
# The dummy is taken straight from the `student` labels (No -> 0, Yes -> 1)
# so the fit does not depend on how a helper column was encoded upstream.
student_dummy = (ccdef.student == 'Yes').astype(int).rename('student2')
x_train = sm.add_constant(student_dummy)

y = ccdef.default2

# Fixed: fit on x_train, the design matrix built above. The call used
# X_train (the balance design from the earlier model), so this cell silently
# re-estimated the balance-only model.
est = sms.Logit(y, x_train).fit()

Optimization terminated successfully.
         Current function value: 0.079823
         Iterations 10


In [None]:
# Plain-text coefficient table of the most recent statsmodels fit
coef_table = est.summary().tables[1]
print(coef_table.as_text())

                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -10.6513      0.361    -29.491      0.000     -11.359      -9.943
balance        0.0055      0.000     24.952      0.000       0.005       0.006


In [None]:
# Multiple logistic regression: default on balance, income and student status.
# The student dummy is taken straight from the `student` labels
# (No -> 0, Yes -> 1) so it cannot accidentally mirror the response.
predictors = ccdef[['balance', 'income']].assign(
    student2=(ccdef.student == 'Yes').astype(int)
)
x_train = sm.add_constant(predictors)

y = ccdef.default2

# Fixed: fit on x_train, the three-predictor design built above. The call
# used X_train (the balance-only design), so the "multiple" regression
# reported only `const` and `balance`.
est = sms.Logit(y, x_train).fit()

Optimization terminated successfully.
         Current function value: 0.079823
         Iterations 10


In [None]:
# Coefficient table of the most recent statsmodels fit
coef_table = est.summary().tables[1]
print(coef_table)

                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -10.6513      0.361    -29.491      0.000     -11.359      -9.943
balance        0.0055      0.000     24.952      0.000       0.005       0.006


In [None]:
# Confounding
# Balance (as an (n, 1) column) and encoded default for students only
is_student = ccdef['student'] == 'Yes'
x_train = ccdef.loc[is_student, 'balance'].to_numpy().reshape(-1, 1)
y = ccdef.loc[is_student, 'default2']

In [None]:
# Balance (as an (n, 1) column) and encoded default for non-students only
is_non_student = ccdef['student'] == 'No'
x_train2 = ccdef.loc[is_non_student, 'balance'].to_numpy().reshape(-1, 1)
y2 = ccdef.loc[is_non_student, 'default2']

In [None]:
# Step-1 grid over the observed balance range, shaped as an (n, 1) column
x_test = np.arange(ccdef['balance'].min(), ccdef['balance'].max())[:, np.newaxis]
x_test

array([[0.000e+00],
       [1.000e+00],
       [2.000e+00],
       ...,
       [2.652e+03],
       [2.653e+03],
       [2.654e+03]])

In [None]:
# One logistic model per group; this cell fits the student model (clf),
# the non-student model (clf2) is fitted in the next cell.
clf, clf2 = (skl_lm.LogisticRegression(solver='newton-cg') for _ in range(2))
clf.fit(x_train, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Fit the second model on the non-student vectors
clf2.fit(X=x_train2, y=y2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Class probabilities of both group models on the shared balance grid
prob, prob2 = (model.predict_proba(x_test) for model in (clf, clf2))
# Cross-tabulated row counts: student status (rows) by default status (columns)
group_sizes = ccdef.groupby(['student', 'default']).size()
group_sizes.unstack('default')

default,No,Yes
student,Unnamed: 1_level_1,Unnamed: 2_level_1
No,6850,206
Yes,2817,127


In [None]:
# Confounding figure.
#   left  : fitted default probability vs. balance for each group, with each
#           group's overall default rate as a dashed horizontal line
#   right : balance distribution by student status
#
# Overall default rate = defaulters / all members of the group, computed from
# the data. The previous hard-coded 127/2817 and 206/6850 divided by the
# number of NON-defaulters, which is the odds rather than the rate.
default_rate = (ccdef.default == 'Yes').groupby(ccdef.student).mean()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.plot(x_test, prob[:, 1], color='orange', label='Student')
ax1.plot(x_test, prob2[:, 1], color='lightblue', label='Non-Student')
x_lo, x_hi = ax1.xaxis.get_data_interval()
ax1.hlines(default_rate['Yes'], colors='orange', label='Overall student',
           xmin=x_lo, xmax=x_hi, linestyles='dashed')
ax1.hlines(default_rate['No'], colors='lightblue', label='Overall Non-student',
           xmin=x_lo, xmax=x_hi, linestyles='dashed')
ax1.set_ylabel('Default Rate')
ax1.set_xlabel('Credit Card Balance')
ax1.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.])
ax1.set_xlim(450, 2500)
ax1.legend(loc=2)
# seaborn >= 0.12 accepts the plotted variables only as keywords (x=, y=).
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.boxplot(x='student', y='balance', data=ccdef, orient='v', ax=ax2, palette=c_palette);



<matplotlib.axes._subplots.AxesSubplot at 0x7f463674bc90>

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis on balance, income and the student code;
# predictions are made on the same rows the model was fitted on.
x = ccdef[['balance', 'income', 'student2']]
y = ccdef.default2
lda = LinearDiscriminantAnalysis()
lda.fit(x, y)
y_pred = lda.predict(x)
print(y_pred)

[0 0 0 ... 0 0 0]


In [None]:
# Side-by-side frame of the observed and the LDA-predicted default codes
status_columns = {
    'True default status': y,
    'Predicted default status': y_pred,
}
ccdef_ccdef = pd.DataFrame(status_columns)
ccdef_ccdef

Unnamed: 0,True default status,Predicted default status
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,0,0


In [None]:
# Confusion table: predicted status in rows, true status in columns
pair_counts = ccdef_ccdef.groupby(['Predicted default status', 'True default status']).size()
pair_counts.unstack('True default status')

True default status,0,1
Predicted default status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9647,256
1,20,77


In [None]:
# Confusion table with a custom decision threshold: flag a default whenever
# the predicted probability of class 1 exceeds `decision_prob`.
decision_prob = 0.2
y_prob = lda.fit(x, y).predict_proba(x)
# Build the 'No'/'Yes' labels explicitly. The previous in-place replace()
# used the string keys 'True'/'False', which never match boolean values, so
# the labelling depended on booleans being matched by the integer keys 0/1.
ccdef_ccdef = pd.DataFrame({
    'True default status': y.map({0: 'No', 1: 'Yes'}),
    'Predicted default status': np.where(y_prob[:, 1] > decision_prob, 'Yes', 'No'),
})
ccdef_ccdef.groupby(['Predicted default status', 'True default status']).size().unstack('True default status')

True default status,No,Yes
Predicted default status,Unnamed: 1_level_1,Unnamed: 2_level_1
No,9427,144
Yes,240,189
