In [None]:
# Importing Packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
     

# Read the credit-card CSV from its raw GitHub location into a DataFrame
DATA_URL = 'https://raw.githubusercontent.com/sautrikc/-Credit-Card-Fraud-Detection/main/UCI_Credit_Card.csv'
cred_data = pd.read_csv(DATA_URL)
     

# First look at the data. Only the last expression of a cell is echoed by the
# notebook, so the intermediate views are printed explicitly.
print(cred_data.head())
print(cred_data.shape)
cred_data.info()

# Category 0,5,6 are undocumented so needed to be checked
print(cred_data['EDUCATION'].value_counts())
# Pass the column by keyword: a bare positional Series is interpreted as
# `data` by newer seaborn releases. plt.show() closes the figure so the
# MARRIAGE plot below is not drawn on top of this one.
sns.countplot(x='EDUCATION', data=cred_data)
plt.show()

print(cred_data['MARRIAGE'].value_counts())
sns.countplot(x='MARRIAGE', data=cred_data)
plt.show()

# Some of the age values are more than 70 which is fine.
cred_data['AGE'].value_counts()


In [None]:
# Show the column labels of the loaded frame (echoed as the cell output)
cred_data.columns


In [None]:
# The PAY_* columns contain the undocumented categories -2 and 0.
# Category -1 is documented as "pay duly" (paid properly on time),
# so -2, 0 and -1 can probably be treated as one category - to be checked.
for status_col in ('PAY_0', 'PAY_2'):
    print(cred_data[status_col].value_counts())

In [None]:
# BILL_AMT1 has about 1% negative values and BILL_AMT2..BILL_AMT6 about 2%;
# these can be read as over-payments.
# The top 1% of the bill amounts look like outliers - to be cleaned.
bill_cols = [f'BILL_AMT{month}' for month in range(1, 7)]
bill_probs = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.09, 0.1,
              0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.93, 0.94, 0.97, 0.99, 1]
cred_data[bill_cols].quantile(bill_probs)

In [None]:
# Box plot of the six monthly bill amounts
cred_data[[f'BILL_AMT{month}' for month in range(1, 7)]].plot.box()


In [None]:
# Pay amount has 1% of extreme outlier needed to be cleaned
# (the original statement was cut off mid-list; the quantile grid is completed
# with the same tail used for LIMIT_BAL further down)
cred_data[['PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']].quantile(
    [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.93,0.94,0.97,0.99,1])

In [None]:
# The extreme outliers squash the boxes of this plot almost flat
cred_data[[f'PAY_AMT{month}' for month in range(1, 7)]].plot.box()

In [None]:
# LIMIT_BAL has about 1% extreme outliers - these require cleaning
limit_probs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.93, 0.94, 0.97, 0.99, 1]
cred_data['LIMIT_BAL'].quantile(q=limit_probs)

In [None]:
# Vertical box plot of the credit limit
sns.boxplot(data=cred_data, y='LIMIT_BAL')

In [None]:
# EDUCATION categories 5, 6 and 0 are undocumented, so fold them into
# category 4, which is documented as "others".
undocumented_edu = cred_data['EDUCATION'].isin([5, 6, 0])
cred_data.loc[undocumented_edu, 'EDUCATION'] = 4
cred_data['EDUCATION'].value_counts()

In [None]:
# MARRIAGE category 0 is undocumented, so fold it into category 3 ("others").
undocumented_marriage = cred_data['MARRIAGE'].eq(0)
cred_data.loc[undocumented_marriage, 'MARRIAGE'] = 3
cred_data['MARRIAGE'].value_counts()
     

In [None]:
# How often each PAY_0 category occurs
cred_data['PAY_0'].value_counts()

In [None]:
# Counts of PAY_2 category versus the target
cross_cred_data2 = pd.crosstab(index=cred_data['PAY_2'],
                               columns=cred_data['default.payment.next.month'])

# Row percentages: every row is divided by its own total
row_totals = cross_cred_data2.sum(axis=1)
cross_cred_data2_percent = cross_cred_data2.div(row_totals, axis=0)
cross_cred_data2_percent.round(2)


In [None]:
# Categories -2, -1 and 0 are merged into the single category 0 ("pay duly")
# in every repayment-status column; the merge follows the target-variable
# rates of these categories being close to each other.
for pay_col in ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
    paid_duly = cred_data[pay_col].isin([-2, -1, 0])
    cred_data.loc[paid_duly, pay_col] = 0

In [None]:
# Renaming PAY_0 as PAY_1 and default.payment.next.month as DEF_PAY
cred_data = cred_data.rename(columns={'PAY_0':'PAY_1','default.payment.next.month':'DEF_PAY'})
# Use the fully qualified option name: the bare pattern 'max_columns' also
# matches 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns',None)
cred_data.head()
     

In [None]:
# Quantiles of the bill amounts, re-checked before the outlier treatment.
# (the original statement was cut off mid-list; the grid is completed with the
# same tail as the earlier BILL_AMT quantile cell)
cred_data[['BILL_AMT1','BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']].quantile(
    [0,0.01,0.02,0.03,0.04,0.05,0.08,0.09,0.1,0.2,0.3,0.4,
     0.5,0.6,0.7,0.8,0.9,0.93,0.94,0.97,0.99,1])

In [None]:
# The top 1% of every bill-amount column is replaced by that column's median.
# Each cut-off is the hard-coded value above which a bill counts as an outlier.
bill_upper_caps = {
    'BILL_AMT1': 350110.68,
    'BILL_AMT2': 337495.28,
    'BILL_AMT3': 325030.39,
    'BILL_AMT4': 304997.27,
    'BILL_AMT5': 285868.33,
    'BILL_AMT6': 279505.06,
}
for bill_col, cap in bill_upper_caps.items():
    # median is taken before the assignment, i.e. with the outliers still present
    col_median = cred_data[bill_col].median()
    cred_data.loc[cred_data[bill_col] > cap, bill_col] = col_median

cred_data[list(bill_upper_caps)].plot.box()

In [None]:
# Treating 1% negative outlier in BILL_AMT1 and 2% negative outlier in the rest BILL_AMT
# by imputation based on the median value of the negative values of the columns.
# Each column uses the median of ITS OWN negative values; the previous version
# imputed BILL_AMT2..BILL_AMT6 with the negative median of BILL_AMT1.
negative_bill_cols = ['BILL_AMT1','BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
for bill_col in negative_bill_cols:
    is_negative = cred_data[bill_col] < 0
    # no-op when a column has no negative values (empty mask)
    cred_data.loc[is_negative, bill_col] = cred_data.loc[is_negative, bill_col].median()

cred_data[negative_bill_cols].plot(kind='box')

In [None]:
# Payments above the hard-coded cut-off of a column (its extreme ~1%) are
# replaced by the median of that column.
pay_amt_caps = {
    'PAY_AMT1': 67000,
    'PAY_AMT2': 76700,
    'PAY_AMT3': 70000,
    'PAY_AMT4': 67100,
    'PAY_AMT5': 65700,
    'PAY_AMT6': 82700,
}
for amt_col, cap in pay_amt_caps.items():
    amt_median = cred_data[amt_col].median()
    cred_data.loc[cred_data[amt_col] > cap, amt_col] = amt_median

In [None]:
# Box plot of the payment amounts after the outlier treatment
cred_data[[f'PAY_AMT{month}' for month in range(1, 7)]].plot.box()


In [None]:
# Credit limits above 500000 (the extreme ~1%) are replaced by the column median
limit_median = cred_data['LIMIT_BAL'].median()
above_cap = cred_data['LIMIT_BAL'] > 500000
cred_data.loc[above_cap, 'LIMIT_BAL'] = limit_median

In [None]:
# Box plot of the credit limit after the outlier treatment
sns.boxplot(data=cred_data, y='LIMIT_BAL')


In [None]:
# Work on an independent (deep) copy of the cleaned frame from here on;
# DataFrame.copy() is deep by default.
cred_data_new = cred_data.copy()
cred_data_new.head()

In [None]:
# X holds every column except the target; y is the target vector DEF_PAY
y = cred_data_new['DEF_PAY']
X = cred_data_new.drop(columns='DEF_PAY')

# Importing the package
from sklearn.model_selection import train_test_split

# 80% train / 20% test, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50)

# Report the shape of every part of the split
for part_name, part in (('X_train', X_train), ('X_test', X_test),
                        ('y_train', y_train), ('y_test', y_test)):
    print(part_name, part.shape)

In [None]:
# Importing the Library
from sklearn.linear_model import LogisticRegression

# Multiple logistic regression on all features; up to 2000 solver iterations
logistic = LogisticRegression(max_iter=2000)
logistic.fit(X=X_train, y=y_train)
     

In [None]:
# Fitted coefficients first, then the intercept
for fitted_param in (logistic.coef_, logistic.intercept_):
    print(fitted_param)

In [None]:
# Predicted class labels for the held-out test features
pred = logistic.predict(X=X_test)
pred
     

In [None]:
# Confusion matrix of true test labels against the predicted labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm
   

In [None]:
# Accuracy = correctly classified rows (the two diagonal cells) / all rows
total = cm.sum()
correct = cm[0, 0] + cm[1, 1]
accuracy = correct / total
round(accuracy * 100, 3)

In [None]:
# Importing the package
import statsmodels.formula.api as sm
# Creating function for calculating VIF
def vif_cal(input_data):
    """Print the variance inflation factor (VIF) of every column of ``input_data``.

    Each column is regressed by ordinary least squares (with an intercept) on
    all the remaining columns, and ``VIF = 1 / (1 - R^2)`` is printed rounded
    to two decimals as ``<column>  VIF =  <value>``.

    The regression is solved with numpy (``np`` from the notebook's import
    cell) instead of the module-level ``sm`` alias: that alias is rebound to a
    different statsmodels module later in the notebook, and the formula
    ``"y~x"`` depended on patsy finding the local variables ``y`` and ``x``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Numeric feature columns. Assumes no missing values - TODO confirm.

    Returns
    -------
    None
        Results are printed, one line per column. A constant column prints
        ``nan``; a column that is an exact linear combination of the others
        prints ``inf``.
    """
    x_vars = input_data
    xvar_names = x_vars.columns
    values = x_vars.to_numpy(dtype=float)
    intercept = np.ones(values.shape[0])
    for i in range(0, xvar_names.shape[0]):
        y = values[:, i]
        # design matrix: intercept plus every column except the i-th
        design = np.column_stack([intercept, np.delete(values, i, axis=1)])
        beta = np.linalg.lstsq(design, y, rcond=None)[0]
        resid = y - design @ beta
        ss_res = float(resid @ resid)
        centered = y - y.mean()
        ss_tot = float(centered @ centered)
        if ss_tot == 0:
            # constant column: R^2 (and therefore VIF) is undefined
            vif = float('nan')
        else:
            rsq = 1 - ss_res / ss_tot
            # perfect collinearity would divide by zero
            vif = round(1 / (1 - rsq), 2) if rsq < 1 else float('inf')
        print (xvar_names[i], " VIF = " , vif)

In [None]:
# VIF of every feature in the training set
vif_cal(X_train)

In [None]:
# Features with a high VIF are dropped one at a time; first BILL_AMT2
vif_cal(X_train.drop(columns='BILL_AMT2'))

In [None]:
# Next, BILL_AMT5 is dropped as well
vif_cal(X_train.drop(columns=['BILL_AMT2', 'BILL_AMT5']))

In [None]:
# Then BILL_AMT3
vif_cal(X_train.drop(columns=['BILL_AMT2', 'BILL_AMT5', 'BILL_AMT3']))

In [None]:
# With BILL_AMT4 removed too, no high VIF appears to remain, i.e. the
# remaining features show no strong interdependency
high_vif_cols = ['BILL_AMT2', 'BILL_AMT5', 'BILL_AMT3', 'BILL_AMT4']
vif_cal(X_train.drop(columns=high_vif_cols))

In [None]:
# statsmodels Logit on the full X / y (not the train split), to get a
# per-feature summary table
import statsmodels.discrete.discrete_model as sm
m = sm.Logit(endog=y, exog=X)
Res = m.fit()
print(Res.summary())
     

In [None]:
# Dropping the feature that are not required.
# X is rebuilt from cred_data_new instead of being dropped in place, so this
# cell can be re-run without a KeyError; the resulting columns and their order
# are the same as dropping from the previous X.
X = cred_data_new.drop(columns=['DEF_PAY',
                                'BILL_AMT1', 'BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                                'PAY_2','ID'])

# Importing the statsmodel library
import statsmodels.discrete.discrete_model as sm
m=sm.Logit(y,X)
# Fitting feature to the model
Res=m.fit()
# Printing Summary
print(Res.summary())

In [None]:
# Re-split the reduced feature set: 80% train / 20% test, same seed as before
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50)

# Report the shape of every part of the split
for part_name, part in (('X_train', X_train), ('X_test', X_test),
                        ('y_train', y_train), ('y_test', y_test)):
    print(part_name, part.shape)

In [None]:
# Refit the existing logistic model on the reduced training features
logistic.fit(X=X_train, y=y_train)
# Predicted class labels for the reduced test features
pred = logistic.predict(X=X_test)
pred

In [None]:
# Creating confusion matrix (rows = true class, columns = predicted class,
# so for labels 0/1 the cells are [[TN, FP], [FN, TP]])
cm=confusion_matrix(y_test,pred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# Calculating Accuracy
total=cm.sum()
accuracy=(tn+tp)/total
print('Accuracy=',round(accuracy*100,3))

# Calculating Sensitivity: share of actual positives (label 1) predicted as 1.
# The previous formula used the label-0 row and therefore reported specificity.
Sensitivity=tp/(tp+fn)
print('Sensitivity-',round(Sensitivity*100,2))

# Calculating Specificity: share of actual negatives (label 0) predicted as 0
Specificity = tn/(tn+fp)
print('Specificity-',round(Specificity*100,2))

In [None]:
# Five largest squared test statistics of the fitted Logit model, rounded
squared_stats = (Res.tvalues ** 2).round()
squared_stats.sort_values(ascending=False).head(5)


In [None]:
# Model restricted to five hand-picked features
top_features = ['PAY_1', 'MARRIAGE', 'LIMIT_BAL', 'SEX', 'EDUCATION']
X1 = cred_data_new[top_features]
y1 = cred_data_new['DEF_PAY']

# 80/20 split with the same seed as the earlier splits
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=50)

# Refit the existing logistic model on the five features
logistic.fit(X=X1_train, y=y1_train)

# Predicted class labels for the test part
Pred2 = logistic.predict(X=X1_test)
Pred2

In [None]:
# Confusion Matrix and Accuracy of the five-feature model
cm1=confusion_matrix(y1_test,Pred2)
print(cm1)

# Accuracy must come from cm1; the previous version read the stale `cm` left
# behind by an earlier cell and so re-printed the earlier model's accuracy.
total=cm1.sum()
accuracy=(cm1[0,0]+cm1[1,1])/total
print('Accuracy=',round(accuracy*100,3))

In [None]:
# Take a fresh deep copy of the cleaned frame for the next round of analysis
# (DataFrame.copy() is deep by default)
cred_data_new = cred_data.copy()
cred_data_new.head()

In [None]:
# Target vector first, then every remaining column as the feature matrix
y = cred_data_new['DEF_PAY']
X = cred_data_new.drop(columns='DEF_PAY')

In [None]:
# statsmodels Logit fitted on the complete X / y; the summary table is printed
import statsmodels.discrete.discrete_model as sm
m = sm.Logit(endog=y, exog=X)
results = m.fit()
print(results.summary())

In [None]:
# Predict the target using the features. Called without arguments, so the
# predictions are for the same data the Logit model was fitted on (in-sample).
predict1=results.predict()

In [None]:

# Taking the threshold value 0.5 as it is logistic regression:
# predicted values below the threshold become class 0, the rest class 1
threshold=0.5
predictions1=[ 0 if x < threshold else 1 for x in predict1]

# Confusion Matrix and Accuracy (rows = true class, columns = predicted class,
# so for labels 0/1 the cells are [[TN, FP], [FN, TP]])
from sklearn.metrics import confusion_matrix,classification_report
cm=confusion_matrix(y,predictions1)
print(cm)
tn, fp, fn, tp = cm.ravel()

total=cm.sum()
accuracy=(tn+tp)/total
print('Accuracy=',round(accuracy*100,2))

# Calculating Sensitivity: share of actual positives (label 1) predicted as 1.
# The previous formula used the label-0 row and therefore reported specificity.
Sensitivity=tp/(tp+fn)
print('Sensitivity-',round(Sensitivity*100,2))

# Calculating Specificity: share of actual negatives (label 0) predicted as 0
Specificity = tn/(tn+fp)
print('Specificity-',round(Specificity*100,2))

In [None]:
# Target vector and feature matrix (every column except DEF_PAY)
y = cred_data_new['DEF_PAY']
X = cred_data_new.drop(columns='DEF_PAY')

# Importing the package
from sklearn.model_selection import train_test_split

# 80% train / 20% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=50)

# Importing the Library
from sklearn.linear_model import LogisticRegression

# Logistic regression with the newton-cg solver, at most 200 iterations
logistic = LogisticRegression(solver='newton-cg', max_iter=200)
logistic.fit(X=X_train, y=y_train)

# Predictions on the TRAINING features (in-sample)
pred = logistic.predict(X=X_train)
pred

In [None]:
# Confusion Matrix and Accuracy on the training data (rows = true class,
# columns = predicted class, so for labels 0/1 the cells are [[TN, FP], [FN, TP]])
from sklearn.metrics import confusion_matrix,classification_report
cm=confusion_matrix(y_train,pred)
print(cm)
tn, fp, fn, tp = cm.ravel()

total=cm.sum()
accuracy=(tn+tp)/total
print('Accuracy=',round(accuracy*100,2))

# Calculating Sensitivity: share of actual positives (label 1) predicted as 1.
# The previous formula used the label-0 row and therefore reported specificity.
Sensitivity=tp/(tp+fn)
print('Sensitivity-',round(Sensitivity*100,2))

# Calculating Specificity: share of actual negatives (label 0) predicted as 0
Specificity = tn/(tn+fp)
print('Specificity-',round(Specificity*100,2))

In [None]:
# NOTE(review): this cell used to read `cred_new_smote`, which is not defined
# anywhere in the notebook (NameError on a fresh kernel). X / y are taken from
# cred_data_new instead, which is also the data behind the X_train / y_train
# the model is actually fitted on. If an oversampled frame was intended, it
# has to be created in an earlier cell - confirm.
X=cred_data_new.drop('DEF_PAY',axis=1)
y=cred_data_new['DEF_PAY']

# Importing the package
from xgboost import XGBClassifier
import time

start_time = time.time()

# Early stopping needs a validation set to monitor; carve one out of the
# training data so the test set stays untouched. `train_test_split` is the
# function imported in the earlier split cells.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=50, stratify=y_train)

# Building XGB model for binary classification
model_new=XGBClassifier(n_estimators=100,max_depth=5,learning_rate=0.1,eval_metric='error',
                        early_stopping_rounds=4,tree_method='hist')

# Without eval_set, early_stopping_rounds has nothing to evaluate against
model_new.fit(X_fit,y_fit,eval_set=[(X_val,y_val)],verbose=False)

# Predictions for the whole training split (fit part + validation part)
predict2=model_new.predict(X_train)

print("Time taken by XGB "+ str((time.time() - start_time))+ " Seconds")

In [None]:
# Importing confusion_matrix
from sklearn.metrics import confusion_matrix

# Creating Confusion Matrix using the train data and the predicted value
# (rows = true class, columns = predicted class: [[TN, FP], [FN, TP]])
cm=confusion_matrix(y_train,predict2)
print(cm)
tn, fp, fn, tp = cm.ravel()

# Calculating the accuracy of the predicted value of the train data
total=cm.sum()
accuracy=(tn+tp)/total
print('Accuracy=',round(accuracy*100,2))

# Calculating Sensitivity: share of actual positives (label 1) predicted as 1.
# The previous formula used the label-0 row and therefore reported specificity.
Sensitivity=tp/(tp+fn)
print('Sensitivity-',round(Sensitivity*100,2))

# Calculating Specificity: share of actual negatives (label 0) predicted as 0
Specificity = tn/(tn+fp)
print('Specificity-',round(Specificity*100,2))

In [None]:
# Predicted class labels for the TEST features
predict3 = model_new.predict(X_test)

# Confusion matrix of the true test labels against those predictions
cm1 = confusion_matrix(y_true=y_test, y_pred=predict3)
print(cm1)

# Train accuracy: read from `cm`, the train confusion matrix computed in the
# preceding cell
total = cm.sum()
accuracy = (cm[0, 0] + cm[1, 1]) / total
print('Train Accuracy=', round(accuracy * 100, 2))

# Test accuracy from the matrix built above
total1 = cm1.sum()
accuracy1 = (cm1[0, 0] + cm1[1, 1]) / total1
print('Test Accuracy=', round(accuracy1 * 100, 2))

In [None]:
# Precision, recall and F1-score per class, computed on the training data
from sklearn.metrics import classification_report
train_report = classification_report(y_true=y_train, y_pred=predict2)
print(train_report)