In [1]:
# Import all the libraries which are needed for exploratory data analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# PCA Analysis
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
#Another Solution
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

In [2]:
# Show up to 100 rows when a DataFrame/Series is displayed.
# The full option name is spelled out: the abbreviated 'display.max_row' is
# only resolved through pandas' pattern matching and would become ambiguous
# if another option with a similar name ever existed.
pd.set_option('display.max_rows', 100)

In [3]:
# Show up to 50 columns when a DataFrame is displayed
# (this option limits the number of columns, not their width).
pd.options.display.max_columns = 50

In [4]:
# Load the credit-card data set from the CSV file in the working directory
ccdefault = pd.read_csv(filepath_or_buffer='UCI_Credit_Card.csv')

In [None]:
# Peek at the first three records
ccdefault.head(n=3)

In [None]:
# Metadata: print the frame summary produced by DataFrame.info()
# (column names, non-null counts and dtypes)
ccdefault.info()

In [None]:
# Descriptive statistics for the columns, as computed by DataFrame.describe()
ccdefault.describe()

In [None]:
# Dataset shape as a (rows, columns) tuple
ccdefault.shape

In [None]:
# Missing-value report: per column, the number of nulls and the share of
# rows they represent, shown as a two-row table (Total / Percent)
null_mask = ccdefault.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
pd.concat({'Total': total, 'Percent': percent}, axis=1).transpose()

In [None]:
# Give the target column a short name: 'default.payment.next.month' -> 'DEFAULT'
target_rename = {'default.payment.next.month': 'DEFAULT'}
ccdefault.rename(columns=target_rename, inplace=True)

In [None]:
# Distinct values present in the AGE column
ccdefault['AGE'].unique()

In [None]:
# Distinct values present in the DEFAULT (target) column
ccdefault['DEFAULT'].unique()

In [None]:
# Distinct credit limits, followed by how often each one occurs.
# display() is required for the first result: a notebook cell only renders
# its final expression automatically, so a bare .unique() on an earlier line
# would be computed and then thrown away.
display(ccdefault.LIMIT_BAL.unique())
ccdefault.LIMIT_BAL.value_counts()

In [None]:
# First basic visualization: line plot of LIMIT_BAL against the row index
limit_balance = ccdefault['LIMIT_BAL']
plt.plot(limit_balance)

References
https://towardsdatascience.com/data-visualization-using-matplotlib-16f1aae5ce70

In [None]:
# Re-check the first three records (the target column is now named DEFAULT)
ccdefault.head(n=3)

In [None]:
# Pairwise scatter/density grid for the ID, limit and demographic columns,
# coloured by the DEFAULT flag
demographic_cols = ["ID", "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE", "DEFAULT"]
sns.pairplot(data=ccdefault[demographic_cols], hue="DEFAULT", dropna=True)

In [None]:
# Pairwise grid for the PAY_* columns, coloured by the DEFAULT flag
pay_status_cols = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "DEFAULT"]
sns.pairplot(data=ccdefault[pay_status_cols], hue="DEFAULT", dropna=True)

In [None]:
# Pairwise grid for the BILL_AMT* columns, coloured by the DEFAULT flag
bill_amount_cols = ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "DEFAULT"]
sns.pairplot(data=ccdefault[bill_amount_cols], hue="DEFAULT", dropna=True)

In [None]:
# Pairwise grid for the PAY_AMT* columns, coloured by the DEFAULT flag
pay_amount_cols = ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "DEFAULT"]
sns.pairplot(data=ccdefault[pay_amount_cols], hue="DEFAULT", dropna=True)

In [None]:
# PairGrid is the lower-level grid that pairplot is built on.
# Here: KDE curves on the diagonal, 6-level KDE contours off the diagonal.
payment_amounts = ccdefault[["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]]
g = sns.PairGrid(data=payment_amounts)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels=6);

In [None]:
# Column labels of the frame (this is the frame's column Index, not a plain Python list)
col = ccdefault.columns       # .columns gives the column names of the data
print(col)

In [None]:
# y holds the labels (the DEFAULT column); the features go into x in a later cell
y = ccdefault['DEFAULT']

In [None]:
# Class balance of the target: one bar per DEFAULT value.
# The label vector is passed as `x=` explicitly: seaborn deprecated positional
# data vectors, and newer releases interpret the first positional argument as
# `data` rather than as the variable to count.
ax = sns.countplot(x=y, label="Count")

In [None]:
# Build the feature matrix x: every column except the row identifier (ID)
# and the target (DEFAULT).
# The column list gets a descriptive name instead of `list`, which would
# shadow Python's built-in list type for every cell executed afterwards.
non_feature_cols = ['ID', 'DEFAULT']
x = ccdefault.drop(columns=non_feature_cols)
x.head()

In [None]:
# Violin plot - first 12 feature columns of x, split by DEFAULT
data_dia = y
# z-score every feature so all distributions share one scale
data_n_2 = x.sub(x.mean()).div(x.std())
first_block = data_n_2.iloc[:, 0:12]
# long format: one row per (DEFAULT, feature name, standardized value)
data = pd.concat([y, first_block], axis=1).melt(
    id_vars="DEFAULT", var_name="features", value_name='value'
)
plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x="features", y="value", hue="DEFAULT", split=True, inner="quart")
plt.xticks(rotation=90)

In [None]:
# Violin plot - remaining standardized features (column positions 12..22,
# i.e. at most 11 columns), split by DEFAULT
second_block = data_n_2.iloc[:, 12:23]
# long format: one row per (DEFAULT, feature name, standardized value)
data = pd.concat([y, second_block], axis=1).melt(
    id_vars="DEFAULT", var_name="features", value_name='value'
)
plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x="features", y="value", hue="DEFAULT", split=True, inner="quart")
plt.xticks(rotation=90)

In [None]:
# Box plots of the six PAY_AMT* columns side by side
pay_amount_features = ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
plt.figure(figsize=(10, 10))
sns.boxplot(data=ccdefault[pay_amount_features])

In [None]:
# Box plots of the six BILL_AMT* columns side by side
bill_amount_features = ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]
plt.figure(figsize=(10, 10))
sns.boxplot(data=ccdefault[bill_amount_features])

In [None]:
# Box plots of the six PAY_* columns side by side
pay_status_features = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
plt.figure(figsize=(10, 10))
sns.boxplot(data=ccdefault[pay_status_features])

In [None]:
# Side demo on seaborn's bundled 'iris' example data (not the credit-card data):
# sepal length distribution per species
import seaborn as sns
df = sns.load_dataset('iris')
sns.boxplot(data=df, x="species", y="sepal_length")



In [None]:
# Box plot over the columns of `df`, the frame bound in an earlier cell
# (presumably the iris demo frame - verify the cell order before relying on it)
sns.boxplot(data=df)

In [None]:
# Swarm plot: AGE of every record grouped by SEX, with DEFAULT shown as
# separate (dodged) swarms
swarm_figsize = (12, 6)
plt.figure(figsize=swarm_figsize)
sns.swarmplot(data=ccdefault, x="SEX", y="AGE", hue='DEFAULT', dodge=True)

In [None]:
# Swarm plot: AGE grouped by MARRIAGE, DEFAULT shown as dodged swarms
swarm_figsize = (12, 6)
plt.figure(figsize=swarm_figsize)
sns.swarmplot(data=ccdefault, x="MARRIAGE", y="AGE", hue='DEFAULT', dodge=True)

In [None]:
# Swarm plot: AGE grouped by EDUCATION, DEFAULT shown as dodged swarms
swarm_figsize = (12, 6)
plt.figure(figsize=swarm_figsize)
sns.swarmplot(data=ccdefault, x="EDUCATION", y="AGE", hue='DEFAULT', dodge=True)

In [None]:
# Swarm plot: AGE grouped by each distinct LIMIT_BAL value, DEFAULT shown as
# dodged swarms
swarm_figsize = (12, 6)
plt.figure(figsize=swarm_figsize)
sns.swarmplot(data=ccdefault, x="LIMIT_BAL", y="AGE", hue='DEFAULT', dodge=True)

In [None]:
# Swarm plot restricted to records whose limit balance is greater than 500,000.
# NOTE(review): the variable names mention "75" although the threshold used
# is 500000 - confirm which cut-off was intended.
grt75filter = ccdefault['LIMIT_BAL'].gt(500000)
ccdefaultgt75k = ccdefault.loc[grt75filter]
plt.figure(figsize=(18, 6))
sns.swarmplot(data=ccdefaultgt75k, x="LIMIT_BAL", y="AGE", hue='DEFAULT', dodge=True)

In [None]:
# Bucket LIMIT_BAL into six ordered ranges and store the bucket number (1-6)
# in a new 'binned' column; `bins` holds the range edges, `labels` the bucket ids
bins = [0, 50000, 100000, 250000, 350000, 450000, 1000000]
labels = [1, 2, 3, 4, 5, 6]
ccdefault['binned'] = pd.cut(x=ccdefault['LIMIT_BAL'], bins=bins, labels=labels)

In [None]:
# Swarm plot: AGE grouped by the LIMIT_BAL bucket ('binned' column),
# DEFAULT shown as dodged swarms
plt.figure(figsize=(18, 6))
sns.swarmplot(data=ccdefault, x="binned", y="AGE", hue='DEFAULT', dodge=True)

Bins (LIMIT_BAL ranges; the upper bound of each range is inclusive):
    1: above 0, up to 50,000
    2: 50,001-100,000
    3: 100,001-250,000
    4: 250,001-350,000
    5: 350,001-450,000
    6: 450,001-1,000,000

In [None]:
# Build a pandas-profiling report for the whole frame and render it as the
# cell output
import pandas_profiling
profile_report = pandas_profiling.ProfileReport(ccdefault)
profile_report

In [None]:
# Data type of every column in the frame
ccdefault.dtypes

In [None]:
# Parallel-coordinates view of the demographic, limit, payment-status and
# payment-amount columns; each line is coloured by DEFAULT
import plotly.express as px
parallel_dims = [
    'SEX', 'AGE', 'MARRIAGE', 'EDUCATION', 'LIMIT_BAL',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
fig = px.parallel_coordinates(
    data_frame=ccdefault,
    dimensions=parallel_dims,
    color='DEFAULT',
    color_continuous_scale=px.colors.diverging.Tealrose,
)
fig.show()

In [None]:
# Parallel-coordinates view limited to the payment columns
# (PAY_* status and PAY_AMT* amounts); each line is coloured by DEFAULT
import plotly.express as px
payment_dims = [
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
fig = px.parallel_coordinates(
    data_frame=ccdefault,
    dimensions=payment_dims,
    color='DEFAULT',
    color_continuous_scale=px.colors.diverging.Tealrose,
)
fig.show()

Principal Component Analysis (PCA):
PCA identifies the combinations of features (components) that account for the most variance in the data. The initial analysis below is done to identify those components. In this case, the features are first rescaled with MinMaxScaler.

In [None]:
# PCA preparation: rescale every feature to the [0, 1] range.
# Changes compared with a positional `iloc[1:, 0:24]` selection:
#   * all rows are kept - the CSV header is already consumed by read_csv, so
#     skipping row 0 would silently discard the first record;
#   * the ID column (a row identifier) is excluded, as are the target and the
#     derived 'binned' column, so only genuine features enter the PCA.
#     errors='ignore' keeps the cell runnable whichever of these columns exist.
#   * feature_range is a tuple - recent scikit-learn releases validate this
#     parameter and reject a list.
# MinMaxScaler comes from the notebook's import cell.
pca_features = ccdefault.drop(
    columns=['ID', 'DEFAULT', 'default.payment.next.month', 'binned'],
    errors='ignore',
)
scaler = MinMaxScaler(feature_range=(0, 1))
data_rescaled = scaler.fit_transform(pca_features)

In [None]:
# Fit PCA with all components on the rescaled data
pca = PCA().fit(data_rescaled)

# Cumulative explained variance, converted from ratios (0-1) to percent so the
# values agree with the '%' in the axis label
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
# x positions start at 1 so each point reads as "using this many components"
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance_pct)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance (%)')
plt.title('Credit Card Dataset Explained Variance')
plt.show()

Based on this initial analysis, the plot tells us that by selecting 12 components we can preserve roughly 98.8% to 99% of the total variance of the data. This is a preliminary analysis.

Before fitting the model, we need to standardize the numerical features and create dummy variables for the categorical features.