In [29]:
# Import pandas
import pandas as pd

# Column names
col_names = ["Gender","Age","Debt","Married","BankCustomer","EducationLevel","Ethnicity","YearsEmployed","PriorDefault","Employed","CreditScore", "DriversLicense", "Citizen", "ZipCode", "Income" , "ApprovalStatus"]

# Load dataset
cc_apps = pd.read_csv("datasets/crx.data", header=None, names=col_names);

# Inspect data
cc_apps.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [30]:
summary = cc_apps.describe()
summary = summary.transpose()
summary.head()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Debt,690.0,4.758725,4.978163,0.0,1.0,2.75,7.2075,28.0
YearsEmployed,690.0,2.223406,3.346513,0.0,0.165,1.0,2.625,28.5
CreditScore,690.0,2.4,4.86294,0.0,0.0,0.0,3.0,67.0
Income,690.0,1017.385507,5210.102598,0.0,0.0,5.0,395.5,100000.0


In [18]:
# Import numpy
import numpy as np 

# Replace the '?'s with NaN
cc_apps = cc_apps.replace("?", np.nan)

# Inspect the missing values again
cc_apps.tail(7)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
683,b,36.42,0.75,y,p,d,v,0.585,f,f,0,f,g,240,3,-
684,b,40.58,3.29,u,g,m,v,3.5,f,f,0,t,s,400,0,-
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260,0,-
686,a,22.67,0.75,u,g,c,v,2.0,f,t,2,t,g,200,394,-
687,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1,t,g,200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280,750,-
689,b,35.0,3.375,u,g,c,h,8.29,f,f,0,t,g,0,0,-


In [19]:
# Impute the missing values with mean imputation
cc_apps.fillna(cc_apps.mean(), inplace=True)

# Count the number of NaNs in the dataset to verify
cc_apps.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
ApprovalStatus     0
dtype: int64

In [20]:
# Iterate over each column of cc_apps
for col in cc_apps:
    # Check if the column is of object type
    if cc_apps[col].dtypes == 'object':
        # Impute with the most frequent value
        cc_apps = cc_apps.fillna(cc_apps[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
cc_apps.isnull().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
ApprovalStatus    0
dtype: int64

In [21]:
# Import LabelEncoder
from sklearn import preprocessing
    
# Instantiate LabelEncoder
le = preprocessing.LabelEncoder()
# Iterate over all the values of each column and extract their dtypes
for col in cc_apps:
    # Compare if the dtype is object
    if cc_apps[col].dtypes == 'object':
    # Use LabelEncoder to do the numeric transformation
        cc_apps[col]=le.fit_transform(cc_apps[col])
        
cc_apps.tail(5)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
685,1,52,10.085,3,3,5,4,1.25,0,0,0,0,0,90,0,1
686,0,71,0.75,2,1,2,8,2.0,0,1,2,1,0,67,394,1
687,0,97,13.5,3,3,6,3,2.0,0,1,1,1,0,67,1,1
688,1,20,0.205,2,1,0,8,0.04,0,0,0,0,0,96,750,1
689,1,197,3.375,2,1,2,4,8.29,0,0,0,1,0,0,0,1


In [22]:
# Correlation matrix
corr_matrix = cc_apps.corr()
corr_matrix['ApprovalStatus'].sort_values(ascending=False)

ApprovalStatus    1.000000
Married           0.191431
BankCustomer      0.187520
Citizen           0.100867
ZipCode           0.094851
Gender            0.028934
Ethnicity        -0.000877
DriversLicense   -0.031625
EducationLevel   -0.130026
Age              -0.133304
Income           -0.175657
Debt             -0.206294
YearsEmployed    -0.322475
CreditScore      -0.406410
Employed         -0.458301
PriorDefault     -0.720407
Name: ApprovalStatus, dtype: float64

In [23]:
# Not really helpful
from pandas.plotting import scatter_matrix

attributes = ["ApprovalStatus", "Married", "BankCustomer","Citizen"]

scatter_matrix(cc_apps[attributes], figsize=(12, 8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11c041c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11c972c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11c99cf98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11c9ce550>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11c9f4ac8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11ca24080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11ca4d5f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11ca756a0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11ca756d8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11cacc198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11caf2710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11cb1ac88>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11cb4a240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11cb707b8>,
   