Data set Source: https://archive.ics.uci.edu/ml/datasets/Credit+Approval

The descriptive features below are produced from the crx.names file:

A1:b, a.
A2:continuous.
A3:continuous.
A4:u, y, l, t.
A5:g, p, gg.
A6:c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
A7:v, h, bb, j, n, z, dd, ff, o.
A8:continuous.
A9:t, f.
A10:t, f.
A11:continuous.
A12:t, f.
A13:g, p, s.
A14:continuous.
A15:continuous.
A16: +,-    # (class attribute)

In [1]:
# Import the necessary modules and load the data


import numpy as np
import pandas as pd

import os, ssl
# Switch the default HTTPS context to the unverified one unless the
# PYTHONHTTPSVERIFY environment variable is set to a non-empty value.
# NOTE(review): this disables certificate verification for every later HTTPS
# request made by this kernel, while the data below is read from a local
# file - confirm the override is still needed.
verification_requested = bool(os.environ.get('PYTHONHTTPSVERIFY', ''))
unverified_factory = getattr(ssl, '_create_unverified_context', None)
if unverified_factory and not verification_requested:
    ssl._create_default_https_context = unverified_factory


# Load the raw credit-approval records; the sixteen column names A1..A16 are
# supplied explicitly through `names`.
attribute_names = ['A{}'.format(i) for i in range(1, 17)]
uci_df = pd.read_csv('Assignment1_Q1_crx.data', sep=',', names=attribute_names)

# (rows, columns) of the loaded frame
uci_df.shape

(690, 16)

In [2]:
# Preview the first rows of the raw frame
uci_df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [3]:
# Inspect the dtype pandas inferred for each column
uci_df.dtypes


A1      object
A2      object
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14     object
A15      int64
A16     object
dtype: object

In [4]:
# Summarise the string-typed (object) columns: count, unique, top, freq.
# The builtin `object` is used because the `np.object` alias was removed in
# NumPy 1.24 and raises AttributeError there.
uci_df.describe(include=object).round(2)  # categorical features

Unnamed: 0,A1,A2,A4,A5,A6,A7,A9,A10,A12,A13,A14,A16
count,690,690,690,690,690,690,690,690,690,690,690,690
unique,3,350,4,4,15,10,2,2,2,3,171,2
top,b,?,u,g,c,v,t,f,f,g,0,-
freq,468,12,519,519,137,399,361,395,374,625,132,383


In [5]:
# Summary statistics for the numeric columns, rounded to two decimals
numeric_summary = uci_df.describe(include=[np.number])
numeric_summary.round(2)

Unnamed: 0,A3,A8,A11,A15
count,690.0,690.0,690.0,690.0
mean,4.76,2.22,2.4,1017.39
std,4.98,3.35,4.86,5210.1
min,0.0,0.0,0.0,0.0
25%,1.0,0.16,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.21,2.62,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [6]:
# Check a column for unusual values to see how missing data is encoded; the same check can be repeated for the other columns
    
# Distinct values present in column A1
pd.unique(uci_df['A1'])

array(['b', 'a', '?'], dtype=object)

# # Setting all unusual values to NaN

In [7]:
# Treat the '?' placeholder as missing: replace it with NaN in a new frame
# (uci_df itself is left untouched).
uci_df1 = uci_df.replace('?', np.nan)

# Number of missing values per column after the replacement
uci_df1.isnull().sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

In [8]:
# A2 and A14 were loaded as object columns; cast both to float
for numeric_col in ('A2', 'A14'):
    uci_df1[numeric_col] = uci_df1[numeric_col].astype(float)

uci_df1.dtypes

A1      object
A2     float64
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14    float64
A15      int64
A16     object
dtype: object

# # Imputation

In [9]:
# Impute the numeric features that contain missing values with their column median.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which does not update the frame
# under pandas Copy-on-Write.
for numeric_col in ['A2', 'A14']:
    uci_df1[numeric_col] = uci_df1[numeric_col].fillna(uci_df1[numeric_col].median())


In [10]:
# Impute the categorical features with their most frequent value (mode).
# The filled column is assigned back explicitly: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which does not update the frame
# under pandas Copy-on-Write.
for column in ['A1', 'A4', 'A5', 'A6','A7','A9','A10','A12','A13','A16']:
    uci_df1[column] = uci_df1[column].fillna(uci_df1[column].mode()[0])


In [11]:
# Check whether the imputation was successful: missing values left per column
uci_df1.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
A16    0
dtype: int64

# # Binning

In [12]:
## Binning: split A2 into three quantile-based bins on a new copy of the imputed frame
bin_labels = ['low', 'medium', 'high']
uci_df2 = uci_df1.assign(
    A2=pd.qcut(uci_df1['A2'], q=len(bin_labels), labels=bin_labels)
)
uci_df2.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,medium,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,high,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,medium,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,medium,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,low,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [13]:
# Number of rows that fell into each A2 bin
uci_df2['A2'].value_counts()


medium    231
low       230
high      229
Name: A2, dtype: int64

In [14]:
## Ordinal (integer) encoding of the binned A2 feature - this is not one-hot encoding
encod_int = {'low': 0, 'medium': 1, 'high': 2}
uci_dfint = uci_df2.copy()

# map + astype(int) yields a plain integer column. `replace` on a categorical
# column is deprecated in recent pandas and can keep the category dtype, which
# would make the later pd.get_dummies(uci_encod) call expand A2 into dummy
# columns instead of leaving it as a single ordinal feature.
uci_dfint['A2'] = uci_dfint['A2'].map(encod_int).astype(int)

uci_dfint.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,1,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,2,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,1,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,1,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,0,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [15]:
# Names of the columns that still hold strings (object dtype) and need encoding.
# select_dtypes avoids the `np.object` alias, which was removed in NumPy 1.24.
uci_cols = uci_dfint.select_dtypes(include=['object']).columns.tolist()
uci_cols

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'A16']

In [16]:
## Encode the remaining categorical columns as numeric indicators before normalization
uci_encod = uci_dfint.copy()

# Two-level columns become a single 0/1 indicator stored under the original name.
for col in uci_cols:
    if uci_encod[col].nunique(dropna=False) == 2:
        # drop_first leaves exactly one indicator column; .iloc[:, 0] takes it as a
        # Series so that a Series (not a one-column DataFrame) is assigned.
        # dtype=int keeps 0/1 values on pandas versions whose get_dummies
        # defaults to bool.
        uci_encod[col] = pd.get_dummies(
            uci_encod[col], drop_first=True, dtype=int
        ).iloc[:, 0]

# Every column that is still object-typed is one-hot encoded into
# <column>_<level> indicator columns.
uci_encod = pd.get_dummies(uci_encod, dtype=int)

uci_encod.sample(n=4, random_state=11)

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
214,1,1,2.71,5.25,1,1,1,0,211.0,0,...,0,0,0,0,0,1,0,1,0,0
545,1,2,11.0,1.5,1,0,0,0,0.0,0,...,0,0,0,0,0,1,0,0,0,1
436,1,0,0.585,0.0,0,1,3,0,350.0,769,...,1,0,0,0,0,0,0,1,0,0
201,0,2,1.0,2.25,1,0,0,1,0.0,300,...,0,0,0,0,0,0,0,1,0,0


# # Normalization

In [17]:
## Normalization
from sklearn import preprocessing

Data_scaler = preprocessing.StandardScaler()

# Standardise the feature columns only. A16 is the class attribute (renamed to
# 'Target' further down); scaling it would turn the 0/1 label into continuous
# floats, so it is passed through unchanged.
feature_cols = [col for col in uci_encod.columns if col != 'A16']
uci_scaled = uci_encod.astype(float)
uci_scaled[feature_cols] = Data_scaler.fit_transform(uci_scaled[feature_cols])

# NumPy array with the same shape and column order as uci_encod
uci_standard = uci_scaled.values

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [18]:
# Wrap the scaled array in a DataFrame, reusing the encoded frame's column names
encoded_column_names = uci_encod.columns.tolist()
uci_standard_df = pd.DataFrame(data=uci_standard, columns=encoded_column_names)

uci_standard_df.sample(n=4, random_state=11)

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
214,0.661438,0.001777,-0.411841,0.905058,0.95465,1.157144,-0.288101,-0.919195,0.159461,-0.195413,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,0.83137,-0.108306,0.32249,-0.108306,-0.300079
545,0.661438,1.227857,1.25464,-0.216324,0.95465,-0.864196,-0.493887,-0.919195,-1.066817,-0.195413,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,0.83137,-0.108306,-3.100868,-0.108306,3.332456
436,0.661438,-1.224303,-0.839015,-0.664877,-1.047504,1.157144,0.123472,-0.919195,0.967293,-0.047708,...,3.332456,-0.5,-0.108306,-0.07636,-0.053916,-1.202834,-0.108306,0.32249,-0.108306,-0.300079
201,-1.511858,1.227857,-0.75559,0.007953,0.95465,-0.864196,-0.493887,1.087908,-1.066817,-0.137791,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,-1.202834,-0.108306,0.32249,-0.108306,-0.300079


In [19]:
## Rename the class column A16 to 'Target'
df_clean = uci_standard_df.rename({'A16': 'Target'}, axis='columns')
df_clean.sample(n=4, random_state=11)

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
214,0.661438,0.001777,-0.411841,0.905058,0.95465,1.157144,-0.288101,-0.919195,0.159461,-0.195413,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,0.83137,-0.108306,0.32249,-0.108306,-0.300079
545,0.661438,1.227857,1.25464,-0.216324,0.95465,-0.864196,-0.493887,-0.919195,-1.066817,-0.195413,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,0.83137,-0.108306,-3.100868,-0.108306,3.332456
436,0.661438,-1.224303,-0.839015,-0.664877,-1.047504,1.157144,0.123472,-0.919195,0.967293,-0.047708,...,3.332456,-0.5,-0.108306,-0.07636,-0.053916,-1.202834,-0.108306,0.32249,-0.108306,-0.300079
201,-1.511858,1.227857,-0.75559,0.007953,0.95465,-0.864196,-0.493887,1.087908,-1.066817,-0.137791,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,-1.202834,-0.108306,0.32249,-0.108306,-0.300079


In [20]:
# (rows, columns) of the final frame
df_clean.shape


(690, 43)

In [21]:
# Summary statistics for every column, rounded to three decimals
df_clean.describe(include='all').round(3) 


Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,...,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0
std,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,...,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001,1.001
min,-1.512,-1.224,-0.957,-0.665,-1.048,-0.864,-0.494,-0.919,-1.067,-0.195,...,-0.3,-0.5,-0.108,-0.076,-0.054,-1.203,-0.108,-3.101,-0.108,-0.3
25%,-1.512,-1.224,-0.756,-0.616,-1.048,-0.864,-0.494,-0.919,-0.602,-0.195,...,-0.3,-0.5,-0.108,-0.076,-0.054,-1.203,-0.108,0.322,-0.108,-0.3
50%,0.661,0.002,-0.404,-0.366,0.955,-0.864,-0.494,-0.919,-0.137,-0.194,...,-0.3,-0.5,-0.108,-0.076,-0.054,0.831,-0.108,0.322,-0.108,-0.3
75%,0.661,1.228,0.492,0.12,0.955,1.157,0.123,1.088,0.514,-0.119,...,-0.3,-0.5,-0.108,-0.076,-0.054,0.831,-0.108,0.322,-0.108,-0.3
max,0.661,1.228,4.672,7.858,0.955,1.157,13.294,1.088,10.557,19.012,...,3.332,2.0,9.233,13.096,18.547,0.831,9.233,0.322,9.233,3.332


In [22]:
# Preview the first rows of the final frame
df_clean.head()

Unnamed: 0,A1,A2,A3,A8,A9,A10,A11,A12,A14,A15,...,A7_ff,A7_h,A7_j,A7_n,A7_o,A7_v,A7_z,A13_g,A13_p,A13_s
0,0.661438,0.001777,-0.956613,-0.291083,0.95465,1.157144,-0.288101,-0.919195,0.107155,-0.195413,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,0.83137,-0.108306,0.32249,-0.108306,-0.300079
1,-1.511858,1.227857,-0.060051,0.24419,0.95465,1.157144,0.74083,-0.919195,-0.816912,-0.087852,...,-0.300079,2.0,-0.108306,-0.07636,-0.053916,-1.202834,-0.108306,0.32249,-0.108306,-0.300079
2,-1.511858,0.001777,-0.856102,-0.216324,0.95465,-0.864196,-0.493887,-0.919195,0.560471,-0.037144,...,-0.300079,2.0,-0.108306,-0.07636,-0.053916,-1.202834,-0.108306,0.32249,-0.108306,-0.300079
3,0.661438,0.001777,-0.647038,0.456505,0.95465,1.157144,0.535044,1.087908,-0.485643,-0.194837,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,0.83137,-0.108306,0.32249,-0.108306,-0.300079
4,0.661438,-1.224303,0.174141,-0.153526,0.95465,-0.864196,-0.493887,-0.919195,-0.369408,-0.195413,...,-0.300079,-0.5,-0.108306,-0.07636,-0.053916,0.83137,-0.108306,-3.100868,-0.108306,3.332456


# # Saving to CSV

In [23]:
# Write the final frame to df_clean.csv without the row index
df_clean.to_csv('df_clean.csv', index=False)
