In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0-preview2/spark-3.0.0-preview2-bin-hadoop2.7.tgz
!tar -xvf spark-3.0.0-preview2-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-preview2-bin-hadoop2.7"
import findspark
findspark.init()

# Importing the libraries required 
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import sum
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [0]:
sc= spark.sparkContext
import pandas as pd
import numpy as np
import matplotlib.pyplot as mlb

In [0]:
data = spark.read.csv('/content/CC_data.csv', header = 'true', inferSchema='true')
#data_new = data.withColumnRenamed("Male","Gender").printSchema()
type(data)

pyspark.sql.dataframe.DataFrame

In [0]:
data.show(5)

+------+-----+-----+-------+------------+--------------+---------+-------------+------------+--------+-----------+--------------+-------+-------+------+--------+
|Gender|  Age| Debt|Married|BankCustomer|EducationLevel|Ethnicity|YearsEmployed|PriorDefault|Employed|CreditScore|DriversLicense|Citizen|ZipCode|Income|Approved|
+------+-----+-----+-------+------------+--------------+---------+-------------+------------+--------+-----------+--------------+-------+-------+------+--------+
|     b|30.83|  0.0|      u|           g|             w|        v|         1.25|           t|       t|          1|             f|      g|    202|     0|       +|
|     a|58.67| 4.46|      u|           g|             q|        h|         3.04|           t|       t|          6|             f|      g|     43|   560|       +|
|     a| 24.5|  0.5|      u|           g|             q|        h|          1.5|           t|       f|          0|             f|      g|    280|   824|       +|
|     b|27.83| 1.54|      u|

In [0]:
class_frequency = data.groupBy("Approved").count()
class_frequency.show()

+--------+-----+
|Approved|count|
+--------+-----+
|       +|  307|
|       -|  383|
+--------+-----+



In [0]:
data2 = data.toPandas()
data2

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.500,u,g,q,h,1.50,t,f,0,f,g,280,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280,750,-


In [0]:
# Replace "?" with NaN
data2.replace('?', np.NaN, inplace = True)
# Convert Age to numeric
data2["Age"] = pd.to_numeric(data2["Age"])
# CC_data2 = CC_data[:,:]
data3 = data2.copy()

In [0]:
data3.tail(5)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260,0,-
686,a,22.67,0.75,u,g,c,v,2.0,f,t,2,t,g,200,394,-
687,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1,t,g,200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280,750,-
689,b,35.0,3.375,u,g,c,h,8.29,f,f,0,t,g,0,0,-


In [0]:
data3.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
Approved           0
dtype: int64

We still having some missing values to deal with. Gender, Age and ZipCode have some missing values

In [0]:
# Impute the missing values with mean imputation
data3.fillna(data3.mean(), inplace=True)

# Count the number of NaNs in the dataset to verify
pd.isna(data3).sum()
data3.tail(18)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-


In [0]:
# Iterate over each column of data3
for col in data3:
    # Check if the column is of object type
    if data3[col].dtype == 'object':
        # Impute with the most frequent value
        data3 = data3.fillna(data3[col].value_counts().index[0])
        

# Count the number of NaNs in the dataset and print the counts to verify
pd.isna(data3).sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
Approved          0
dtype: int64

**CORRELATION MATRIX FOR PREDICTORS HAVING NUMERIC DATATYPE** - We consider the most relevant predictors that have a numeric datatype and hence zipcode is excluded.

In [0]:
#correlation matrix
import seaborn as sns
corrmat = data3.corr()
corrmat.style.background_gradient(cmap='Accent')

Unnamed: 0,Age,Debt,YearsEmployed,CreditScore,Income
Age,1.0,0.201316,0.392787,0.185575,0.018539
Debt,0.201316,1.0,0.298902,0.271207,0.123121
YearsEmployed,0.392787,0.298902,1.0,0.32233,0.051345
CreditScore,0.185575,0.271207,0.32233,1.0,0.063692
Income,0.018539,0.123121,0.051345,0.063692,1.0


There seems to be a close relationship between the below pair of predictor variables - 
1. YearsEmployed and Age 
2. YearsEmployed and Credit Score
3. Years Employed and Debt

**FINDING RELATIONSHIP BETWEEN PREDICTOR VARIABLE AND RESPONSE VARIABLE**

We used the crosstab  function from pandas package that explains which predictor is likely to contribute to the credit card approval prediction. 

In [0]:
def percConvert(ser):
  return ser/float(ser[-1])*100
  pd.crosstab(data3["EducationLevel"],data3["Approved"],margins=True).apply(percConvert, axis=1)


1) Effect of Education level on Credit Card approval

In [0]:
pd.crosstab(data3["EducationLevel"],data3["Approved"],margins=True).apply(percConvert, axis=1)

2) Effect of Gender on Credit Card approvals

In [0]:
pd.crosstab(data3["Gender"],data3["Approved"],margins=True).apply(percConvert, axis=1)

3) 

In [0]:
pd.crosstab(data3["CreditScore"],data3["Approved"],margins=True).apply(percConvert, axis=1)
#pd.crosstab(data3["Debt"],data3["Approved"],margins=True).apply(percConvert, axis=1)
#pd.crosstab(data3["Married"],data3["Approved"],margins=True).apply(percConvert, axis=1)
#pd.crosstab(data3["BankCustomer"],data3["Approved"],margins=True).apply(percConvert, axis=1)
#pd.crosstab(data3["Ethnicity"],data3["Approved"],margins=True).apply(percConvert, axis=1)
#pd.crosstab(data3["YearsEmployed"],data3["Approved"],margins=True).apply(percConvert, axis=1)

In [0]:
pd.crosstab(data3["PriorDefault"],data3["Approved"],margins=True).apply(percConvert, axis=1) #In Percentage 

Approved,+,-,All
PriorDefault,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f,6.990881,93.009119,100.0
t,78.67036,21.32964,100.0
All,44.492754,55.507246,100.0


**COVERTING ALL ATTRIBUTES INTO A NUMERICAL VALUE BY ASSIGNING LABELS**

In [0]:
#label_data = data3
le = LabelEncoder()
# # Looping for each object type column
# Using label encoder to convert into numeric types
for col in label_data:
    if label_data[col].dtypes=='object':
        label_data[col]=le.fit_transform(label_data[col])

In [0]:
label_data.head(5)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,2,1,13,8,1.25,1,1,1,0,0,42,0,0
1,0,58.67,4.46,2,1,11,4,3.04,1,1,6,0,0,118,560,0
2,0,24.5,0.5,2,1,11,4,1.5,1,0,0,0,0,74,824,0
3,1,27.83,1.54,2,1,13,8,3.75,1,1,5,1,0,1,3,0
4,1,20.17,5.625,2,1,13,8,1.71,1,0,0,0,2,8,0,0
