# Credit Card Lead Prediction

In [19]:
# Import Libraries
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

In [3]:
# Load the training data from the local CSV file into a DataFrame
train_csv_path = "E:\\Pycharm Projects\\Machine_Learning\\GitHub\\Creditcard Lead Prediction\\train_s3TEQDk.csv"
df = pd.read_csv(train_csv_path)

# Preview the first five rows
df.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [6]:
# Report how many rows and columns df holds
row_count, column_count = df.shape
print("Number of Rows:", row_count)
print("Number of Columns:", column_count)

Number of Rows: 245725
Number of Columns: 11


In [7]:
# Check the data type of every column in df
# (object columns are the categorical/string features; int64 columns are numeric)
df.dtypes

ID                     object
Gender                 object
Age                     int64
Region_Code            object
Occupation             object
Channel_Code           object
Vintage                 int64
Credit_Product         object
Avg_Account_Balance     int64
Is_Active              object
Is_Lead                 int64
dtype: object

In [8]:
# Show the statistical description (count, mean, std, min, quartiles, max)
# of the numeric columns of df
df.describe()

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Is_Lead
count,245725.0,245725.0,245725.0,245725.0
mean,43.856307,46.959141,1128403.0,0.237208
std,14.828672,32.353136,852936.4,0.425372
min,23.0,7.0,20790.0,0.0
25%,30.0,20.0,604310.0,0.0
50%,43.0,32.0,894601.0,0.0
75%,54.0,73.0,1366666.0,0.0
max,85.0,135.0,10352010.0,1.0


In [10]:
# Summarize df: per-column non-null counts and dtypes, plus memory usage.
# Useful for spotting columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


In [15]:
# List the distinct values found in every object-dtype (categorical) column
categorical_columns = df.select_dtypes(include=["object"]).columns
for column in categorical_columns:
    print(f"Unique categories in {column} are:")
    print(df[column].unique())
    print()

Unique categories in ID are:
['NNVBBKZB' 'IDD62UNG' 'HD3DSEMC' ... 'GEHAUCWT' 'GE7V8SAH' 'BOCZSWLJ']

Unique categories in Gender are:
['Female' 'Male']

Unique categories in Region_Code are:
['RG268' 'RG277' 'RG270' 'RG282' 'RG261' 'RG265' 'RG283' 'RG254' 'RG269'
 'RG257' 'RG279' 'RG280' 'RG252' 'RG284' 'RG259' 'RG281' 'RG258' 'RG266'
 'RG260' 'RG274' 'RG256' 'RG275' 'RG273' 'RG267' 'RG272' 'RG251' 'RG262'
 'RG264' 'RG278' 'RG276' 'RG263' 'RG250' 'RG255' 'RG253' 'RG271']

Unique categories in Occupation are:
['Other' 'Salaried' 'Self_Employed' 'Entrepreneur']

Unique categories in Channel_Code are:
['X3' 'X1' 'X2' 'X4']

Unique categories in Credit_Product are:
['No' nan 'Yes']

Unique categories in Is_Active are:
['No' 'Yes']



## Hypothesis Testing

### *Check Relation Between Gender and Is_Lead (Using Chi-square test)*

In [18]:
# Contingency table of counts: rows are Is_Lead classes, columns are Gender values
Gender_Is_lead = pd.crosstab(index=df["Is_Lead"], columns=df["Gender"])

# Show the observed counts
print(Gender_Is_lead)

Gender   Female   Male
Is_Lead               
0         88823  98614
1         22705  35583


In [21]:
# Chi-square test of independence between Gender and Is_Lead
# H0: Gender and Is_Lead are independent (no significant relationship)
# H1: Gender and Is_Lead are related (significant relationship)
# Decision rule: reject H0 when p-value < 0.05
# Note: per the scipy docs, Yates' continuity correction is applied by default
# when the table has one degree of freedom (2x2), as it does here.
stat, p, dof, expected = chi2_contingency(observed=Gender_Is_lead)
print("Observed Statistic: ", stat)
print("p-value: ", p)

Observed Statistic:  1275.9029016565435
p-value:  1.949380935011988e-279


*p-value < 0.05, so we reject H0.*
<br>*So, there is a statistically significant relationship between Gender and Is_Lead.*

### *Check Relation Between Region_Code and Is_Lead (Using Chi-square test)*

In [26]:
# Contingency table of counts: rows are Is_Lead classes, columns are Region_Code values
Region_Code_Is_lead = pd.crosstab(index=df["Is_Lead"], columns=df["Region_Code"])

# Display the observed counts
Region_Code_Is_lead

Region_Code,RG250,RG251,RG252,RG253,RG254,RG255,RG256,RG257,RG258,RG259,...,RG275,RG276,RG277,RG278,RG279,RG280,RG281,RG282,RG283,RG284
Is_Lead,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2103,4569,3694,1377,21127,1549,2446,4931,1521,2085,...,2702,1996,9851,1422,3079,9766,3979,4709,20531,13504
1,393,1381,592,481,5713,469,401,1170,430,501,...,543,768,2975,400,897,3009,1114,1120,8885,5816


In [24]:
# Chi-square test of independence between Region_Code and Is_Lead
# H0: Region_Code and Is_Lead are independent (no significant relationship)
# H1: Region_Code and Is_Lead are related (significant relationship)
# Decision rule: reject H0 when p-value < 0.05
stat, p, dof, expected = chi2_contingency(observed=Region_Code_Is_lead)
print("Observed Statistic: ", stat)
print("p-value: ", p)

Observed Statistic:  3913.669551933591
p-value:  0.0


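*p-value < 0.05 (it prints as 0.0 because it is too small to be represented as a floating-point number), so we reject H0.*
<br>*So, there is a statistically significant relationship between Region_Code and Is_Lead.*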

### *Check Relation Between Occupation and Is_Lead (Using Chi-square test)*

In [29]:
# Contingency table of counts: rows are Is_Lead classes, columns are Occupation values
Occupation_Is_lead = pd.crosstab(index=df["Is_Lead"], columns=df["Occupation"])

# Display the observed counts
Occupation_Is_lead

Occupation,Entrepreneur,Other,Salaried,Self_Employed
Is_Lead,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,905,52984,60503,73045
1,1762,17189,11496,27841


In [30]:
# Chi-square test of independence between Occupation and Is_Lead
# H0: Occupation and Is_Lead are independent (no significant relationship)
# H1: Occupation and Is_Lead are related (significant relationship)
# Decision rule: reject H0 when p-value < 0.05
stat, p, dof, expected = chi2_contingency(observed=Occupation_Is_lead)
print("Observed Statistic: ", stat)
print("p-value: ", p)

Observed Statistic:  5896.249668475439
p-value:  0.0


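*p-value < 0.05 (it prints as 0.0 because it is too small to be represented as a floating-point number), so we reject H0.*
<br>*So, there is a statistically significant relationship between Occupation and Is_Lead.*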

### *Check Relation Between Channel_Code and Is_Lead (Using Chi-square test)*

In [32]:
# Contingency table of counts: rows are Is_Lead classes, columns are Channel_Code values
Channel_Code_Is_lead = pd.crosstab(index=df["Is_Lead"], columns=df["Channel_Code"])

# Display the observed counts
Channel_Code_Is_lead

Channel_Code,X1,X2,X3,X4
Is_Lead,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,94236,45519,43493,4189
1,9482,22207,25219,1380


In [33]:
# Chi-square test of independence between Channel_Code and Is_Lead
# H0: Channel_Code and Is_Lead are independent (no significant relationship)
# H1: Channel_Code and Is_Lead are related (significant relationship)
# Decision rule: reject H0 when p-value < 0.05
stat, p, dof, expected = chi2_contingency(observed=Channel_Code_Is_lead)
print("Observed Statistic: ", stat)
print("p-value: ", p)

Observed Statistic:  21664.453956739817
p-value:  0.0


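*p-value < 0.05 (it prints as 0.0 because it is too small to be represented as a floating-point number), so we reject H0.*
<br>*So, there is a statistically significant relationship between Channel_Code and Is_Lead.*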

### *Check Relation Between Credit_Product and Is_Lead (Using Chi-square test)*

In [37]:
# Contingency table of counts: rows are Is_Lead classes, columns are Credit_Product values
Credit_Product_Is_lead = pd.crosstab(index=df["Is_Lead"], columns=df["Credit_Product"])

# Display the observed counts
Credit_Product_Is_lead

Credit_Product,No,Yes
Is_Lead,Unnamed: 1_level_1,Unnamed: 2_level_1
0,133734,49353
1,10623,22690


In [38]:
# Chi-square test of independence between Credit_Product and Is_Lead
# H0: Credit_Product and Is_Lead are independent (no significant relationship)
# H1: Credit_Product and Is_Lead are related (significant relationship)
# Decision rule: reject H0 when p-value < 0.05
# Note: per the scipy docs, Yates' continuity correction is applied by default
# when the table has one degree of freedom (2x2), as it does here.
stat, p, dof, expected = chi2_contingency(observed=Credit_Product_Is_lead)
print("Observed Statistic: ", stat)
print("p-value: ", p)

Observed Statistic:  21494.00881079159
p-value:  0.0


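*p-value < 0.05 (it prints as 0.0 because it is too small to be represented as a floating-point number), so we reject H0.*
<br>*So, there is a statistically significant relationship between Credit_Product and Is_Lead.*
<br>*Note: the cross-tabulation covers only the 216,400 rows where Credit_Product is known; the 29,325 rows with a missing Credit_Product are not part of this test.*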

### *Check Relation Between Is_Active and Is_Lead (Using Chi-square test)*

In [39]:
# Contingency table of counts: rows are Is_Lead classes, columns are Is_Active values
Is_Active_Is_lead = pd.crosstab(index=df["Is_Lead"], columns=df["Is_Active"])

# Display the observed counts
Is_Active_Is_lead

Is_Active,No,Yes
Is_Lead,Unnamed: 1_level_1,Unnamed: 2_level_1
0,119007,68430
1,31283,27005


In [40]:
# Chi-square test of independence between Is_Active and Is_Lead
# H0: Is_Active and Is_Lead are independent (no significant relationship)
# H1: Is_Active and Is_Lead are related (significant relationship)
# Decision rule: reject H0 when p-value < 0.05
# Note: per the scipy docs, Yates' continuity correction is applied by default
# when the table has one degree of freedom (2x2), as it does here.
stat, p, dof, expected = chi2_contingency(observed=Is_Active_Is_lead)
print("Observed Statistic: ", stat)
print("p-value: ", p)

Observed Statistic:  1805.2967681025368
p-value:  0.0


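*p-value < 0.05 (it prints as 0.0 because it is too small to be represented as a floating-point number), so we reject H0.*
<br>*So, there is a statistically significant relationship between Is_Active and Is_Lead.*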

### *Check Relation Between Age and Is_Lead (Using 2-sample t-test)*