# Assigment 3
## Author: **Prashanth Kakkerla**
### Load data

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [21]:
# Load the data
data = pd.read_csv('UniversalBank.csv')

In [22]:
data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [23]:
# Check for missing values
print(data.isnull().sum())

# Check the data types of each column
print(data.dtypes)

# Get some descriptive statistics of the data
print(data.describe())

# Check the distribution of the target variable
print(data['Personal Loan'].value_counts())


ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64
ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object
                ID          Age   Experience       Income      ZIP Code  \
count  5000.000000  5000.000000  5000.000000  5000.000000   5000.000000   
mean   2500.500000    45.338400    20.104600    73.774200  93152.50

In [24]:
# Partition the data into training (60%) and validation (40%) sets
train_data, validation_data = train_test_split(data, test_size=0.4, random_state=42)

In [25]:
# A. Create a pivot table for the training data
pivot_table_A = pd.pivot_table(train_data, values='ID', index=['CreditCard'], columns=['Online', 'Personal Loan'], aggfunc='count', fill_value=0)

In [26]:
pivot_table_A

Online,0,0,1,1
Personal Loan,0,1,0,1
CreditCard,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,780,70,1141,129
1,319,34,481,46


In [27]:
# B. Probability of loan acceptance for a customer with CreditCard=1 and Online=1
prob_loan_acceptance = pivot_table_A[1][1] / pivot_table_A[1].sum()
print(f"Probability of loan acceptance for a customer with CreditCard=1 and Online=1: {prob_loan_acceptance}")

Probability of loan acceptance for a customer with CreditCard=1 and Online=1: CreditCard
0    0.079531
1    0.262857
dtype: float64


In [28]:
# Create a pivot table for the training data with Loan as a function of Online
pivot_table_loan_online = pd.pivot_table(train_data, values='ID', index=['Personal Loan'], columns=['Online'], aggfunc='count', fill_value=0)
print(pivot_table_loan_online)

# Create a pivot table for the training data with Loan as a function of CC
pivot_table_loan_cc = pd.pivot_table(train_data, values='ID', index=['Personal Loan'], columns=['CreditCard'], aggfunc='count', fill_value=0)
print(pivot_table_loan_cc)


Online            0     1
Personal Loan            
0              1099  1622
1               104   175
CreditCard        0    1
Personal Loan           
0              1921  800
1               199   80


In [29]:
# i. P(CC = 1 | Loan = 1)
prob_cc_given_loan = pivot_table_loan_cc[1][1] / pivot_table_loan_cc[1].sum()
print(f"P(CC = 1 | Loan = 1): {prob_cc_given_loan}")

# ii. P(Online = 1 | Loan = 1)
prob_online_given_loan = pivot_table_loan_online[1][1] / pivot_table_loan_online[1].sum()
print(f"P(Online = 1 | Loan = 1): {prob_online_given_loan}")

# iii. P(Loan = 1)
prob_loan = pivot_table_A.sum(axis=1)[1] / pivot_table_A.sum().sum()
print(f"P(Loan = 1): {prob_loan}")

# iv. P(CC = 1 | Loan = 0)
prob_cc_given_no_loan = pivot_table_loan_cc[0][1] / pivot_table_loan_cc[0].sum()
print(f"P(CC = 1 | Loan = 0): {prob_cc_given_no_loan}")

# v. P(Online = 1 | Loan = 0)
prob_online_given_no_loan = pivot_table_loan_online[0][1] / pivot_table_loan_online[0].sum()
print(f"P(Online = 1 | Loan = 0): {prob_online_given_no_loan}")

# vi. P(Loan = 0)
prob_no_loan = pivot_table_A.sum(axis=1)[0] / pivot_table_A.sum().sum()
print(f"P(Loan = 0): {prob_no_loan}")


P(CC = 1 | Loan = 1): 0.09090909090909091
P(Online = 1 | Loan = 1): 0.09738452977184196
P(Loan = 1): 0.29333333333333333
P(CC = 1 | Loan = 0): 0.0938679245283019
P(Online = 1 | Loan = 0): 0.08645054031587697
P(Loan = 0): 0.7066666666666667


In [30]:
# P(Loan = 1 | CC = 1, Online = 1)
prob_loan_given_cc_online = (prob_cc_given_loan * prob_online_given_loan * prob_loan) / ((prob_cc_given_loan * prob_online_given_loan * prob_loan) + (prob_cc_given_no_loan * prob_online_given_no_loan * prob_no_loan))
print(f"P(Loan = 1 | CC = 1, Online = 1): {prob_loan_given_cc_online}")


P(Loan = 1 | CC = 1, Online = 1): 0.3117000679144145


## Comparing Estimates:
- Estimate from Pivot Table (B):
  - Probability of loan acceptance for a customer with CreditCard=1 and Online=1: ~0.263

### Estimate from Naive Bayes Model (E):
Probability of loan acceptance for a customer with CreditCard=1 and Online=1: ~0.312

In [31]:
# H. Train a Naive Bayes model on the data
features = ['Online', 'CreditCard']
target = 'Personal Loan'
X_train = train_data[features]
y_train = train_data[target]

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# The entry corresponding to P(Loan = 1 | CC = 1, Online = 1) is obtained using predict_proba
p_CC_given_loan_1_model = nb_model.predict_proba([[1, 0]])[0][1]  # P(CC = 1 | Loan = 1)
p_Online_given_loan_1_model = nb_model.predict_proba([[0, 1]])[0][1]  # P(Online = 1 | Loan = 1)
p_loan_1_model = nb_model.class_prior_[1]  # P(Loan = 1)

# P(Loan = 1 | CC = 1, Online = 1) using Bayes' theorem
p_loan_1_given_CC_and_Online_model = (p_CC_given_loan_1_model * p_Online_given_loan_1_model * p_loan_1_model) / \
                                     ((p_CC_given_loan_1_model * p_Online_given_loan_1_model * p_loan_1_model) +
                                      ((1 - p_CC_given_loan_1_model) * (1 - p_Online_given_loan_1_model) * (1 - p_loan_1_model)))

print("P(Loan = 1 | CC = 1, Online = 1) from Naive Bayes model:", p_loan_1_given_CC_and_Online_model)

P(Loan = 1 | CC = 1, Online = 1) from Naive Bayes model: 0.001020750123943752
