## Setup

In [1]:
import pandas as pd
import numpy as np

import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from typing import List, Tuple

In [2]:
# Get File Paths
# The data directory is machine-specific. Set the CHURN_DATA_DIR environment
# variable to point at your own copy instead of editing this cell; the default
# below is kept for backward compatibility.
# Alternative location used by another collaborator:
# "/Users/andrewstange/Desktop/CMU/Spring_2023/80-816/project/Project_80816/data"
google_drive_path = os.environ.get(
    "CHURN_DATA_DIR",
    "/Users/nathanluskey/Library/CloudStorage/GoogleDrive-nluskey@andrew.cmu.edu/My Drive/80816_Project/Data/",
)

train_filename = "ChurnForBankCustomers_optbinning_train.csv"
test_filename = "ChurnForBankCustomers_optbinning_test.csv"

train_full_filename = os.path.join(google_drive_path, train_filename)
test_full_filename = os.path.join(google_drive_path, test_filename)

# Fail fast, naming the offending path, if either split is missing
assert os.path.isfile(train_full_filename), f"Training data not found: {train_full_filename}"
assert os.path.isfile(test_full_filename), f"Test data not found: {test_full_filename}"

In [3]:
# Load the train/test CSV splits into DataFrames, then preview the training
# split's column dtypes and its first five rows
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_full_filename, test_full_filename)
)
display(df_train.dtypes, df_train.head(5))

CreditScore         int64
Geography          object
Gender             object
Age                 int64
Tenure              int64
Balance             int64
NumOfProducts       int64
HasCrCard            bool
IsActiveMember       bool
EstimatedSalary     int64
Exited               bool
dtype: object

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,7,France,Male,1,0,5,0,True,False,1,False
1,2,France,Female,1,5,5,0,False,True,5,False
2,2,France,Male,6,1,0,1,True,True,4,False
3,4,Germany,Female,4,1,3,0,True,True,4,True
4,5,Spain,Female,8,1,0,0,True,False,5,False


In [4]:
# Replace every column's values with integer codes (rank of the value among the
# column's sorted unique values).
# The mapping is derived from the training split only and then applied to BOTH
# splits, so a given raw value always receives the same code in train and test.
# (Deriving a separate mapping per split would silently assign different codes
# to the same value whenever the two splits do not contain identical value sets.)
for col in df_train.columns:
    unique_vals = sorted(df_train[col].unique())
    data_to_num = {data: num for num, data in enumerate(unique_vals)}

    # Series.map turns unmapped values into NaN; surface that as an explicit
    # error instead of letting NaNs flow into the classifier.
    unseen_vals = set(df_test[col].unique()) - set(data_to_num)
    if unseen_vals:
        raise ValueError(
            f"Column {col!r}: test split contains values not present in the "
            f"training split: {sorted(unseen_vals)}"
        )

    df_train[col] = df_train[col].map(data_to_num)
    df_test[col] = df_test[col].map(data_to_num)

display(df_train.dtypes)
display(df_train.head(5))

CreditScore        int64
Geography          int64
Gender             int64
Age                int64
Tenure             int64
Balance            int64
NumOfProducts      int64
HasCrCard          int64
IsActiveMember     int64
EstimatedSalary    int64
Exited             int64
dtype: object

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,7,0,1,1,0,5,0,1,0,1,0
1,2,0,0,1,5,5,0,0,1,5,0
2,2,0,1,6,1,0,1,1,1,4,0
3,4,1,0,4,1,3,0,1,1,4,1
4,5,2,0,8,1,0,0,1,0,5,0


## Evaluate Naive Bayes Classifier using the Markov Blanket of `Exited`

In [5]:
# Results table: starts empty and gains one row per evaluated feature set via save_scores
all_results = pd.DataFrame()

In [6]:
def filter_nb(df_train: pd.DataFrame, df_test: pd.DataFrame, markov_blanket: List[str]) -> Tuple[np.ndarray, float]:
    """Evaluate MultinomialNB on 'Exited' using only the given feature columns.

    Args:
        df_train: Training frame containing the feature columns and 'Exited'.
        df_test: Test frame containing the same columns.
        markov_blanket: Names of the feature columns to keep.

    Returns:
        A tuple ``(train_scores, test_score)`` where ``train_scores`` is the
        array of 5-fold cross-validation accuracies on the training data and
        ``test_score`` is the accuracy on the test data of a model fitted on
        the full training data.

    Raises:
        KeyError: If a requested column (or 'Exited') is missing from either frame.
    """
    columns = list(markov_blanket) + ["Exited"]

    # DataFrame.filter silently ignores unknown names, so a typo in the blanket
    # would quietly shrink the feature set; check explicitly first.
    for split_name, frame in (("df_train", df_train), ("df_test", df_test)):
        missing = [name for name in columns if name not in frame.columns]
        if missing:
            raise KeyError(f"Columns missing from {split_name}: {missing}")

    # Keep only the selected features plus the target
    train_subset = df_train.filter(columns)
    test_subset = df_test.filter(columns)

    X_train = train_subset.drop('Exited', axis=1).to_numpy()
    y_train = train_subset['Exited'].to_numpy()
    X_test = test_subset.drop('Exited', axis=1).to_numpy()
    y_test = test_subset['Exited'].to_numpy()

    # 5-fold cross validation of MultinomialNB on the training data
    train_scores = cross_val_score(MultinomialNB(), X_train, y_train, cv=5)

    # Train on all training data and then test on the held-out testing data
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    y_pred = mnb.predict(X_test)
    test_score = accuracy_score(y_test, y_pred)

    return train_scores, test_score

def save_scores(algo: str, train_accs: np.ndarray, test_acc: float, all_results: pd.DataFrame) -> pd.DataFrame:
    """Return `all_results` with one extra row summarising a single evaluation.

    The new row holds the algorithm label, the test accuracy, the mean and
    variance of the cross-validation accuracies, and one ``train_acc_<k>``
    column per fold (1-based). The input frame is not modified.
    """
    record = {
        "algorithm": algo,
        "test_acc": test_acc,
        "train_acc_avg": np.mean(train_accs),
        "train_accs_var": np.var(train_accs),
    }
    # One column per cross-validation fold, numbered from 1
    record.update(
        {f"train_acc_{fold}": fold_score for fold, fold_score in enumerate(train_accs, start=1)}
    )

    new_row = pd.DataFrame([record])
    return pd.concat([all_results, new_row], ignore_index=True)

In [7]:
# All variables
all_vars = df_train.columns.to_list()
all_vars.remove('Exited')
scores, acc = filter_nb(df_train, df_test, all_vars)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("None", scores, acc, all_results)

Cross Validation Acc: [0.815    0.809375 0.806875 0.813125 0.800625]
Test Acc: 0.8115


In [8]:
# PC Markov Blanket
pc_blanket = ['Age', 'IsActiveMember', 'Gender', 'Balance', 'Geography', 'NumOfProducts']
scores, acc = filter_nb(df_train, df_test, pc_blanket)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("PC", scores, acc, all_results)

Cross Validation Acc: [0.821875 0.80875  0.81625  0.818125 0.813125]
Test Acc: 0.8265


In [9]:
# FCI Markov Blanket
fci_blanket = ['Age', 'IsActiveMember', 'Gender', 'Balance', 'Geography', 'NumOfProducts']
scores, acc = filter_nb(df_train, df_test, fci_blanket)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("FCI", scores, acc, all_results)

Cross Validation Acc: [0.821875 0.80875  0.81625  0.818125 0.813125]
Test Acc: 0.8265


In [10]:
# GES Markov Blanket
ges_blanket = ['Age', 'IsActiveMember', 'Gender', 'Geography', 'NumOfProducts']
scores, acc = filter_nb(df_train, df_test, ges_blanket)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("GES", scores, acc, all_results)

Cross Validation Acc: [0.8225   0.808125 0.816875 0.814375 0.81375 ]
Test Acc: 0.8255


## Evaluate Naive Bayes Classifier using the Parents of `Exited`

In [11]:
# GES parents of 'Exited' (parents only, not the full Markov blanket)
ges_parents = [
    'Age',
    'IsActiveMember',
]
scores, acc = filter_nb(df_train, df_test, markov_blanket=ges_parents)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores(algo="GES (parents)", train_accs=scores, test_acc=acc, all_results=all_results)

Cross Validation Acc: [0.7925   0.7925   0.791875 0.791875 0.791875]
Test Acc: 0.813


In [12]:
# Write the collected scores to a CSV (relative path, so it lands in the current working directory); the row index is omitted
all_results.to_csv("NaiveBayes_results.csv", index=False)