## Setup

In [1]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

import graphviz 

from typing import List, Tuple
import os

In [2]:
# Get File Paths
# The data directory defaults to the original hard-coded location but can be
# overridden with the PROJECT_80816_DATA_DIR environment variable, so the
# notebook can run on another machine without editing this cell.
# google_drive_path = "/Users/andrewstange/Desktop/CMU/Spring_2023/80-816/project/Project_80816/data"
google_drive_path = os.environ.get(
    "PROJECT_80816_DATA_DIR",
    "/Users/nathanluskey/Library/CloudStorage/GoogleDrive-nluskey@andrew.cmu.edu/My Drive/80816_Project/Data/",
)

# Alternative input files (kept for reference):
# train_filename = "ChurnForBankCustomers_optbinning_train.csv"
# test_filename = "ChurnForBankCustomers_optbinning_test.csv"

train_filename = "ChurnForBankCustomers_train.csv"
test_filename = "ChurnForBankCustomers_test.csv"

train_full_filename = os.path.join(google_drive_path, train_filename)
test_full_filename = os.path.join(google_drive_path, test_filename)

# Fail early, and say which file is missing, before any later cell tries to read it.
assert os.path.isfile(train_full_filename), f"Training CSV not found: {train_full_filename}"
assert os.path.isfile(test_full_filename), f"Test CSV not found: {test_full_filename}"

In [3]:
# Read the train/test CSV splits into DataFrames, then preview the training
# split's column dtypes and first five rows.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_full_filename, test_full_filename))
display(df_train.dtypes, df_train.head(5))

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard             bool
IsActiveMember        bool
EstimatedSalary    float64
Exited                bool
dtype: object

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,751,France,Male,29,1,135536.5,1,True,False,66825.33,False
1,605,France,Female,30,9,135422.31,1,False,True,186418.85,False
2,567,France,Male,42,2,0.0,2,True,True,167984.61,False
3,665,Germany,Female,37,3,111911.63,1,True,True,110359.68,True
4,682,Spain,Female,58,4,0.0,1,True,False,176036.01,False


In [4]:
def ohe_df(df: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
    """One-hot encode every categorical column of ``df``.

    A column is treated as categorical when its dtype is ``object``, a string
    dtype, or a pandas ``category`` dtype. Each such column is removed and its
    indicator columns (named after the category values, without a prefix) are
    appended at the end of the frame, in the order the source columns appear.

    Args:
        df: Input frame. It is not modified; a new frame is returned whenever
            at least one column is encoded.

    Returns:
        A tuple ``(encoded_df, col_former_to_new)`` where ``col_former_to_new``
        maps each replaced column name to the labels of the indicator columns
        that replaced it.

    Raises:
        ValueError: Propagated from ``DataFrame.join`` if an indicator column
            name collides with a column already present in the frame.
    """
    col_former_to_new = dict()
    # ``df.columns`` is evaluated once, so rebinding ``df`` inside the loop does
    # not disturb the iteration.
    for col in df.columns:
        curr_data = df[col]
        dtype = curr_data.dtype
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if not is_categorical:
            continue
        # Casting to object gives every categorical flavour the same encoding
        # path; for ``category`` columns this means only observed values get an
        # indicator column.
        one_hot = pd.get_dummies(curr_data.astype(object))
        df = df.drop(col, axis=1).join(one_hot)
        col_former_to_new[col] = one_hot.columns
    return df, col_former_to_new

def convert_columns(input_list: List[str], col_former_to_new: dict) -> List[str]:
    """Expand original column names into their post-encoding column names.

    Each name found in ``col_former_to_new`` is replaced by all of the names it
    maps to; any other name is passed through unchanged. Input order is kept.
    """
    return [
        expanded
        for original in input_list
        for expanded in col_former_to_new.get(original, [original])
    ]

In [5]:
# One-hot encode both splits; the column mapping is taken from the training split.
df_train, col_former_to_new = ohe_df(df_train)
df_test, _ = ohe_df(df_test)

# The splits are encoded independently, so a category value that never occurs
# in the test split would leave it without the matching indicator column.
# Fail here with a clear message rather than later during model evaluation.
missing_in_test = sorted(set(df_train.columns) - set(df_test.columns))
assert not missing_in_test, f"Test split is missing encoded columns: {missing_in_test}"

display(df_train.dtypes)
display(df_train.head(5))

CreditScore          int64
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard             bool
IsActiveMember        bool
EstimatedSalary    float64
Exited                bool
France                bool
Germany               bool
Spain                 bool
Female                bool
Male                  bool
dtype: object

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,Female,Male
0,751,29,1,135536.5,1,True,False,66825.33,False,True,False,False,False,True
1,605,30,9,135422.31,1,False,True,186418.85,False,True,False,False,True,False
2,567,42,2,0.0,2,True,True,167984.61,False,True,False,False,False,True
3,665,37,3,111911.63,1,True,True,110359.68,True,False,True,False,True,False
4,682,58,4,0.0,1,True,False,176036.01,False,False,False,True,True,False


## Evaluate Decision Tree Classifier using Exited Markov Blanket

In [6]:
# Accumulator for the accuracy metrics: starts empty and receives one row per
# evaluated feature set through save_scores.
all_results = pd.DataFrame()

In [7]:
def filter_dt(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    markov_blanket: List[str],
    random_state: int = 80816,
    cv: int = 5,
) -> Tuple[np.ndarray, float, "tree.DecisionTreeClassifier"]:
    """Evaluate a decision tree that only sees the given feature columns.

    The tree is scored with cross-validation on the training data, then refit
    on the full training data and scored on the test data.

    Args:
        df_train: Training frame containing the feature columns and 'Exited'.
        df_test: Test frame containing the same columns.
        markov_blanket: Names of the feature columns to keep, in the order they
            should appear in the feature matrix. 'Exited' is ignored if listed.
        random_state: Seed passed to every DecisionTreeClassifier created here.
        cv: Number of cross-validation folds.

    Returns:
        ``(train_scores, test_score, dt)``: the per-fold cross-validation
        scores, the accuracy on the test data, and the tree fitted on the full
        training data.

    Raises:
        KeyError: If a requested feature column or 'Exited' is absent from
            either frame.
    """
    target = "Exited"
    # The target must never be part of the feature matrix.
    features = [col for col in markov_blanket if col != target]

    # Validate up front: label-based filtering would silently skip unknown
    # names, so a typo or an un-encoded categorical name would quietly remove a
    # feature from the model instead of raising.
    for split_name, frame in (("train", df_train), ("test", df_test)):
        missing = [col for col in features + [target] if col not in frame.columns]
        if missing:
            raise KeyError(f"{split_name} data is missing columns: {missing}")

    X_train = df_train[features].to_numpy()
    y_train = df_train[target].to_numpy()
    X_test = df_test[features].to_numpy()
    y_test = df_test[target].to_numpy()

    # Cross-validation of the Decision Tree on the training data only
    train_scores = cross_val_score(
        tree.DecisionTreeClassifier(random_state=random_state), X_train, y_train, cv=cv
    )

    # Train on all training data and then test on the held-out testing data
    dt = tree.DecisionTreeClassifier(random_state=random_state)
    dt.fit(X_train, y_train)
    test_score = accuracy_score(y_test, dt.predict(X_test))

    return train_scores, test_score, dt

def save_scores(algo: str, train_accs: np.ndarray, test_acc: float, all_results: pd.DataFrame) -> pd.DataFrame:
    """Return ``all_results`` with one summary row for ``algo`` appended.

    The row holds the algorithm label, the test accuracy, the mean and variance
    (``np.var``) of the training scores, and one ``train_acc_<k>`` column per
    training score, numbered from 1. The result is re-indexed from 0.
    """
    record = {
        "algorithm": [algo],
        "test_acc": [test_acc],
        "train_acc_avg": [np.mean(train_accs)],
        "train_accs_var": [np.var(train_accs)],
    }
    # One column per individual score, after the aggregate columns.
    record.update(
        {f"train_acc_{fold}": [fold_score] for fold, fold_score in enumerate(train_accs, start=1)}
    )
    return pd.concat([all_results, pd.DataFrame(record)], ignore_index=True)

In [8]:
# All variables
all_vars = df_train.columns.to_list()
all_vars.remove('Exited')
scores, acc, dt_all = filter_dt(df_train, df_test, all_vars)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("None", scores, acc, all_results)

Cross Validation Acc: [0.779375 0.8025   0.786875 0.819375 0.8     ]
Test Acc: 0.7925


In [9]:
# PC Markov Blanket
pc_blanket = ['Age', 'IsActiveMember', 'Gender', 'Balance', 'Geography', 'NumOfProducts']
pc_blanket = convert_columns(pc_blanket, col_former_to_new)

scores, acc, dt_PC = filter_dt(df_train, df_test, pc_blanket)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("PC", scores, acc, all_results)

Cross Validation Acc: [0.80625  0.816875 0.814375 0.81625  0.80875 ]
Test Acc: 0.8165


In [10]:
# FCI Markov Blanket
fci_blanket = ['Age', 'IsActiveMember', 'Gender', 'Balance', 'Geography', 'NumOfProducts']
fci_blanket = convert_columns(fci_blanket, col_former_to_new)

scores, acc, dt_FCI = filter_dt(df_train, df_test, fci_blanket)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("FCI", scores, acc, all_results)

Cross Validation Acc: [0.80625  0.816875 0.814375 0.81625  0.80875 ]
Test Acc: 0.8165


In [11]:
# GES Markov Blanket
ges_blanket = ['Age', 'IsActiveMember', 'Gender', 'Geography', 'NumOfProducts']
ges_blanket = convert_columns(ges_blanket, col_former_to_new)

scores, acc, dt_GES = filter_dt(df_train, df_test, ges_blanket)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("GES", scores, acc, all_results)

Cross Validation Acc: [0.839375 0.843125 0.83625  0.85125  0.835   ]
Test Acc: 0.8605


## Evaluate using Parents

In [12]:
# GES Parents
ges_parents = ['Age', 'IsActiveMember']
ges_parents = convert_columns(ges_parents, col_former_to_new)

scores, acc, dt_GES_parents = filter_dt(df_train, df_test, ges_parents)
print(f"Cross Validation Acc: {scores}\nTest Acc: {acc}")

all_results = save_scores("GES (parents)", scores, acc, all_results)

Cross Validation Acc: [0.823125 0.825625 0.829375 0.83625  0.82    ]
Test Acc: 0.831


In [13]:
# Write the collected accuracy table to the current working directory;
# index=False leaves the DataFrame's row index out of the file.
all_results.to_csv("DecisionTrees_results.csv", index=False)

## Visualization

In [14]:
# Render the tree trained on all variables.
# class_names must follow the classifier's class order. The target is the
# boolean 'Exited' column, so the classes are [False, True], i.e. stayed first
# and exited second.
dot_data = tree.export_graphviz(dt_all, out_file=None,
                      feature_names=all_vars,
                      class_names=['Stayed', 'Exited'],
                      filled=True, rounded=True,
                      special_characters=True)
graph = graphviz.Source(dot_data)
graph.render(filename="all_data", directory="./DecisionTrees", view=False, format="png")


dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.523493 to fit


'DecisionTrees/all_data.png'

In [15]:
# Render the tree trained on the PC Markov blanket columns.
# class_names must follow the classifier's class order. The target is the
# boolean 'Exited' column, so the classes are [False, True], i.e. stayed first
# and exited second.
dot_data = tree.export_graphviz(dt_PC, out_file=None,
                      feature_names=pc_blanket,
                      class_names=['Stayed', 'Exited'],
                      filled=True, rounded=True,
                      special_characters=True)
graph = graphviz.Source(dot_data)
graph.render(filename="PC", directory="./DecisionTrees", view=False, format="png")

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.411796 to fit


'DecisionTrees/PC.png'