***
# ISOM3360 Data Mining for Business Analytics
## Group 23 Project Code - Credit Card Defaultee Analysis
### Part 2.3 - Naive Bayes Classifier
***

Name: LAM, Ho Chit  
ITSC: hclamao  
SID: 20607878

Name: LEE, Ho Wan Owen  
ITSC: hwolee  
SID: 20604852

Name: LEE, Wai Chung  
ITSC: wcleeaj  
SID: 20702733

### Workflow of this notebook (TBC)

1. Explore features and characteristics of dataset
2. Drop columns of low data quality (e.g. large amounts of empty values)
3. Determine $k$ columns to keep in the dataset (feature selection)
4. Perform one-hot encoding
5. Split into training and testing sets
6. Perform data cleaning
   - Dealing with missing values
7. Perform data standardization / normalization
8. Export preprocessed data to .csv files at `./data_preprocessed/`

### Naive Bayes
we will use all the train data (891 examples) to construct the tree and evaluate the model

#### Step 1: Import the training and testing set and modules

In [24]:
import pandas as pd
#Import Gaussian Naive Bayes model from sklearn
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Naive Bayes Classifier
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
gnb = GaussianNB()

pd.set_option('display.max_rows', 500)
raw_df = pd.read_csv('../data_preprocessed/raw2.csv', index_col='SK_ID_CURR')

In [25]:
raw_df.describe()

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,-4986.120328,...,0.63878,0.06429,0.14778,7e-06,0.052317,0.887344,0.036366,0.00851,0.015873,0.048258
std,0.272419,0.722121,237123.1,402490.8,14493.454517,369279.4,0.013831,4363.988632,141275.766519,3522.886321,...,0.480355,0.24527,0.354882,0.00255,0.222666,0.316173,0.1872,0.091858,0.124983,0.214312
min,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,-24672.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,-7479.5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,-4504.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,-2010.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Step 2: Define useful features

In [26]:
# define independent attributes 
features = sorted(raw_df)
features

['AMT_ANNUITY',
 'AMT_CREDIT',
 'AMT_GOODS_PRICE',
 'AMT_INCOME_TOTAL',
 'AMT_INCOME_TOTAL_MINMAX',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'APARTMENTS_AVG',
 'APARTMENTS_MEDI',
 'APARTMENTS_MODE',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_MODE',
 'CNT_CHILDREN',
 'CNT_FAM_MEMBERS',
 'COMMONAREA_AVG',
 'COMMONAREA_MEDI',
 'COMMONAREA_MODE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'DAYS_LAST_PHONE_CHANGE',
 'DAYS_REGISTRATION',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'ELEVATORS_AVG',
 'ELEVATORS_MEDI',
 'ELEVATORS_MODE',
 'EMERGENCYSTATE_MODE',
 'ENTRANCES_AVG',
 'ENTRANCES_MEDI',
 'ENTRANCES_MODE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'FEMALE',
 'FLAG_CONT_MOBILE',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUME

In [27]:
raw_df.isnull().sum()

TARGET                                                    0
NAME_CONTRACT_TYPE                                        0
FLAG_OWN_CAR                                              0
CNT_CHILDREN                                              0
AMT_INCOME_TOTAL                                          0
AMT_CREDIT                                                0
AMT_ANNUITY                                               0
AMT_GOODS_PRICE                                           0
REGION_POPULATION_RELATIVE                                0
DAYS_BIRTH                                                0
DAYS_EMPLOYED                                             0
DAYS_REGISTRATION                                         0
DAYS_ID_PUBLISH                                           0
OWN_CAR_AGE                                          202929
FLAG_MOBIL                                                0
FLAG_EMP_PHONE                                            0
FLAG_WORK_PHONE                         

In [30]:
raw_df['DAYS_LAST_PHONE_CHANGE'].fillna(raw_df['DAYS_LAST_PHONE_CHANGE'].mean(), inplace=True)

In [32]:
# Remove apparent null and useless columns
raw_df.dropna(axis=1, inplace=True)

In [34]:
raw_df.head()

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1,Cash loans,N,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,...,0,0,1,0,0,1,0,0,0,0
100003,0,Cash loans,N,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,...,1,0,0,0,0,1,0,0,0,0
100004,0,Revolving loans,Y,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,...,0,0,1,0,0,1,0,0,0,0
100006,0,Cash loans,N,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,...,0,0,0,0,0,1,0,0,0,0
100007,0,Cash loans,N,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,...,0,0,1,0,0,1,0,0,0,0


#### Step 3: Data Splitting

In [41]:
# define independent variables / attirbutes / features
features = raw_df.columns
features = features.drop('TARGET')
# define one single target variable / label
target = ['TARGET']

# get defined training dataset
X = train_df[features]
y = train_df[target]

In [45]:
# import train split function
from sklearn.model_selection import train_test_split

# split data into 80% and 20%, put 20% in testing
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=3360)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196806 entries, 329669 to 137065
Data columns (total 82 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   NAME_CONTRACT_TYPE                                 196806 non-null  object 
 1   FLAG_OWN_CAR                                       196806 non-null  object 
 2   CNT_CHILDREN                                       196806 non-null  int64  
 3   AMT_INCOME_TOTAL                                   196806 non-null  float64
 4   AMT_CREDIT                                         196806 non-null  float64
 5   AMT_ANNUITY                                        196806 non-null  float64
 6   AMT_GOODS_PRICE                                    196806 non-null  float64
 7   REGION_POPULATION_RELATIVE                         196806 non-null  float64
 8   DAYS_BIRTH                                         196806 non-null  i

#### Modelling

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt

# function for simple tree visualization

def simple_tree_vis(model):
    plt.figure(figsize = (100,150))
    tree.plot_tree(model,ax=None, fontsize=50)
    plt.show()
    return None

simple_tree_vis(model)

In [None]:
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus
import graphviz

# function for fancy tree visualization

def tree_vis(model):
    dot_data = tree.export_graphviz(model, out_file=None, 
                      feature_names=features,  
                      class_names=['Did not survive', 'Survived'],
                      filled = True, rounded=True,  
                      special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render("titanic_decisiontree")
    return graph

# uncomment the next line for graphical representation of the decision tree
# tree_vis(model)

#### Step 4: Evaluate the model on 20% validation set

- Calculate:
  - Accuracy
  - Precision
  - Recall
  <!-- - F1 score -->
- Display confusion matrix
- Plot curves:
  - Precision-Recall curve
  - ROC curve

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report
# from sklearn.metrics import roc_curve, precision_recall_curve, auc

def evaluate_model(model):
    
    # training
    print("---------- Evaluation ----------\n")
    print("Evaluation: Training")
    preds = model.predict(X_train)

    # output all metrics scores
    print("\tAccuracy:", accuracy_score(y_train, preds, normalize=True, sample_weight=None))
    # print("Precision:", precision_score(truth, preds, sample_weight=None))
    # print("Recall:", recall_score(truth, preds, sample_weight=None))

    # display confusion matrix
    print("\tConfusion matrix:\n", confusion_matrix(y_train, preds))
    
    # print classification report
    print("\tClassification report:\n", classification_report(y_train, preds))
    
    
    # validation
    print("Evaluation: Validation")
    preds = model.predict(X_val)

    # output all metrics scores
    print("\tAccuracy:", accuracy_score(y_val, preds, normalize=True, sample_weight=None))
    # print("Precision:", precision_score(truth, preds, sample_weight=None))
    # print("Recall:", recall_score(truth, preds, sample_weight=None))

    # display confusion matrix
    print("\tConfusion matrix:\n", confusion_matrix(y_val, preds))
    
    # print classification report
    print("\tClassification report:\n", classification_report(y_val, preds))
    
    
    return None

In [None]:
# evaluate model

evaluate_model(model)

Since the difference between training and validation accuracy is substantial and the training accuracy is extremely close to 100%, it is safe to conclude that severe overfitting occured in this model with default hyperparameters.  
There are 3 methods to reduce overfitting:
- Hyperparameter tuning (manual)
- Cross validation
- Hyperparameter tuning (via GridSearchCV)

### Manual Hyperparameter Tuning

##### max_depth = 8

In [None]:
model1 = create_model(max_depth=8)
evaluate_model(model1)

##### max_leaf_nodes = 50

In [None]:
model2 = create_model(max_leaf_nodes=50)
evaluate_model(model2)

##### min_samples_split = 2

In [None]:
model3 = create_model(min_samples_split=2)
evaluate_model(model3)

##### min_samples_leaf = 6

In [None]:
model4 = create_model(min_samples_leaf=6)
evaluate_model(model4)

##### min_impurity_decrease = 0.05

In [None]:
model5 = create_model(min_impurity_decrease=0.05)
evaluate_model(model5)

##### Combination of hyperparameters above
- max_depth = 8
- max_leaf_nodes = 50
- min_samples_split = 2
- min_samples_leaf = 6
<!-- - min_impurity_decrease = 0.1 -->

In [None]:
model6 = create_model(max_depth=8, 
                      max_leaf_nodes=50,
                      min_samples_split=2,
                      min_samples_leaf=6)
evaluate_model(model6)

### 10-fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

score_cv = cross_val_score(model6, X, y, cv=10)
print("CV results:", score_cv)
print("Mean =", score_cv.mean())

### GridSearchCV

In [None]:
# create try_grid
try_grid = [{'max_depth': np.arange(3, 16),     # 3 to 15
             'max_leaf_nodes': np.arange(1, 19)*5,      # 5, 10, 15, ..., 90
             'min_samples_split': np.arange(2, 7),     # 2 - 6
             'min_samples_leaf': np.arange(3, 10),      # 3 - 9
             'min_impurity_decrease': np.linspace(0, 0.225, 8),}]        # 0, 0.025, 0.05, etc., 0.2

In [None]:
from sklearn.model_selection import GridSearchCV

# create GridSearchCV object
DTM = GridSearchCV(DecisionTreeClassifier(random_state=3360), param_grid=try_grid, cv=10, verbose=1)

In [None]:
DTM.fit(X, y)

print("Best params:", DTM.best_params_)
print("Best score :", DTM.best_score_)

In [None]:
# create instance of best model
best_model = create_model(**DTM.best_params_)

evaluate_model(best_model)
simple_tree_vis(best_model)

### Generate data file for prediction results

In [None]:
# create dataframe for prediction results
preds = pd.DataFrame(index=test_df.index, columns=['Survived'])

# store prediction results of best model into dataframe
preds['Survived'] = best_model.predict(test_df[features])

# export to csv file
preds.to_csv('prediction.csv')

preds.describe()

### Conclusion and findings

The results are fairly predictive.

## This is the end of Part 2.3 Naive Bayes Classifier.