In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *

# Define a helper that computes the evaluation metrics used later
def classification_metrics(Y_pred, Y_true):
    """Compute five evaluation scores for a set of predictions.

    Note the argument order: predictions come first and ground truth second,
    the reverse of the scikit-learn scorers, which are called here as
    ``scorer(Y_true, Y_pred)``.

    Args:
        Y_pred: Predicted values, forwarded unchanged to every scorer.
        Y_true: Ground-truth values, forwarded unchanged to every scorer.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, roc_auc)``.

    NOTE(review): ``roc_auc_score`` receives the same ``Y_pred`` as the other
    scorers. If callers pass hard class labels rather than scores or
    probabilities, the value is an AUC over those labels only; confirm this
    is intended.
    """
    # Order here fixes the order of the returned tuple.
    scorers = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score,
    )
    return tuple(scorer(Y_true, Y_pred) for scorer in scorers)

# define a function for printing the metrics 
def display_metrics(classifierName,Y_pred,Y_true):
    """Print a framed text report of the metrics for one classifier.

    Args:
        classifierName: Label printed on the "Model:" line; concatenated to a
            string, so it must be a ``str``.
        Y_pred: Predicted values, passed through to ``classification_metrics``.
        Y_true: Ground-truth values, passed through to ``classification_metrics``.

    Returns:
        None. The report is written to standard output.
    """
    divider = "_____________________________"

    # The header is printed before the scores are computed, so it still
    # appears if the metric computation raises.
    print(divider)
    print("Model: " + classifierName)

    accuracy, prec, rec, f1, roc_auc = classification_metrics(Y_pred, Y_true)
    report_rows = (
        ("Accuracy", accuracy),
        ("Precision", prec),
        ("Recall", rec),
        ("F1-score", f1),
        ("AUC", roc_auc),
    )
    for label, value in report_rows:
        print(label + ": " + str(value))

    print(divider)
    print("")

In [21]:
# Read satisfaction.csv into the DataFrame `data`.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact location exists.
data = pd.read_csv(
    filepath_or_buffer='/Users/von/Documents/SPU/202309 AUTUMN/ISM6353/Project/satisfaction.csv'
)

In [22]:
# Count the missing entries in each column of the raw frame.
# The counts are stored in `missing_values`; this cell does not display them.
missing_values = data.isna().sum()

In [23]:
# Build the modelling frame from the raw data in one chain:
#   1. drop every row that contains a missing value,
#   2. drop the 'id' column,
#   3. one-hot encode the listed categorical columns, dropping the first
#      level of each.
data_clean = (
    data
    .dropna()
    .drop(columns='id')
    .pipe(
        pd.get_dummies,
        columns=['Gender', 'Customer Type', 'Type of Travel', 'Class'],
        drop_first=True,
    )
)

In [24]:
# Name of the column to predict
target = 'satisfaction_v2'

# Features are every column except the target; the label is the target column
X = data_clean.drop(target, axis=1)
y = data_clean[target]

In [25]:
# Encode the target as a single indicator column and keep it one-dimensional.
# pd.get_dummies returns a DataFrame even when only one column is left after
# drop_first; passing that column-vector to an estimator's fit() is what makes
# scikit-learn emit its column_or_1d conversion warning. Selecting the column
# with .iloc[:, 0] hands the estimator a Series instead.
# The encoding is derived from data_clean[target] rather than from `y`, so
# re-running this cell always starts from the same input.
# NOTE(review): assumes the target has exactly two categories, so one
# indicator column remains - confirm against the data.
y = pd.get_dummies(data_clean[target], drop_first=True).iloc[:, 0]

In [26]:
# Rescale every feature to the [0, 1] range (min-max scaling).
# NOTE(review): the scaler is fit on the full feature matrix X, before the
# train/test split in the following cell.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X_scaled = scaler.transform(X)

In [27]:
# Gaussian Naive Bayes classifier with default settings
model_nb = GaussianNB()

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=101,
    test_size=0.30,
)

# Fit the classifier on the training partition
model_nb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [28]:
# Predict class labels for the held-out rows
y_pred = model_nb.predict(X_test)

# Tabulate true vs. predicted labels on the test set
confusion_matrix_results = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Show the raw confusion-matrix counts
print('confusion matrix: \n', confusion_matrix_results)

# Print the summary scores (note: display_metrics takes predictions first)
display_metrics(classifierName='Naive Bayes', Y_pred=y_pred, Y_true=y_test)

confusion matrix: 
 [[13850  3703]
 [ 3396 17898]]
_____________________________
Model: Naive Bayes
Accuracy: 0.8172574458774166
Precision: 0.8285727512615156
Recall: 0.8405184559030713
F1-score: 0.8345028558107005
AUC: 0.814778683315291
_____________________________

