This is a print-out of the entire code used to train and test the best-performing SVM and MLP models. 

In [2]:
# Importing libraries
import pickle
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from skorch import NeuralNetClassifier
from skorch.callbacks import EarlyStopping
from torch.utils.data import DataLoader, TensorDataset

%matplotlib inline
warnings.filterwarnings("ignore")

sns.set_style('whitegrid')
plt.style.use('ggplot')

# Preprocessing data

In [3]:
# Importing data
train_data = pd.read_csv('train_data.csv')  # this dataset is saved as 'train.csv' in Kaggle
test_data = pd.read_csv('test_data.csv')  # this dataset is saved as 'test.csv' in Kaggle

# Combining train and test sets to have the entire dataset
data = pd.concat([test_data, train_data], axis=0)
# Dropping the first two columns (selected by position) as irrelevant
data = data.drop(columns=data.columns[[0, 1]])

# Categorical columns
# NOTE(review): depending on the pandas version, assigning through .iloc may
# keep the original dtype of these columns instead of switching them to
# 'category', which decides whether they end up in numerical_columns and get
# min-max scaled below -- confirm the intended treatment of columns 6-19.
categorical_indexes = list(range(6, 20))
data.iloc[:, categorical_indexes] = data.iloc[:, categorical_indexes].astype('category')
for column in ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']:
    data[column] = data[column].astype('category')

# Handling missing data: a missing arrival delay is replaced by the departure
# delay of the same row. The result is assigned back explicitly because an
# in-place fillna on a column selection is not guaranteed to modify `data`.
data['Arrival Delay in Minutes'] = data['Arrival Delay in Minutes'].fillna(
    data['Departure Delay in Minutes']
)

# Numerical and categorical columns (decided by dtype)
numerical_columns = [c for c in data.columns if data[c].dtype.name != 'category']
categorical_columns = [c for c in data.columns if data[c].dtype.name == 'category']

# Encoding categorical columns as integers
category_codes = {
    "Gender": {"Male": 0, "Female": 1},
    "Customer Type": {"disloyal Customer": 0, "Loyal Customer": 1},
    "Type of Travel": {"Personal Travel": 0, "Business travel": 1},
    "Class": {"Eco": 0, "Eco Plus": 1, "Business": 2},
    "satisfaction": {"neutral or dissatisfied": 0, "satisfied": 1},
}
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

# Numerical columns: rescale every numerical feature to the [0, 1] range
# NOTE(review): the scaler is fitted on the combined dataset; if this frame is
# what later gets split into train and test sets, the test rows influence the
# scaling -- confirm this is acceptable.
data_numerical = data[numerical_columns]
scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Dropping columns that are not used as model inputs
data = data.drop(['Gender', 'Departure/Arrival time convenient', 'Gate location',
                  'Departure Delay in Minutes', 'Arrival Delay in Minutes'], axis=1)

# Preview the result instead of rendering the whole frame
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train_data.csv'

# Importing data

In [None]:
# Load the fully preprocessed dataset from disk
PREPROCESSED_CSV = 'data.csv'
data = pd.read_csv(PREPROCESSED_CSV)

In [None]:
# Separate the model inputs (first 17 columns) from the target column
X = data.iloc[:, :17]
y = data['satisfaction']

# Hold out 20 % of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

# Support Vector Machines (SVM)

## Training SVM model

In [None]:
# Hyperparameter tuning
# Parameter grid, given as one sub-grid per kernel.
# gamma is a kernel coefficient that the linear kernel does not use, so it is
# only searched for the RBF kernel. Crossing it with 'linear' would train every
# linear candidate twice with identical results.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
    },
]

In [None]:
# Base estimator whose hyperparameters are tuned below
svm_classifier = SVC()

# Exhaustive search over param_grid, scored by accuracy with 10-fold
# cross-validation and run in parallel on all available cores
grid_search_svm = GridSearchCV(
    estimator=svm_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search_svm.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the grid search
best_params_svm = grid_search_svm.best_params_

# Estimator exposed by the grid search for that combination
best_model_svm = grid_search_svm.best_estimator_

print("Best Parameters:", best_params_svm)

In [None]:
# Save best performing model using pickle
# The context manager closes the file even if pickling fails, so the handle is
# not leaked and the data is flushed to disk before the model is loaded again.
filename = 'best_svm.pkl'

with open(filename, 'wb') as model_file:
    pickle.dump(best_model_svm, model_file)

## Testing SVM model

In [None]:
# Restore the best-performing SVM model from disk.
# Unpickling can execute arbitrary code, so only load files created by this notebook.
with open('best_svm.pkl', 'rb') as model_file:
    best_model_svm = pickle.load(model_file)

In [None]:
# Testing on test set
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock, which can be adjusted
# while the prediction is running.
start_time = time.perf_counter()

# Make predictions on test data
y_pred_svm = best_model_svm.predict(X_test)

end_time = time.perf_counter()

# Compute testing time
testing_time = end_time - start_time
print("Testing Time:", testing_time, "seconds")

In [None]:
# Evaluating model
# Threshold-based metrics are computed from the hard class predictions
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

# ROC AUC measures how well the model ranks samples, so it needs continuous
# scores. With hard 0/1 predictions the curve has a single operating point and
# the area is understated. The signed distance to the separating hyperplane is
# the SVM's natural score.
y_score_svm = best_model_svm.decision_function(X_test)
roc_auc_svm = roc_auc_score(y_test, y_score_svm)

print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_svm)
print("ROC AUC Score:", roc_auc_svm)

# Creating DataFrame with evaluation metrics
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score'],
    'Score': [accuracy_svm, precision_svm, recall_svm, f1_svm, roc_auc_svm]
})

# Show the summary table as the cell output
metrics_df

In [None]:
# Confusion matrix of actual vs. predicted classes on the test set
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)

# Render it as an annotated heatmap
class_names = ['Neutral or dissatisfied', 'Satisfied']
tick_positions = [0.5, 1.5]  # centres of the heatmap cells

fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(conf_matrix_svm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Replace the default 0/1 tick labels with the class names
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_names)

plt.show()

In [None]:
# Calculate ROC curve and AUC
# The curve is traced by sweeping a threshold over continuous scores. Hard 0/1
# predictions would give a single operating point joined to the corners by
# straight lines, so the SVM's decision-function values are used instead.
y_score_svm = best_model_svm.decision_function(X_test)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_svm, tpr_svm, color='blue', lw=2,
         label='ROC curve (area = %0.2f)' % roc_auc_svm)
# Diagonal reference line of a random classifier
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Multilayer Perceptron (MLP)

## Training MLP model

In [None]:
# Further preprocessing of the data so it can be fed to the MLP model
# Inputs are the first 17 columns, the target is the 'satisfaction' column
X = data.iloc[:, :17]
y = data['satisfaction']

# Encode the labels as integers
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Convert both to float32 PyTorch tensors; the target gets a trailing
# dimension so that it is 2D like the inputs
X = torch.tensor(X.values, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

# Hold out 20 % of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

In [None]:
# Define the MLP model
class MLP_layers(nn.Module):
    """Fully connected feed-forward network with a sigmoid output.

    Layout: one input layer (input_size -> hidden_size), followed by
    ``num_hidden_layers`` layers of shape hidden_size -> hidden_size, followed
    by an output layer (hidden_size -> output_size). ``activation`` is applied
    after the input layer and after every hidden layer; the output layer is
    passed through a sigmoid.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output units.
        num_hidden_layers: Number of hidden_size -> hidden_size layers placed
            after the input layer (0 is allowed).
        hidden_size: Width of the input layer's output and of every hidden layer.
        activation: Callable applied to the output of the input and hidden layers.
        weight_init: Weight initialisation scheme. ``'he'`` uses He (Kaiming)
            normal initialisation and ``'xavier'`` uses Xavier (Glorot) uniform
            initialisation; both set the biases to zero. Any other value
            (including ``None``) keeps PyTorch's default ``nn.Linear``
            initialisation.
    """

    def __init__(self, input_size, output_size, num_hidden_layers, hidden_size, activation, weight_init):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.activation = activation
        self.weight_init = weight_init

        # Define input layer
        self.input_layer = nn.Linear(input_size, hidden_size)

        # Define hidden layers
        self.hidden_layers = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)
        )

        # Define output layer
        self.output_layer = nn.Linear(hidden_size, output_size)

        # Apply the requested initialisation scheme; previously weight_init was
        # stored but never used, so the setting had no effect on the model.
        self._initialise_weights()

    def _initialise_weights(self):
        """Initialise every linear layer according to ``self.weight_init``."""
        scheme = self.weight_init.lower() if isinstance(self.weight_init, str) else None
        if scheme not in ('he', 'xavier'):
            # Unknown or missing scheme: keep PyTorch's default initialisation
            return

        for layer in [self.input_layer, *self.hidden_layers, self.output_layer]:
            if scheme == 'he':
                nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')
            else:
                nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run the forward pass and return sigmoid outputs of shape (N, output_size)."""
        # Flatten input tensor so that each row holds input_size features
        x = x.view(-1, self.input_size)

        # Forward pass through input layer
        x = self.activation(self.input_layer(x))

        # Forward pass through hidden layers
        for hidden_layer in self.hidden_layers:
            x = self.activation(hidden_layer(x))

        # Forward pass through output layer with sigmoid activation
        return torch.sigmoid(self.output_layer(x))

In [None]:
# Defining model with skorch
def sigmoid_to_class_probabilities(positive_proba):
    """Expand a one-column sigmoid output into [P(class 0), P(class 1)] columns.

    NeuralNetClassifier.predict() picks the column with the highest value
    returned by predict_proba(). With a single output column that is always
    column 0, so every sample would be predicted as class 0. Returning both
    class probabilities makes predict() (and the accuracy used for model
    selection) meaningful. A named function is used instead of a lambda so the
    fitted net can still be pickled.
    """
    return torch.cat([1 - positive_proba, positive_proba], dim=1)


mlp = NeuralNetClassifier(
    MLP_layers,
    module__input_size=X_train.shape[1],
    module__output_size=1,
    criterion=nn.BCELoss(), # binary cross entropy loss
    optimizer=optim.Adam, # Adam optimiser
    max_epochs=100,
    # Only applied at prediction time; the loss still sees the raw sigmoid output
    predict_nonlinearity=sigmoid_to_class_probabilities,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
# Hyperparameter tuning
# Search space for the MLP grid search; the module__ / optimizer__ prefixes
# are part of each parameter name
param_grid2 = dict(
    module__num_hidden_layers=[1, 2],    # Number of hidden layers
    module__hidden_size=[16, 32],        # Size of each hidden layer
    optimizer__lr=[0.001, 0.01],         # Learning rate
    batch_size=[32],                     # Batch size
    max_epochs=[30],                     # Number of epochs
    optimizer__weight_decay=[0],         # Regularisation
    module__activation=[F.relu, F.tanh], # Activation function
    module__weight_init=['he'],          # Weight initialisation method
)

In [None]:
# Exhaustive search over param_grid2 with 10-fold cross-validation, run in
# parallel on all cores; a failing fit raises instead of being scored
grid_search_mlp = GridSearchCV(
    mlp,
    param_grid2,
    cv=10,
    n_jobs=-1,
    error_score='raise',
)

# Run the search on the training split only
grid_search_mlp.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the grid search
best_params_mlp = grid_search_mlp.best_params_

# Estimator exposed by the grid search for that combination
best_model_mlp = grid_search_mlp.best_estimator_

print("Best Parameters:", best_params_mlp)

In [None]:
# Save best performing model using pickle
# The context manager closes the file even if pickling fails, so the handle is
# not leaked and the data is flushed to disk before the model is loaded again.
filename = 'best_mlp.pkl'

with open(filename, 'wb') as model_file:
    pickle.dump(best_model_mlp, model_file)

## Testing MLP model

In [1]:
# Restore the best-performing MLP model from disk.
# Unpickling can execute arbitrary code, so only load files created by this notebook.
with open('best_mlp.pkl', 'rb') as model_file:
    best_model_mlp = pickle.load(model_file)

NameError: name 'pickle' is not defined

In [None]:
# Testing on test set
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time, unlike the adjustable wall clock behind time.time().
start_time = time.perf_counter()

# Make predictions on test data.
# The network has one sigmoid output unit, so the class is obtained by
# thresholding the probability of the positive class. The classifier's own
# predict() takes an argmax over the output columns, which is always 0 when
# there is a single column. The last column is the positive-class probability
# both for a one-column and for a two-column predict_proba() result.
positive_proba_mlp = best_model_mlp.predict_proba(X_test)[:, -1]
y_pred_mlp = (positive_proba_mlp > 0.5).astype(int)

end_time = time.perf_counter()

# Compute testing time
testing_time = end_time - start_time
print("Testing Time:", testing_time, "seconds")

In [None]:
# Evaluating model
# (the metric functions are already imported in the imports cell at the top)
# Threshold-based metrics are computed from the hard class predictions
accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
precision_mlp = precision_score(y_test, y_pred_mlp)
recall_mlp = recall_score(y_test, y_pred_mlp)
f1_mlp = f1_score(y_test, y_pred_mlp)

# ROC AUC measures how well the model ranks samples, so it needs continuous
# scores. With hard 0/1 predictions the curve has a single operating point and
# the area is understated. The last predict_proba() column is the probability
# of the positive class.
y_score_mlp = best_model_mlp.predict_proba(X_test)[:, -1]
roc_auc_mlp = roc_auc_score(y_test, y_score_mlp)

print("Accuracy:", accuracy_mlp)
print("Precision:", precision_mlp)
print("Recall:", recall_mlp)
print("F1 Score:", f1_mlp)
print("ROC AUC Score:", roc_auc_mlp)

In [None]:
# Confusion matrix of actual vs. predicted classes on the test set
conf_matrix_mlp = confusion_matrix(y_test, y_pred_mlp)

# Render it as an annotated heatmap
class_names = ['Neutral or dissatisfied', 'Satisfied']
tick_positions = [0.5, 1.5]  # centres of the heatmap cells

fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(conf_matrix_mlp, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# Replace the default 0/1 tick labels with the class names
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_names)

plt.show()


In [None]:
# Calculate ROC curve and AUC
# The curve is traced by sweeping a threshold over continuous scores. Hard 0/1
# predictions would give a single operating point joined to the corners by
# straight lines, so the predicted probability of the positive class (last
# predict_proba() column) is used instead.
y_score_mlp = best_model_mlp.predict_proba(X_test)[:, -1]
fpr_mlp, tpr_mlp, _ = roc_curve(y_test, y_score_mlp)
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_mlp, tpr_mlp, color='red', lw=2,
         label='ROC curve (area = %0.2f)' % roc_auc_mlp)
# Diagonal reference line of a random classifier
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

In [None]:
# ROC curves and AUC of the SVM and the MLP on one set of axes
fig, ax = plt.subplots(figsize=(8, 6))

# SVM curve
ax.plot(fpr_svm, tpr_svm, color='blue', lw=2,
        label='SVM ROC curve (area = %0.2f)' % roc_auc_svm)
# MLP curve
ax.plot(fpr_mlp, tpr_mlp, color='red', lw=2,
        label='MLP ROC curve (area = %0.2f)' % roc_auc_mlp)

# Diagonal reference line (random classifier)
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')

# Axis limits, labels, title and legend
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc='lower right')
plt.show()