In [50]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import StandardScaler

# load the dataset
df = pd.read_csv("test.csv")

# check the first few rows
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [51]:
print("Missing values:\n", df.isnull().sum())
print("\nNumber of duplicate rows:", df.duplicated().sum())
df = df.drop_duplicates()

Missing values:
 Unnamed: 0                            0
id                                    0
Gender                                0
Customer Type                         0
Age                                   0
Type of Travel                        0
Class                                 0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             83
satisfaction           

In [52]:
print(df.columns.tolist())

df = df.drop(columns=['Unnamed: 0', 'id'])

print(df.columns.tolist())

df_new = df.copy()


['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'satisfaction']
['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'satisfaction']


In [53]:
# fill missing 'Arrival Delay in Minutes' column values with the mean value 
df_new['Arrival Delay in Minutes'] = df_new['Arrival Delay in Minutes'].fillna(df_new['Arrival Delay in Minutes'].mean()) 

# Gender: Female = 1, Male = 0
df_new['Gender'] = df_new['Gender'].replace({'Female': 1, 'Male': 0}).astype(int)
# print(df_new['Gender'].head(10).to_list())

# Customer Type: Loyal Customer = 1, disoyal Customer = 0
df_new['Customer Type'] = df_new['Customer Type'].replace({'Loyal Customer': 1, 'disloyal Customer': 0}).astype(int)
# print(df_new['Customer Type'].head(10).to_list())

# Type of Travel: Business travel = 1, Personal Travel = 0
df_new['Type of Travel'] = df_new['Type of Travel'].replace({'Business travel': 1, 'Personal Travel': 0}).astype(int)
# print(df_new['Type of Travel'].head(30).to_list())

# satisfaction: satisfied = 1, neutral or dissatisfied = 0
df_new['satisfaction'] = df_new['satisfaction'].replace({'satisfied': 1, 'neutral or dissatisfied': 0}).astype(int)
# print(df_new['satisfaction'].head(30).to_list())

# Class: Business = (1, 0), Eco = (0, 1), Other = (0, 0)
df_new['Class Business'] = (df_new['Class'] == 'Business').astype(int)
df_new['Class Eco'] = (df_new['Class'] == 'Eco').astype(int)
df_new = df_new.drop(columns=['Class'])
# print(df_new.columns.tolist())



  df_new['Gender'] = df_new['Gender'].replace({'Female': 1, 'Male': 0}).astype(int)
  df_new['Customer Type'] = df_new['Customer Type'].replace({'Loyal Customer': 1, 'disloyal Customer': 0}).astype(int)
  df_new['Type of Travel'] = df_new['Type of Travel'].replace({'Business travel': 1, 'Personal Travel': 0}).astype(int)
  df_new['satisfaction'] = df_new['satisfaction'].replace({'satisfied': 1, 'neutral or dissatisfied': 0}).astype(int)


In [None]:
# scale numeric features 
scaler = StandardScaler()
y = df_new['satisfaction']
X = df_new.drop(columns=['satisfaction'])
X_scaled = scaler.fit_transform(X)

In [56]:
# split into training, validation, and test sets 
from sklearn.model_selection import train_test_split
# training data 80%, testing data 20%
X_train, X_testing, y_train, y_testing = train_test_split(X_scaled, y, train_size=0.8, random_state=42, shuffle=True)
# validation and test data is 10% each
X_val, X_test, y_val, y_test = train_test_split(X_testing, y_testing, train_size=0.5, random_state=42, shuffle=True)

In [60]:
# print(df_new.columns.tolist())
print(X_train[:20])

[[ 0.98593202  0.47603968  1.8750118  -1.50743517 -0.52749278 -2.04046047
   0.62164094 -1.95111359 -0.76209981  1.34034786  1.28242204  0.41723532
  -1.01455585 -1.08080722 -2.54024532 -0.53823824 -0.24751653 -1.39689295
  -0.21695197 -0.38228641 -0.39354257 -0.9627357   1.11636981]
 [-1.01426871 -2.10066521 -0.70172986  0.66337845 -0.20906761  0.2061275
  -0.0305297   0.17214266  0.0178657   0.58930433 -0.19303782  0.41723532
   0.47990687  1.25917014 -0.26551378  1.16171635  1.32814563  1.14406253
   0.54102306 -0.32884254 -0.39354257  1.03870668 -0.89576051]
 [-1.01426871  0.47603968 -0.70172986  0.66337845  1.51223071 -0.54273516
  -0.68270034  0.17214266 -0.76209981  0.58930433  0.54469211  0.41723532
   1.22713822 -0.30081477  0.49273006  0.31173905  0.54031455 -0.54990779
   0.54102306 -0.32884254 -0.15326596  1.03870668 -0.89576051]
 [ 0.98593202  0.47603968 -2.02313583 -1.50743517  1.046609   -1.29159781
   1.27381157 -1.24336151 -1.54206532 -1.66382626 -1.66849767 -1.8553804

In [61]:
# store feature names
features = X.columns.to_list()
print(features)

['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient', 'Ease of Online booking', 'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'Class Business', 'Class Eco']
