# Import libraries

In [42]:
import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Read data

In [43]:
train_dataset = pd.read_csv('train.csv')
print('================================ Data Train ================================')
print(f'Number of rows: {len(train_dataset)}')
train_dataset.head()

Number of rows: 70001


Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [44]:
private_test_dataset = pd.read_csv('private_test.csv')
print('================================ Private Data Test ================================')
print(f'Number of rows: {len(private_test_dataset)}')
private_test_dataset.head()

Number of rows: 18903


Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,20490,Male,Loyal Customer,57,Business travel,Business,1688,1,1,1,...,1,4,4,4,5,2,4,3,0,0.0
1,112979,Female,Loyal Customer,41,Personal Travel,Eco Plus,1242,3,5,2,...,4,5,5,2,3,4,5,4,0,0.0
2,95660,Female,Loyal Customer,50,Personal Travel,Eco,928,2,5,2,...,5,3,3,2,3,5,3,3,0,0.0
3,85302,Male,Loyal Customer,45,Personal Travel,Eco,813,3,2,3,...,4,4,1,5,4,4,4,4,0,0.0
4,127959,Female,disloyal Customer,46,Business travel,Eco,1061,1,1,1,...,4,4,1,2,3,3,4,4,79,69.0


In [45]:
public_test_dataset = pd.read_csv('public_test.csv')
print('================================ Private Data Test ================================')
print(f'Number of rows: {len(public_test_dataset)}')
public_test_dataset.head()

Number of rows: 15000


Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,26978,Male,Loyal Customer,9,Business travel,Business,1722,2,2,2,...,2,2,2,3,4,1,4,2,8,2.0
1,77760,Male,Loyal Customer,19,Business travel,Business,3279,4,4,4,...,4,4,5,4,4,5,5,4,0,0.0
2,32065,Female,Loyal Customer,32,Business travel,Business,216,1,3,1,...,5,5,2,3,5,5,5,5,0,0.0
3,101454,Male,disloyal Customer,22,Business travel,Business,345,4,0,4,...,2,3,3,2,5,5,4,3,116,113.0
4,44417,Female,Loyal Customer,37,Business travel,Eco Plus,265,2,5,5,...,2,2,2,4,3,1,2,2,0,21.0


In [46]:
test_dataset = pd.concat([public_test_dataset, private_test_dataset], ignore_index=True)
print('================================ Data Test ================================')
print(f'Number of rows: {len(test_dataset)}')
test_dataset.head()

Number of rows: 33903


Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,26978,Male,Loyal Customer,9,Business travel,Business,1722,2,2,2,...,2,2,2,3,4,1,4,2,8,2.0
1,77760,Male,Loyal Customer,19,Business travel,Business,3279,4,4,4,...,4,4,5,4,4,5,5,4,0,0.0
2,32065,Female,Loyal Customer,32,Business travel,Business,216,1,3,1,...,5,5,2,3,5,5,5,5,0,0.0
3,101454,Male,disloyal Customer,22,Business travel,Business,345,4,0,4,...,2,3,3,2,5,5,4,3,116,113.0
4,44417,Female,Loyal Customer,37,Business travel,Eco Plus,265,2,5,5,...,2,2,2,4,3,1,2,2,0,21.0


# Preprocess Data

In [47]:
features = ['Class', 'On-board service', 'Ease of Online booking', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Inflight wifi service', 'Departure/Arrival time convenient', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Gate location']
target = ['satisfaction']
X = train_dataset[features]
y = train_dataset[target]
X_submit = test_dataset[features]

## Encode data

In [48]:
for col in X.columns:
    first_valid_idx = X[col].first_valid_index()
    X[col].fillna(X.at[first_valid_idx, col], inplace=True)
    X_submit[col].fillna(X.at[first_valid_idx, col], inplace=True)
    if X[col].dtype == 'object':
        label_encoder = LabelEncoder()
        label_encoder.fit(X[col])
        X[col] = label_encoder.transform(X[col])
        X_submit[col] = label_encoder.transform(X_submit[col])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col].fillna(X.at[first_valid_idx, col], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_submit[col].fillna(X.at[first_valid_idx, col], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = label_encoder.transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

## Train test split

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Multinomial NaiveBayes

In [50]:
mnb_model = MultinomialNB()
model = mnb_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [51]:
mnb_model.score(X_test, y_test)

0.6009570744946789

# SVM

In [52]:
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
X_train_std = standard_scaler.transform(X_train)
X_test_std = standard_scaler.transform(X_test)
X_submit_std = standard_scaler.transform(X_submit)
# Thầy cho bộ dữ liệu là supprise.csv
# sp_dataset = pd.read_csv('supprise.csv')
# X_sp = sp_dataset[feature]
# y_sp = sp_dataset[target]
# for col in X.columns:
#     first_valid_idx = X[col].first_valid_index()
#     X[col].fillna(X.at[first_valid_idx, col], inplace=True)
#     X_submit[col].fillna(X.at[first_valid_idx, col], inplace=True)
#     X_sp[col].fillna(X.at[first_valid_idx, col], inplace=True)
#     if X[col].dtype == 'object':
#         label_encoder = LabelEncoder()
#         label_encoder.fit(X[col])
#         X[col] = label_encoder.transform(X[col])
#         X_submit[col] = label_encoder.transform(X_submit[col])
#         X_sp[col] = label_encoder.transform(X_sp[col])
# mnb_model.predict(X_sp)
# X_sp_std = standard_scaler.transform(X_sp)
# svm_model.predict(X_sp_std)

In [53]:
svm_model = SVC()
svm_model.fit(X_train_std, y_train)

  y = column_or_1d(y, warn=True)


In [54]:
svm_model.score(X_test_std, y_test)

0.9512891936290265

# Evaluate

In [55]:
y_pred_mnb = mnb_model.predict(X_test)
print("Multinomial Naive Bayes:")
print(classification_report(y_test, y_pred_mnb))

Multinomial Naive Bayes:
                         precision    recall  f1-score   support

neutral or dissatisfied       0.66      0.61      0.63      7899
              satisfied       0.54      0.58      0.56      6102

               accuracy                           0.60     14001
              macro avg       0.60      0.60      0.60     14001
           weighted avg       0.61      0.60      0.60     14001



In [56]:
y_pred_svm = svm_model.predict(X_test_std)
print("Support Vector Machine:")
print(classification_report(y_test, y_pred_svm))

Support Vector Machine:
                         precision    recall  f1-score   support

neutral or dissatisfied       0.95      0.97      0.96      7899
              satisfied       0.96      0.93      0.94      6102

               accuracy                           0.95     14001
              macro avg       0.95      0.95      0.95     14001
           weighted avg       0.95      0.95      0.95     14001



# Submit

In [57]:
y_public_test_prediction = svm_model.predict(X_submit_std)

In [58]:
public_results = []
for i in range(len(y_public_test_prediction)):
    public_results.append((test_dataset['id'].tolist()[i], y_public_test_prediction[i]))

In [59]:
public_submission_df = pd.DataFrame(public_results, columns=['id','satisfaction'])
public_submission_df.to_csv('submission.csv', index=False)  