DATA PREPROCESSING

In [None]:
import pandas as pd

# Integer code assigned to every text category, keyed by column name.
category_codes = {
    'satisfaction': {'dissatisfied': 0, 'satisfied': 1},
    'Gender': {'Male': 0, 'Female': 1},
    'Customer Type': {'disloyal Customer': 0, 'Loyal Customer': 1},
    'Type of Travel': {'Business travel': 0, 'Personal Travel': 1},
    'Class': {'Eco': 0, 'Eco Plus': 1, 'Business': 2},
}

# Load the survey data and discard every row that has a missing value.
df = pd.read_csv('Invistico_Airline.csv').dropna()

# Replace the text categories with their integer codes, one column at a time.
for column_name, codes in category_codes.items():
    df[column_name] = df[column_name].map(codes)

# Rows with missing values were dropped above, so the delay can be cast to int.
df['Arrival Delay in Minutes'] = df['Arrival Delay in Minutes'].astype('int')

print(df.head())

SPLIT DATA

In [12]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature matrix.
target = 'satisfaction'
y = df[target]
X = df.drop(columns=[target])

# First cut: 70% for training, 30% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Second cut: the held-out 30% is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

TRAIN MODEL

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Random forest of 100 trees with a fixed seed, fitted on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Predict both held-out splits with the fitted model.
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Print one classification report per split, validation first.
for heading, y_true, y_hat in (
    ("Validation Set Classification Report:", y_val, y_val_pred),
    ("Test Set Classification Report:", y_test, y_test_pred),
):
    print(heading)
    print(classification_report(y_true, y_hat))

Validation Set Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      8840
           1       0.97      0.95      0.96     10583

    accuracy                           0.96     19423
   macro avg       0.95      0.96      0.96     19423
weighted avg       0.96      0.96      0.96     19423

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      8799
           1       0.97      0.96      0.96     10625

    accuracy                           0.96     19424
   macro avg       0.96      0.96      0.96     19424
weighted avg       0.96      0.96      0.96     19424



In [16]:
import pickle
# Persist the fitted model in the working directory so it can be loaded later.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

CROSS VALIDATION

In [14]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy, computed on the training split only.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print("Cross-validated accuracy scores:", scores)
print("Average accuracy:", scores.mean())

Cross-validated accuracy scores: [0.95415931 0.95233892 0.95382833 0.9559797  0.95465578]
Average accuracy: 0.9541924095322154


TEST

In [15]:
# Sample input: X is a single-row feature vector built from the inputs listed below.
# Output: y is 0 or 1 (0 = dissatisfied, 1 = satisfied).
# Categorical inputs use the same integer codes as the preprocessing cell.
sample = pd.DataFrame([{
    'Gender' : 0,                               # 0 = Male, 1 = Female
    'Customer Type' : 0,                        # 0 = disloyal Customer, 1 = Loyal Customer
    'Age' : 35,
    'Type of Travel' : 0,                       # 0 = Business travel, 1 = Personal Travel
    'Class' : 0,                                # 0 = Eco, 1 = Eco Plus, 2 = Business
    'Flight Distance' : 1500,
    'Seat comfort' : 0,
    'Departure/Arrival time convenient' : 3,
    'Food and drink' : 1,
    'Gate location' : 2,
    'Inflight wifi service' : 4,
    'Inflight entertainment' : 3,
    'Online support' : 2,
    'Ease of Online booking' : 4,
    'On-board service' : 3,
    'Leg room service' : 2,
    'Baggage handling' : 3,
    'Checkin service' : 4,
    'Cleanliness' : 3,
    'Online boarding' : 4,
    'Departure Delay in Minutes' : 0,
    'Arrival Delay in Minutes' : 0,
}])

# The model was fitted on integer-coded columns, so the sample must NOT be
# one-hot encoded. pd.get_dummies(..., drop_first=True) on a one-row frame
# removes each categorical column entirely, and refilling the gaps with 0
# would silently replace whatever value was entered for it.
missing_features = [col for col in model.feature_names_in_ if col not in sample.columns]
if missing_features:
    # Fail loudly: filling a missing feature with 0 would feed the model a made-up value.
    raise ValueError(f"Sample is missing model features: {missing_features}")

# Ensure correct column order
sample_encoded = sample[list(model.feature_names_in_)]

# Predict
prediction = model.predict(sample_encoded)
print("Predicted satisfaction:", "Satisfied" if prediction[0] == 1 else "Not Satisfied")


Predicted satisfaction: Satisfied


In [None]:
# Build home.html so users can enter their data and get a prediction.
# The page calls model.pkl to make the prediction.
# How to call it:
import os
import pickle  # imported here too so this cell does not rely on an earlier cell's import

# Load model
# NOTE: pickle.load can execute arbitrary code; only load a model.pkl you produced yourself.
current_dir = os.getcwd()
pickle_path = os.path.join(current_dir, 'model.pkl')
with open(pickle_path, 'rb') as f:
    model = pickle.load(f)


# Mapping input data: text category -> integer code, identical to the codes
# used in the preprocessing cell.
INPUT_CODES = {
    'Gender': {'Male': 0, 'Female': 1},
    'Customer Type': {'disloyal Customer': 0, 'Loyal Customer': 1},
    'Type of Travel': {'Business travel': 0, 'Personal Travel': 1},
    'Class': {'Eco': 0, 'Eco Plus': 1, 'Business': 2},
}

# Mapping output data: predicted class -> label.
OUTPUT_LABELS = {0: 'dissatisfied', 1: 'satisfied'}


def encode_inputs(raw):
    """Return a copy of ``raw`` with text categories replaced by integer codes.

    The input frame is never modified. Columns listed in ``INPUT_CODES`` that
    are absent from ``raw``, or that already hold only the integer codes, are
    left as they are, so encoding an already-encoded frame changes nothing.

    Raises:
        ValueError: if a column contains a value that is neither a known
            category nor a full set of valid codes.
    """
    encoded = raw.copy()
    for column, codes in INPUT_CODES.items():
        if column not in encoded.columns:
            continue
        values = encoded[column]
        # Already encoded: mapping again would turn every value into NaN.
        if values.isin(list(codes.values())).all():
            continue
        mapped = values.map(codes)
        if mapped.isna().any():
            unknown = sorted(set(values[mapped.isna()].astype(str)))
            raise ValueError(f"Unknown values in column {column!r}: {unknown}")
        encoded[column] = mapped
    return encoded


# Function that predicts by calling the loaded model.
def predict(X):
    """Return the model's predicted classes (0 or 1) for the encoded features ``X``."""
    result = model.predict(X)
    return result


# Predictions and labels go into new variables so that the training frame `df`
# and the true target `y` from the earlier cells are not overwritten.
X_encoded = encode_inputs(X)
y_pred = predict(X_encoded)
predicted_labels = pd.Series(y_pred, index=X_encoded.index).map(OUTPUT_LABELS)
# Show whether the passenger is satisfied or not (predicted_labels holds the text labels).
# Biểu diễn xem có hài lòng hay không
