In [53]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns



In [54]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1. Load the raw survey data.
df = pd.read_csv("Passenger_satisfaction.csv")

# 2. Drop exact duplicate rows.
# NOTE(review): this runs while 'Unnamed: 0' / 'id' are still present; if those
# columns are unique per row, no duplicate can ever be detected here -- confirm intent.
df = df.drop_duplicates()

# 3. Fill missing arrival delays with the column median.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: the chained in-place form operates on an intermediate object,
# is warned about by recent pandas 2.x releases, and does not update `df` at all
# once Copy-on-Write is enabled.
arrival_delay_col = 'Arrival Delay in Minutes'
df[arrival_delay_col] = df[arrival_delay_col].fillna(df[arrival_delay_col].median())

# 4. Drop identifier-like columns (ignored if they are absent).
df = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')

# 5. Inspect the string-typed columns and their distinct values.
categorical_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical Columns:", categorical_cols)

for col in categorical_cols:
    print(f"{col}: {df[col].unique()}")

# 6. Label-encode the two-valued categorical columns in place.
# Each fitted encoder is kept so the same mapping can be reused later.
# 'Class' is handled separately below because it is one-hot encoded.
label_encoders = {}
binary_cols = ['Gender', 'Customer Type', 'Type of Travel']

for col in binary_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 7. One-hot encode 'Class' (replaces it with one 'Class_<value>' column per value).
df = pd.get_dummies(df, columns=['Class'])

# 8. Encode the target into a new column; the original 'satisfaction' text column is kept.
le_target = LabelEncoder()
df['satisfaction_encoded'] = le_target.fit_transform(df['satisfaction'])

# Show the final column layout.
print(df.columns)


Categorical Columns: ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
Gender: ['Male' 'Female']
Customer Type: ['Loyal Customer' 'disloyal Customer']
Type of Travel: ['Personal Travel' 'Business travel']
Class: ['Eco Plus' 'Business' 'Eco']
satisfaction: ['neutral or dissatisfied' 'satisfied']
Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Flight Distance',
       'Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction', 'Class_Business', 'Class_Eco', 'Class_Eco Plus',
       'satisfaction_encoded'],
      dtype='object')


In [55]:
import matplotlib.pyplot as plt
import seaborn as sns

# Names of the service-rating columns in the dataset.
rating_features = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]




In [56]:
# Histograms (with KDE overlay) of three numeric columns, one panel each.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sns.histplot(df['Age'], bins=30, ax=axes[0], kde=True)
axes[0].set_title('Age Distribution')

sns.histplot(df['Flight Distance'], bins=30, ax=axes[1], kde=True)
axes[1].set_title('Flight Distance Distribution')

sns.histplot(df['Arrival Delay in Minutes'], bins=30, ax=axes[2], kde=True)
axes[2].set_title('Arrival Delay Distribution')

fig.tight_layout()
fig.savefig("eda_distribution_plots.png")  # persist the figure to disk

# Show first, close afterwards: once the figure is closed plt.show() has
# nothing left to render, so the previous close-then-show order displayed nothing.
plt.show()
plt.close(fig)


In [57]:
import matplotlib.pyplot as plt
import seaborn as sns

# Satisfaction counts broken down by three categorical columns, one panel per column.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

panels = [
    ('Gender', 'Satisfaction by Gender'),
    ('Customer Type', 'Satisfaction by Customer Type'),
    ('Type of Travel', 'Satisfaction by Type of Travel'),
]
for ax, (column, title) in zip(axes, panels):
    sns.countplot(x=column, hue='satisfaction', data=df, ax=ax)
    ax.set_title(title)

# Written to disk only; the figure is closed without being displayed.
plt.tight_layout()
plt.savefig('categorical_satisfaction.png')
plt.close()

# Service-rating column names (numeric), listed one per line.
rating_features = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

# Columns entering the correlation matrix: Age, Flight Distance and every
# rating column that is actually present in df.
present_ratings = [col for col in rating_features if col in df.columns]
numeric_cols = ['Age', 'Flight Distance'] + present_ratings

# Fallback: derive the 0/1 target column if it has not been created yet.
if 'satisfaction_encoded' not in df.columns:
    target_map = {'satisfied': 1, 'neutral or dissatisfied': 0}
    df['satisfaction_encoded'] = df['satisfaction'].map(target_map)

heatmap_cols = numeric_cols + ['satisfaction_encoded']
corr = df[heatmap_cols].corr()

# Annotated heatmap, written to disk and closed without being displayed.
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap of Numeric Features and Satisfaction')
plt.savefig('correlation_heatmap.png')
plt.close()


In [58]:
# Candidate predictors, picked from the feature-importance ranking and the correlation analysis.
wanted_features = (
    'Online boarding',
    'Inflight wifi service',
    'Type of Travel',
    'Class_Business',
    'Inflight entertainment',
    'Flight Distance',
    'Ease of Online booking',
    'Age',
    'Customer Type',
    'Seat comfort',
)

# Keep only the candidates that exist as columns in df (missing ones are dropped silently).
important_features = [feat for feat in wanted_features if feat in df.columns]

# Feature matrix restricted to the selected columns, and the encoded target.
y = df['satisfaction_encoded']
X = df[important_features]



In [59]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; stratify so both splits keep the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE interpolates every column, including the label-encoded and
# one-hot ones; SMOTENC may be the better fit -- confirm.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Candidate classifiers, keyed by the name used for their MLflow run.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    RandomForest=RandomForestClassifier(random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
)

# Function to evaluate model and log with MLflow
def train_and_log_model(model_name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the test split and record the results in one MLflow run.

    Parameters
    ----------
    model_name : str
        Used as the MLflow run name, the ``model_type`` param, the artifact path
        of the logged model and inside the confusion-matrix file name.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every metric.

    Returns
    -------
    dict
        The values logged to MLflow, keyed ``accuracy``, ``f1_score``,
        ``precision`` and ``recall``.
    """
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
        }

        mlflow.log_param("model_type", model_name)
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Confusion matrix is rendered to a temporary PNG in the working directory,
        # uploaded as a run artifact and then deleted.
        cm = confusion_matrix(y_test, y_pred)
        cm_path = f'confusion_matrix_{model_name}.png'
        fig, ax = plt.subplots(figsize=(6, 5))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title(f'Confusion Matrix - {model_name}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            fig.savefig(cm_path)
            mlflow.log_artifact(cm_path)
        finally:
            # Always release the figure and the temporary file, even when
            # saving or uploading fails part-way through.
            plt.close(fig)
            if os.path.exists(cm_path):
                os.remove(cm_path)

        # Store the fitted estimator itself under the run.
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} -- Accuracy: {metrics['accuracy']:.4f}, F1-score: {metrics['f1_score']:.4f}")

    return metrics

# Fit every candidate on the SMOTE-resampled training data and score it on the original test split.
for name in models:
    mdl = models[name]
    train_and_log_model(
        model_name=name,
        model=mdl,
        X_train=X_train_smote,
        y_train=y_train_smote,
        X_test=X_test,
        y_test=y_test,
    )


LogisticRegression -- Accuracy: 0.8502, F1-score: 0.8316
RandomForest -- Accuracy: 0.9420, F1-score: 0.9332
GradientBoosting -- Accuracy: 0.9285, F1-score: 0.9185


In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

# Columns to standardise: Age, Flight Distance and every rating column that is
# part of important_features.
numeric_cols = ['Age', 'Flight Distance'] + [feat for feat in important_features if feat in rating_features]

# Standalone, unfitted scaler. The pipelines below create their own StandardScaler
# and never use this object; the name is kept in case other cells reference it.
scaler = StandardScaler()


def _make_preprocessor():
    """Return a new, unfitted ColumnTransformer: scale numeric_cols, pass all other X columns through."""
    passthrough_cols = [col for col in X.columns if col not in numeric_cols]
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', 'passthrough', passthrough_cols),  # already label/one-hot encoded
        ])


# Kept under its original name for any cell that refers to `preprocessor`.
preprocessor = _make_preprocessor()

# Each pipeline gets its OWN preprocessor instance. Pipeline does not clone its
# steps, so sharing a single ColumnTransformer would mean that fitting one
# pipeline silently refits the preprocessing step of the other.
pipelines = {
    'LogisticRegression': Pipeline([
        ('preprocessor', _make_preprocessor()),
        ('classifier', LogisticRegression(max_iter=2000, random_state=42))
    ]),
    'GradientBoosting': Pipeline([
        ('preprocessor', _make_preprocessor()),
        ('classifier', GradientBoostingClassifier(random_state=42))
    ]),
    # Plain estimator, not a Pipeline: the random forest is trained on unscaled features.
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Same 80/20 stratified split and seed as the earlier training cell.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Resample the training split only.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

# Train and log function adapted to handle pipeline or model directly
def train_and_log_model(model_name, model, X_train, y_train, X_test, y_test):
    """Fit ``model`` (bare estimator or Pipeline), score it and record the results in one MLflow run.

    Parameters
    ----------
    model_name : str
        Used as the MLflow run name, the ``model_type`` param, the artifact path
        of the logged model and inside the confusion-matrix file name.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every metric.

    Returns
    -------
    dict
        The values logged to MLflow, keyed ``accuracy``, ``f1_score``,
        ``precision`` and ``recall``.
    """
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
        }

        mlflow.log_param("model_type", model_name)
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Confusion matrix is rendered to a temporary PNG in the working directory,
        # uploaded as a run artifact and then deleted.
        cm = confusion_matrix(y_test, y_pred)
        cm_path = f'confusion_matrix_{model_name}.png'
        fig, ax = plt.subplots(figsize=(6, 5))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title(f'Confusion Matrix - {model_name}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')
            fig.savefig(cm_path)
            mlflow.log_artifact(cm_path)
        finally:
            # Always release the figure and the temporary file, even when
            # saving or uploading fails part-way through.
            plt.close(fig)
            if os.path.exists(cm_path):
                os.remove(cm_path)

        # Store the fitted estimator itself under the run.
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} -- Accuracy: {metrics['accuracy']:.4f}, F1-score: {metrics['f1_score']:.4f}")

    return metrics

# Fit and log every entry of `pipelines` (two scaled pipelines plus the bare random forest).
for name in pipelines:
    mdl = pipelines[name]
    train_and_log_model(
        model_name=name,
        model=mdl,
        X_train=X_train_smote,
        y_train=y_train_smote,
        X_test=X_test,
        y_test=y_test,
    )


LogisticRegression -- Accuracy: 0.8516, F1-score: 0.8330
GradientBoosting -- Accuracy: 0.9285, F1-score: 0.9185
RandomForest -- Accuracy: 0.9420, F1-score: 0.9332


In [61]:
import pickle

# Persist the objects needed at inference time:
#   label_encoders.pkl -> dict of fitted LabelEncoders for the binary columns
#   best_model.pkl     -> the fitted 'RandomForest' entry of `pipelines`
# The random forest entry is a bare estimator with no scaling step, so there is
# no separate scaler to save for it.
to_pickle = (
    ('label_encoders.pkl', label_encoders),
    ('best_model.pkl', pipelines['RandomForest']),
)
for pkl_path, payload in to_pickle:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)


In [62]:
import pickle

# Store the ordered list of feature column names of X.
feature_columns = list(X.columns)
with open("columns.pkl", "wb") as f:
    pickle.dump(feature_columns, f)

In [63]:
import sklearn
# Show which scikit-learn release is installed in this kernel.
sklearn_version = sklearn.__version__
print(sklearn_version)

1.0.2


In [64]:
# Refit the target encoder and show the label order: position 0 is encoded as 0,
# position 1 as 1. For this dataset it prints ['neutral or dissatisfied' 'satisfied'].
le_target = LabelEncoder().fit(df['satisfaction'])
df['satisfaction_encoded'] = le_target.transform(df['satisfaction'])
print(le_target.classes_)


['neutral or dissatisfied' 'satisfied']


In [65]:
import pickle
import pandas as pd

# Reload the model that was pickled earlier.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('best_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Rank the features of X by the model's importance scores and show the ten largest.
# Relies on `feature_importances_`, so the loaded object must expose that attribute.
importances = pd.Series(model.feature_importances_, index=X.columns)
feat_imp = importances.sort_values(ascending=False)
print(feat_imp.head(10))


Online boarding           0.218846
Inflight wifi service     0.163166
Type of Travel            0.132472
Flight Distance           0.095036
Inflight entertainment    0.092446
Class_Business            0.078307
Age                       0.070241
Ease of Online booking    0.055655
Customer Type             0.047314
Seat comfort              0.046516
dtype: float64


In [66]:
# Class counts of the full target versus the SMOTE-resampled training labels.
original_counts = y.value_counts()
smote_counts = pd.Series(y_train_smote).value_counts()
print("Original class distribution:\n", original_counts)
print("After SMOTE:\n", smote_counts)


Original class distribution:
 0    58879
1    45025
Name: satisfaction_encoded, dtype: int64
After SMOTE:
 1    47103
0    47103
Name: satisfaction_encoded, dtype: int64


In [67]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of `model` on the held-out test split.
y_pred = model.predict(X_test)

# Take the display names from the fitted target encoder rather than hard-coding
# them, so the labels always follow the actual 0/1 encoding order.
print(classification_report(y_test, y_pred, target_names=list(le_target.classes_)))


              precision    recall  f1-score   support

Dissatisfied       0.95      0.95      0.95     11776
   Satisfied       0.93      0.94      0.93      9005

    accuracy                           0.94     20781
   macro avg       0.94      0.94      0.94     20781
weighted avg       0.94      0.94      0.94     20781



In [68]:
# Fit a fresh target encoder, rewrite the encoded column and print the class order
# (expected: ['neutral or dissatisfied' 'satisfied']).
le_target = LabelEncoder()
encoded_target = le_target.fit_transform(df['satisfaction'])
df['satisfaction_encoded'] = encoded_target
print(le_target.classes_)


['neutral or dissatisfied' 'satisfied']


In [69]:
from imblearn.over_sampling import SMOTE
# Rebuild the SMOTE-resampled training set from the current train split.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled


In [70]:
# Class counts at three stages: full target, training split, training split after SMOTE.
stage_counts = (
    ("Original distribution:\n", y.value_counts()),
    ("Train distribution before SMOTE:\n", y_train.value_counts()),
    ("Train distribution after SMOTE:\n", pd.Series(y_train_smote).value_counts()),
)
for heading, counts in stage_counts:
    print(heading, counts)


Original distribution:
 0    58879
1    45025
Name: satisfaction_encoded, dtype: int64
Train distribution before SMOTE:
 0    47103
1    36020
Name: satisfaction_encoded, dtype: int64
Train distribution after SMOTE:
 1    47103
0    47103
Name: satisfaction_encoded, dtype: int64


In [71]:
# Class counts of the resampled training labels, heading on its own line.
smote_label_counts = pd.Series(y_train_smote).value_counts()
print("Train distribution after SMOTE:")
print(smote_label_counts)


Train distribution after SMOTE:
1    47103
0    47103
Name: satisfaction_encoded, dtype: int64


In [72]:
# Probability of class index 1 for every test row, if the model can produce probabilities.
# NOTE(review): when the model has no predict_proba, y_probs is never assigned here
# and later cells that use it will fail or reuse a stale value.
if hasattr(model, "predict_proba"):
    class_probs = model.predict_proba(X_test)
    y_probs = class_probs[:, 1]
    print(f"Sample predicted probabilities for 'satisfied' class: {y_probs[:10]}")


Sample predicted probabilities for 'satisfied' class: [0.   0.03 0.9  0.03 0.   0.   1.   0.07 0.67 1.  ]


In [73]:
# How many test rows have a class-1 probability strictly above 0.5, out of how many rows.
n_above_half = np.count_nonzero(y_probs > 0.5)
n_test_rows = len(y_probs)
print("Number of samples with prob > 0.5:", n_above_half)
print("Total test samples:", n_test_rows)


Number of samples with prob > 0.5: 9045
Total test samples: 20781


In [74]:
from collections import Counter
# Tally of predicted labels (y_pred comes from model.predict(X_test)).
predicted_label_counts = Counter(y_pred)
print(predicted_label_counts)


Counter({0: 11736, 1: 9045})


In [75]:
# Custom decision threshold: predict class 1 ('satisfied') when its probability
# is at least `threshold`.
# NOTE(review): this uses >=, while the earlier 0.5 count used a strict > -- confirm which is intended.
threshold = 0.3
y_pred_thresh = (y_probs >= threshold).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_thresh, target_names=le_target.classes_))

from collections import Counter
# The message is built from `threshold` so it cannot drift from the value actually applied.
print(f"Predicted class counts with threshold {threshold}:", Counter(y_pred_thresh))


                         precision    recall  f1-score   support

neutral or dissatisfied       0.97      0.90      0.93     11776
              satisfied       0.88      0.96      0.92      9005

               accuracy                           0.93     20781
              macro avg       0.92      0.93      0.92     20781
           weighted avg       0.93      0.93      0.93     20781

Predicted class counts with threshold 0.3: Counter({0: 10933, 1: 9848})
