In [14]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


from xgboost import XGBClassifier

from IPython.display import Markdown, display


cust_df = pd.read_csv("Customer Satisfaction.csv")

"""
The goal of this project is to analyse the factors influencing customer satisfaction in road transport. 
By examining the relationships between various attributes such as age, travel category, distance travelled, and service ratings, 
we aim to build a predictive model that identifies the key drivers of a 'Satisfied' or 'Neutral or Dissatisfied' outcome.
"""

display(Markdown("# Data Cleaning"))

# Print data types of each column in the df
print("Data types of each column:")
print(cust_df.dtypes)


# Check missing values before cleaning
print("Missing values before cleaning:")
print(cust_df.isnull().sum())



display(Markdown("## Handle missing values"))
# Handle missing values appropriately
# This will come in handy later during scaling where we could specify what columns to scale
# For numeric columns: use median

numeric_cols = cust_df.select_dtypes(include=[np.number]).columns
cust_df[numeric_cols] = cust_df[numeric_cols].fillna(cust_df[numeric_cols].median())

# For categorical columns: use mode
categorical_cols = cust_df.select_dtypes(include=['object']).columns



for col in categorical_cols:
    if cust_df[col].isnull().sum() > 0:
        mode_value = cust_df[col].mode()[0] if not cust_df[col].mode().empty else 'Unknown'
        cust_df[col] = cust_df[col].fillna(mode_value)

# Verify missing values after cleaning
print("Missing values after cleaning:")
print(cust_df.isnull().sum())






# Handle Outliers- Remove them if % of outliers in a column is less than 5% else leave it.
display(Markdown("## Outlier Detection and Removal"))

print(f"Dataset shape before outlier removal: {cust_df.shape}")
print("\n")
print("Outlier Analysis:")

def get_outliers(df, column):
    """Detect outliers using IQR method. Use industry standard 1.5*IQR rule."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR    
    outlier_rows = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outlier_rows.index

# Store original dataset size
original_size = len(cust_df)

# Check outliers for each numeric column
columns_to_clean = []

for col in numeric_cols:
    outlier_indices = get_outliers(cust_df, col)
    outlier_percentage = (len(outlier_indices) / original_size) * 100    
    print(f"{col}: {len(outlier_indices)} outliers ({outlier_percentage:.2f}%)", end="")    
    if outlier_percentage == 0.0:
        print(" -> No outliers detected")
        continue
    elif outlier_percentage < 5.0:
        columns_to_clean.append(col)
        print(" -> Will remove outliers (< 5%)")
    else:
        print(" -> Will keep outliers (>= 5%)")

print(f"\nColumns where outliers will be removed: {columns_to_clean}")

# Remove outliers from columns where they represent < 5% of data
outliers_to_remove = set()
for col in columns_to_clean:
    outlier_indices = get_outliers(cust_df, col)
    outliers_to_remove.update(outlier_indices)

if outliers_to_remove:
    print(f"\nRemoving {len(outliers_to_remove)} rows containing outliers...")
    cust_df = cust_df.drop(outliers_to_remove).reset_index(drop=True)
    print(f"Dataset shape after outlier removal: {cust_df.shape}")
    print(f"Removed {original_size - len(cust_df)} rows ({((original_size - len(cust_df)) / original_size) * 100:.2f}% of original data)")
else:
    print("\nNo outliers removed (all columns had >= 5% outliers)")



display(Markdown("# Data Preparation and Feature Engineering"))
# Encode categorical variables using one-hot encoding. We already have categorical_cols from before
cust_df = pd.get_dummies(cust_df, columns=categorical_cols, drop_first=True, dtype=np.uint8)

# Create a new column TotalDelay as the sum of all delay columns - Departure Delay (min) and Arrival Delay (min)
cust_df["Total Delay"] = cust_df["Departure Delay (min)"] + cust_df["Arrival Delay (min)"]

# Scale numeric features from numeric_cols using Min-Max Scaling to bring them to a 0-1 range
scaler = MinMaxScaler()
cust_df[numeric_cols] = scaler.fit_transform(cust_df[numeric_cols])

# Rename column Satisfaction_satisfied to Satisfaction
cust_df.rename(columns={"Satisfaction_satisfied": "Satisfaction"}, inplace=True)

# Split the dataset into features (X) and target variable (y)
X = cust_df.drop("Satisfaction", axis=1)
y = cust_df["Satisfaction"]

# Display X and y shapes
print(f"Final feature set shape: {X.shape}")
print(f"Final target variable shape: {y.shape}")
display(y.head())




display(Markdown("# Exploratory Data Analysis"))
# In cust_df find out how all other variables are correlated to the target variable Satisfaction
# Display correlation of all features with target variable Satisfaction
correlations = cust_df.corr()["Satisfaction"].sort_values(ascending=False)
display(Markdown("## Correlation of features with target variable Satisfaction:"))
print(correlations)




display(Markdown("# Model Training & Model Evaluation"))

# Split the dataset into training and testing sets
# Note - stratify=y to maintain class distribution in both sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28, stratify=y)
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

# Define all models
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=20, min_samples_leaf=10, random_state=28)
    , "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=20, random_state=28)
    , "KNN": KNeighborsClassifier(n_neighbors=5, weights="distance")
    , "XGBoost": XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=28)    
    , "Logistic Regression": LogisticRegression(random_state=28, max_iter=1000)    
}

def evaluate_model(model, model_name, X_train, X_test, y_train, y_test):        
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        "model_name": model_name,
        "accuracy": accuracy,
        "classification_report": class_report,
        "confusion_matrix": conf_matrix,
        "predictions": y_pred
    }



all_results = []
for model_name, model in models.items():        
    all_results.append(evaluate_model(model, model_name, X_train, X_test, y_train, y_test))

print(f"Obtained eval results for {len(all_results)} models.")



display(Markdown("## Model Comparison"))

for result in all_results:
    print("\n")
    print(f"{result["model_name"]}:")
    print(f"Accuracy: {result["accuracy"]:.4f}")
    print(f"Precision (macro avg): {result["classification_report"]["macro avg"]["precision"]:.4f}")
    print(f"Recall (macro avg): {result["classification_report"]["macro avg"]["recall"]:.4f}")
    print(f"F1-score (macro avg): {result["classification_report"]["macro avg"]["f1-score"]:.4f}")
    print(f"Confusion Matrix:")
    print(f"{result["confusion_matrix"]}")
    print("\n" + "~.~"*50)



display(Markdown("## Best Model Selection"))

# Best model based on accuracy
best_model = max(all_results, key=lambda x: x["accuracy"])
print(f"üèÜ Best Model: {best_model["model_name"]}")
print(f"üéñÔ∏è Best Accuracy: {best_model["accuracy"]:.4f}")

print("\n")

# Best model based on F1-score
best_model_f1 = max(all_results, key=lambda x: x["classification_report"]["macro avg"]["f1-score"])
print(f"üèÜ Best Model by F1-score: {best_model_f1["model_name"]}")
print(f"üéñÔ∏è Best F1-score: {best_model_f1["classification_report"]["macro avg"]["f1-score"]:.4f}")

# Create a comparison table
comparison_data = []
for result in all_results:
    comparison_data.append(
        {
        "Model": result["model_name"],
        "Accuracy": result["accuracy"],
        "Precision": result["classification_report"]["macro avg"]["precision"],
        "Recall": result["classification_report"]["macro avg"]["recall"],
        "F1-Score": result["classification_report"]["macro avg"]["f1-score"]
        }
    )

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values("Accuracy", ascending=False)

display(Markdown("## Model Comparison Table"))
display(comparison_df.round(4))



# Data Cleaning

Data types of each column:
id                            int64
Gender                       object
Age                           int64
Travel Category              object
Travel Class                 object
Distance Travelled            int64
Departure/Arrival Rating    float64
Booking Ease                float64
Boarding Point              float64
Food                        float64
Seat Comfort                float64
Entertainment               float64
Leg Room                    float64
Luggage Handling            float64
Cleanliness                   int64
Departure Delay (min)         int64
Arrival Delay (min)         float64
Satisfaction                 object
dtype: object
Missing values before cleaning:
id                              0
Gender                          0
Age                             0
Travel Category                 0
Travel Class                 6283
Distance Travelled              0
Departure/Arrival Rating    12170
Booking Ease                12149
Boardin

## Handle missing values

Missing values after cleaning:
id                          0
Gender                      0
Age                         0
Travel Category             0
Travel Class                0
Distance Travelled          0
Departure/Arrival Rating    0
Booking Ease                0
Boarding Point              0
Food                        0
Seat Comfort                0
Entertainment               0
Leg Room                    0
Luggage Handling            0
Cleanliness                 0
Departure Delay (min)       0
Arrival Delay (min)         0
Satisfaction                0
dtype: int64


## Outlier Detection and Removal

Dataset shape before outlier removal: (103904, 18)


Outlier Analysis:
id: 0 outliers (0.00%) -> No outliers detected
Age: 0 outliers (0.00%) -> No outliers detected
Distance Travelled: 3650 outliers (3.51%) -> Will remove outliers (< 5%)
Departure/Arrival Rating: 0 outliers (0.00%) -> No outliers detected
Booking Ease: 0 outliers (0.00%) -> No outliers detected
Boarding Point: 0 outliers (0.00%) -> No outliers detected
Food: 0 outliers (0.00%) -> No outliers detected
Seat Comfort: 10892 outliers (10.48%) -> Will keep outliers (>= 5%)
Entertainment: 0 outliers (0.00%) -> No outliers detected
Leg Room: 9268 outliers (8.92%) -> Will keep outliers (>= 5%)
Luggage Handling: 6670 outliers (6.42%) -> Will keep outliers (>= 5%)
Cleanliness: 0 outliers (0.00%) -> No outliers detected
Departure Delay (min): 14529 outliers (13.98%) -> Will keep outliers (>= 5%)
Arrival Delay (min): 13954 outliers (13.43%) -> Will keep outliers (>= 5%)

Columns where outliers will be removed: ['Distance Travelled

# Data Preparation and Feature Engineering

Final feature set shape: (100254, 19)
Final target variable shape: (100254,)


0    0
1    0
2    1
3    0
4    1
Name: Satisfaction, dtype: uint8

# Exploratory Data Analysis

## Correlation of features with target variable Satisfaction:

Satisfaction                       1.000000
Entertainment                      0.369536
Seat Comfort                       0.321039
Cleanliness                        0.298401
Leg Room                           0.274403
Distance Travelled                 0.228934
Luggage Handling                   0.226913
Food                               0.192798
Booking Ease                       0.166159
Age                                0.132568
Gender_Male                        0.013091
id                                 0.010742
Boarding Point                    -0.001299
Departure/Arrival Rating          -0.046919
Departure Delay (min)             -0.050481
Total Delay                       -0.054425
Arrival Delay (min)               -0.057236
Travel Class_Premium              -0.097057
Travel Class_Economy              -0.420401
Travel Category_Personal Travel   -0.445161
Name: Satisfaction, dtype: float64


# Model Training & Model Evaluation

Training set shape: (80203, 19), (80203,)
Testing set shape: (20051, 19), (20051,)
Obtained eval results for 5 models.
Obtained eval results for 5 models.


## Model Comparison



Decision Tree:
Accuracy: 0.8795
Precision (macro avg): 0.8803
Recall (macro avg): 0.8715
F1-score (macro avg): 0.8750
Confusion Matrix:
[[10702   907]
 [ 1510  6932]]

~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~


Random Forest:
Accuracy: 0.8855
Precision (macro avg): 0.8847
Recall (macro avg): 0.8797
F1-score (macro avg): 0.8819
Confusion Matrix:
[[10640   969]
 [ 1326  7116]]

~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~


KNN:
Accuracy: 0.8316
Precision (macro avg): 0.8285
Recall (macro avg): 0.8246
F1-score (macro avg): 0.8263
Confusion Matrix:
[[10088  1521]
 [ 1856  6586]]

~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~~.~


XGBoost:
Accuracy: 0.9019
Precision (macro av

## Best Model Selection

üèÜ Best Model: XGBoost
üéñÔ∏è Best Accuracy: 0.9019


üèÜ Best Model by F1-score: XGBoost
üéñÔ∏è Best F1-score: 0.8990


## Model Comparison Table

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
3,XGBoost,0.9019,0.9009,0.8973,0.899
1,Random Forest,0.8855,0.8847,0.8797,0.8819
0,Decision Tree,0.8795,0.8803,0.8715,0.875
2,KNN,0.8316,0.8285,0.8246,0.8263
4,Logistic Regression,0.8214,0.8171,0.816,0.8165
