In [3]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns

In [4]:
# Source CSV is hosted on Google Drive; keep the location in one named place.
DATA_URL = "https://drive.google.com/uc?id=19zgQEeZNUtybmQ4ZxGtwR6i-NdqQJKBK"
df = pd.read_csv(DATA_URL)

In [5]:
# Overview of column dtypes and non-null counts (first check for missing data).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         25976 non-null  int64  
 1   id                                 25976 non-null  int64  
 2   Gender                             25976 non-null  object 
 3   Customer Type                      25976 non-null  object 
 4   Age                                25976 non-null  int64  
 5   Type of Travel                     25976 non-null  object 
 6   Class                              25976 non-null  object 
 7   Flight Distance                    25976 non-null  int64  
 8   Inflight wifi service              25976 non-null  int64  
 9   Departure/Arrival time convenient  25976 non-null  int64  
 10  Ease of Online booking             25976 non-null  int64  
 11  Gate location                      25976 non-null  int

In [6]:
# Preview the first rows to sanity-check how the CSV values were parsed.
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [7]:
# Exact column labels (several contain spaces or '/'); later cells index by these names.
df.columns

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [8]:
# 1. Cleaning: remove columns that are identifiers/artifacts rather than features
cols_to_drop = ['Unnamed: 0', 'id']
# errors='ignore' skips any listed column that is not present in df
df_clean = df.drop(columns=cols_to_drop, errors='ignore')

# 2. Validation & Cleaning: missing values
# Per df.info, 'Arrival Delay in Minutes' has gaps; impute them with the median,
# which is less sensitive to extreme delays than the mean.
arrival_col = 'Arrival Delay in Minutes'
arrival_median = df_clean[arrival_col].median()
df_clean[arrival_col] = df_clean[arrival_col].fillna(arrival_median)

# 3. Feature Engineering
# 'Total Delay' = departure delay + (imputed) arrival delay
df_clean['Total Delay'] = df_clean['Departure Delay in Minutes'] + df_clean[arrival_col]

# Create 'Delay Category' (No Delay, Short Delay, Long Delay)
def categorize_delay(x, short_max=30):
    """Bucket a delay value into a coarse category label.

    Parameters
    ----------
    x : number
        Delay value to classify.
    short_max : number, default 30
        Largest delay (inclusive) that is still labelled 'Short Delay'.

    Returns
    -------
    str or float
        'No Delay' when x <= 0, 'Short Delay' when 0 < x <= short_max,
        'Long Delay' otherwise. Missing input (NaN/None) is returned as NaN
        instead of being assigned a label.
    """
    # NaN fails every comparison, so without this guard a missing delay would
    # fall through to the last branch and be mislabelled 'Long Delay'.
    if pd.isna(x):
        return np.nan
    # Non-positive covers both "exactly on time" and negative (early) values.
    if x <= 0:
        return 'No Delay'
    if x <= short_max:
        return 'Short Delay'
    return 'Long Delay'
df_clean['Delay Category'] = df_clean['Total Delay'].apply(categorize_delay)

# 4. Encoding
# Target ('satisfaction') -> binary label, with 'satisfied' as the positive class
satisfaction_map = {
    'neutral or dissatisfied': 0,
    'satisfied': 1,
}
df_clean['satisfaction_encoded'] = df_clean['satisfaction'].map(satisfaction_map)

# 'Class' is ordered, so it gets an ordinal code: Eco < Eco Plus < Business
class_map = {
    'Eco': 1,
    'Eco Plus': 2,
    'Business': 3,
}
df_clean['Class_encoded'] = df_clean['Class'].map(class_map)

# Nominal variables are one-hot encoded into a separate frame (df_encoded);
# df_clean keeps the original text columns so the EDA stays readable.
nominal_cols = ['Gender', 'Customer Type', 'Type of Travel']
df_encoded = pd.get_dummies(df_clean, columns=nominal_cols, drop_first=True)


# ==========================================
# PART 2: EXPLORATORY DATA ANALYSIS (EDA)
# ==========================================

# Quick structural summary of the cleaned frame: size, dtypes/nulls, numeric stats.
print("--- SHAPE ---")
print(df_clean.shape)
print("\n--- INFO ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df_clean.info()
print("\n--- DESCRIBE (Numerical) ---")
print(df_clean.describe())

--- SHAPE ---
(25976, 27)

--- INFO ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             25976 non-null  object 
 1   Customer Type                      25976 non-null  object 
 2   Age                                25976 non-null  int64  
 3   Type of Travel                     25976 non-null  object 
 4   Class                              25976 non-null  object 
 5   Flight Distance                    25976 non-null  int64  
 6   Inflight wifi service              25976 non-null  int64  
 7   Departure/Arrival time convenient  25976 non-null  int64  
 8   Ease of Online booking             25976 non-null  int64  
 9   Gate location                      25976 non-null  int64  
 10  Food and drink                     25976 non-null  int64  
 11  Online boardin