## Creating a baseline model using the cleaned data (before advanced feature engineering)

In [29]:
# Import the necessary packages
import os

# For calculating
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#for modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [11]:
# load the cleaned data 
df = pd.read_csv('../data/train_cleaned.csv', dtype={'continent_dep': 'string', 'continent_arr': 'string'}, na_values=[''])
df.head(5)

Unnamed: 0,id,target,airline,day_of_week,continent_dep,iso_country_dep,type_dep,continent_arr,iso_country_arr,type_arr,is_same_country
0,train_id_0,260.0,TU,Sunday,AF,MA,large_airport,AF,TN,large_airport,0.0
1,train_id_1,20.0,TU,Wednesday,EU,IT,large_airport,AF,TN,large_airport,0.0
2,train_id_2,0.0,TU,Saturday,AF,TN,large_airport,EU,TR,large_airport,0.0
3,train_id_3,0.0,TU,Sunday,AF,TN,medium_airport,EU,FR,medium_airport,0.0
4,train_id_4,22.0,TU,Sunday,AF,TN,large_airport,AF,DZ,large_airport,0.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93679 entries, 0 to 93678
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               93679 non-null  object 
 1   target           93679 non-null  float64
 2   airline          93679 non-null  object 
 3   day_of_week      93679 non-null  object 
 4   continent_dep    93437 non-null  string 
 5   iso_country_dep  93679 non-null  object 
 6   type_dep         93679 non-null  object 
 7   continent_arr    93437 non-null  string 
 8   iso_country_arr  93679 non-null  object 
 9   type_arr         93679 non-null  object 
 10  is_same_country  93679 non-null  float64
dtypes: float64(2), object(7), string(2)
memory usage: 7.9+ MB


In [13]:
df.columns

Index(['id', 'target', 'airline', 'day_of_week', 'continent_dep',
       'iso_country_dep', 'type_dep', 'continent_arr', 'iso_country_arr',
       'type_arr', 'is_same_country'],
      dtype='object')

In [16]:
# Define features and the target 
X = df.drop(columns=['target'])
y = df['target']

# Save 'id' column separately for reference (I'm not sure it should be included as a feature) 
id_column = X['id']

# Identify categorical and numerical features
cat_features = X.select_dtypes(include=['object', 'string']).columns.drop('id')  # Exclude 'id'
num_features = X.select_dtypes(include=['float64', 'int64']).columns

# One-Hot Encode categorical features
X_encoded = pd.get_dummies(X.drop(columns=['id']), columns=cat_features, drop_first=True)

# Split the data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (74943, 129), Test set: (18736, 129)


In [24]:
# Check to see how the ccat_features have been encoded 
X_encoded.head(2)

Unnamed: 0,is_same_country,airline_5M,airline_6P,airline_BJ,airline_D4,airline_GJ,airline_GW,airline_OL,airline_PS,airline_QS,...,iso_country_arr_SE,iso_country_arr_SI,iso_country_arr_SK,iso_country_arr_SN,iso_country_arr_TG,iso_country_arr_TN,iso_country_arr_TR,iso_country_arr_UA,type_arr_medium_airport,type_arr_small_airport
0,0.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,0.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [25]:
print(X_train.describe())

       is_same_country
count     74943.000000
mean          0.071227
std           0.257206
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000


In [26]:
# Check the target value
print("Target Value Range:")
print(y.describe())

Target Value Range:
count    93679.000000
mean        55.655099
std        123.262956
min          0.000000
25%          0.000000
50%         19.000000
75%         50.000000
max       3451.000000
Name: target, dtype: float64


Maximum Delay: 3451 minutes

Some extreme delays exist (almost 57 hours delay)!!
This indicates possible outliers.

**Should we include them or deal with them from our baseline model?**

In [27]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (74943, 129)
X_test shape: (18736, 129)


In [28]:
print("Remaining Features in X_train:")
print(X_train.columns)

Remaining Features in X_train:
Index(['is_same_country', 'airline_5M', 'airline_6P', 'airline_BJ',
       'airline_D4', 'airline_GJ', 'airline_GW', 'airline_OL', 'airline_PS',
       'airline_QS',
       ...
       'iso_country_arr_SE', 'iso_country_arr_SI', 'iso_country_arr_SK',
       'iso_country_arr_SN', 'iso_country_arr_TG', 'iso_country_arr_TN',
       'iso_country_arr_TR', 'iso_country_arr_UA', 'type_arr_medium_airport',
       'type_arr_small_airport'],
      dtype='object', length=129)


length=129 --> OneHotEncoder created the new columns from our cat features

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Scale only numerical features (in this case, 'is_same_country')
X_train[['is_same_country']] = scaler.fit_transform(X_train[['is_same_country']])
X_test[['is_same_country']] = scaler.transform(X_test[['is_same_country']])

print("Numerical Features Scaled Successfully!")


In [19]:
# Initialize Linear Regression model
baseline_model = LinearRegression()

# Train the model
baseline_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = baseline_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print results
print(f"Model Performance on Test Set:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Model Performance on Test Set:
Mean Absolute Error (MAE): 758948915.45
Root Mean Squared Error (RMSE): 74503199314.11
