## Creating a baseline model using the cleaned data (before advanced feature engineering)

In [16]:
# Import the necessary packages
import os

# For calculating
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#for modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# load the cleaned data 
df = pd.read_csv('../data/train_cleaned.csv', dtype={'continent_dep': 'string', 'continent_arr': 'string'}, na_values=[''])
df.head(5)

Unnamed: 0,id,target,airline,day_of_week,time_day_std,time_day_sta,continent_dep,iso_country_dep,type_dep,continent_arr,iso_country_arr,type_arr,is_same_country
0,train_id_0,260.0,TU,Sunday,morning,midday,AF,MA,large_airport,AF,TN,large_airport,0.0
1,train_id_1,20.0,TU,Wednesday,afternoon,afternoon,EU,IT,large_airport,AF,TN,large_airport,0.0
2,train_id_2,0.0,TU,Saturday,night,morning,AF,TN,large_airport,EU,TR,large_airport,0.0
3,train_id_3,0.0,TU,Sunday,afternoon,evening,AF,TN,medium_airport,EU,FR,medium_airport,0.0
4,train_id_4,22.0,TU,Sunday,afternoon,afternoon,AF,TN,large_airport,AF,DZ,large_airport,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93679 entries, 0 to 93678
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               93679 non-null  object 
 1   target           93679 non-null  float64
 2   airline          93679 non-null  object 
 3   day_of_week      93679 non-null  object 
 4   time_day_std     93679 non-null  object 
 5   time_day_sta     93679 non-null  object 
 6   continent_dep    93437 non-null  string 
 7   iso_country_dep  93679 non-null  object 
 8   type_dep         93679 non-null  object 
 9   continent_arr    93437 non-null  string 
 10  iso_country_arr  93679 non-null  object 
 11  type_arr         93679 non-null  object 
 12  is_same_country  93679 non-null  float64
dtypes: float64(2), object(9), string(2)
memory usage: 9.3+ MB


In [4]:
df['is_same_country'] = df['is_same_country'].replace({0: False, 1: True})

In [5]:
df['target_log'] = np.log(df['target']+1)

In [6]:
# Define features and the target 
X = df.drop(columns=['target', 'id', 'target_log'])
y = df['target_log']

# One-Hot Encode categorical features
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (74943, 137), Test set: (18736, 137)


In [13]:
# Check to see how the ccat_features have been encoded 
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74943 entries, 65956 to 15795
Columns: 137 entries, is_same_country to type_arr_small_airport
dtypes: bool(137)
memory usage: 10.4 MB


In [11]:
# Check the target value
print("Target Value Range:")
print(y.describe())

Target Value Range:
count    93679.000000
mean         2.659602
std          1.861438
min          0.000000
25%          0.000000
50%          2.995732
75%          3.931826
max          8.146709
Name: target_log, dtype: float64


In [14]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (74943, 137)
X_test shape: (18736, 137)


In [17]:
# Initialize Linear Regression model
baseline_model = LinearRegression()

# Train the model
baseline_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = baseline_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_sqr = r2_score(y_test, y_pred)

# Print results
print(f"Model Performance on Test Set:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R2 : {r_sqr:.2f}")


Model Performance on Test Set:
Mean Absolute Error (MAE): 151971083.12
Root Mean Squared Error (RMSE): 14936703842.84
R2 : -64378388395785641984.00
