## Creating a baseline model using the cleaned data (before advanced feature engineering)

In [2]:
# Import the necessary packages
import os

# Data handling and numerical computation
import pandas as pd
import numpy as np
import warnings
# NOTE(review): with no category or module filter this silences *every*
# warning for the rest of the session, including ones raised by pandas and
# scikit-learn. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Modeling: data splitting, the linear model, error metrics, and feature scaling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Parsing options shared by both feature files: the two continent columns
# (when present in the file) are read with the pandas string dtype, and empty
# fields are treated as missing values.
feature_csv_options = {
    "dtype": {'continent_dep': 'string', 'continent_arr': 'string'},
    "na_values": [''],
}

# Feature matrices for the train and test partitions.
X_train = pd.read_csv("../data/X_train.csv", **feature_csv_options)
X_test = pd.read_csv("../data/X_test.csv", **feature_csv_options)

# Matching target files, read with the default parser settings.
y_train = pd.read_csv("../data/y_train.csv")
y_test = pd.read_csv("../data/y_test.csv")

In [4]:
# Report (rows, columns) for both feature matrices, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    sep="\n",
)


X_train shape: (74943, 137)
X_test shape: (18736, 137)


In [5]:
# List the feature columns available to the model (header line, then the Index repr).
print("Remaining Features in X_train:", X_train.columns, sep="\n")

Remaining Features in X_train:
Index(['is_same_country', 'airline_5M', 'airline_6P', 'airline_BJ',
       'airline_D4', 'airline_GJ', 'airline_GW', 'airline_OL', 'airline_PS',
       'airline_QS',
       ...
       'iso_country_arr_SE', 'iso_country_arr_SI', 'iso_country_arr_SK',
       'iso_country_arr_SN', 'iso_country_arr_TG', 'iso_country_arr_TN',
       'iso_country_arr_TR', 'iso_country_arr_UA', 'type_arr_medium_airport',
       'type_arr_small_airport'],
      dtype='object', length=137)


In [6]:
# Baseline: ordinary least-squares regression fitted on all training features.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model = LinearRegression().fit(X_train, y_train)

# Predict the held-out test partition.
y_pred = baseline_model.predict(X_test)

# Error metrics on the test set; RMSE is derived from MSE so both stay available.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

# NOTE(review): the output recorded with this cell reports MAE ~1.5e8 and
# RMSE ~1.5e10. An RMSE roughly 100x the MAE points to a small number of
# extreme predictions; presumably the unregularised least-squares fit is
# numerically unstable on the one-hot encoded columns -- TODO confirm (e.g.
# inspect the magnitude of baseline_model.coef_) before using these numbers
# as a reference point.

# Summary report, two decimals per metric.
print(
    "Model Performance on Test Set:",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    sep="\n",
)


Model Performance on Test Set:
Mean Absolute Error (MAE): 151971083.12
Root Mean Squared Error (RMSE): 14936703842.84
