In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load the pre-split feature/target tables from ../data (resolved relative to
# the notebook's working directory, not the notebook file's own directory).
# NOTE(review): na_values=[''] adds nothing beyond pandas' default NA markers;
# if the intent is to keep the continent code "NA" as text rather than NaN,
# keep_default_na=False would also be required -- confirm against the CSVs.
csv_read_options = {
    'dtype': {'continent_dep': 'string', 'continent_arr': 'string'},
    'na_values': [''],
}
X_train = pd.read_csv("../data/X_train.csv", **csv_read_options)
y_train = pd.read_csv("../data/y_train.csv")

X_test = pd.read_csv("../data/X_test.csv", **csv_read_options)
y_test = pd.read_csv("../data/y_test.csv")

# Forward-fill missing feature values.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'), which emits
# a FutureWarning on pandas >= 2.1; reassigning instead of inplace=True keeps
# the cell idempotent on re-run.
# NOTE(review): forward-fill depends on row order and leaves NaNs in any
# leading rows untouched -- confirm this imputation is appropriate here.
X_train = X_train.ffill()
X_test = X_test.ffill()

# Flatten the target frames to 1D arrays, the shape scikit-learn expects for a
# single-output regressor.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Fail early with a readable message if a target file holds more than one
# column (ravel() would then yield more values than there are feature rows).
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)} values"
)
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)} values"
)

# Random forest baseline: 100 trees, fixed seed for reproducibility, all cores.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Fit on the training split.
rf_model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = rf_model.predict(X_test)

# Score the predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# Rank features by the forest's impurity-based importance, highest first.
feature_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': rf_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:\n", feature_importances)


  X_train.fillna(method='ffill', inplace=True)
  X_test.fillna(method='ffill', inplace=True)


Mean Absolute Error (MAE): 1.3426
R² Score: 0.1748

Feature Importances:
                      Feature  Importance
89   type_dep_medium_airport    0.134199
25      time_day_std_morning    0.126141
85        iso_country_dep_TN    0.062855
30      time_day_sta_morning    0.052439
144   type_arr_large_airport    0.028048
..                       ...         ...
90    type_dep_small_airport    0.000001
84        iso_country_dep_TG    0.000000
74        iso_country_dep_RO    0.000000
130       iso_country_arr_RO    0.000000
46        iso_country_dep_DJ    0.000000

[147 rows x 2 columns]
