In [2]:
# ✅ Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Try importing imbalanced-learn, install if not found
try:
    from imblearn.under_sampling import RandomUnderSampler
except ModuleNotFoundError:
    print("Installing imbalanced-learn...")
    !pip install imbalanced-learn
    from imblearn.under_sampling import RandomUnderSampler

# ✅ Step 2: Load Data
file_path_X = "../data/X_train.csv"  # Update with correct path
file_path_y = "../data/y_train.csv"

X_train = pd.read_csv(file_path_X)
y_train = pd.read_csv(file_path_y).squeeze()  # Ensure y_train is a Series

# Merge X_train and y_train for easier analysis
df_train = X_train.copy()
df_train["target"] = y_train

# ✅ Step 3: Identify Categorical Features (One-Hot Encoded)
airline_columns = [col for col in df_train.columns if col.startswith("airline_")]
country_dep_columns = [col for col in df_train.columns if col.startswith("iso_country_dep_")]
country_arr_columns = [col for col in df_train.columns if col.startswith("iso_country_arr_")]

# Compute class distributions
feature_groups = {
    "airline": airline_columns,
    "iso_country_dep": country_dep_columns,
    "iso_country_arr": country_arr_columns,
}

class_distributions = {}
for group, columns in feature_groups.items():
    class_distributions[group] = df_train[columns].sum().sort_values(ascending=False) / len(df_train) * 100

# ✅ Step 4: Group Rare Categories into "Other"
rare_threshold = 0.5  # Categories appearing in <0.5% of dataset
rare_features = []

for group, dist in class_distributions.items():
    rare_cols = dist[dist < rare_threshold].index.tolist()
    rare_features.extend(rare_cols)

# Create a new "Other" category by summing rare categories
if rare_features:
    df_train["airline_other"] = df_train[rare_features].sum(axis=1)
    df_train = df_train.drop(columns=rare_features)
    print(f"✅ Grouped {len(rare_features)} rare categories into 'Other'.")

# ✅ Step 5: Convert Continuous Delays into Categories
bins = [-1, 0, 15, 60, 180, np.inf]  # Delay categories: 0, 1-15, 16-60, 61-180, 180+
labels = ["On-time", "Short Delay", "Moderate Delay", "Long Delay", "Extreme Delay"]

df_train["delay_category"] = pd.cut(df_train["target"], bins=bins, labels=labels)

# ✅ Step 6: Apply Undersampling to Balance Categories
X = df_train.drop(columns=["target", "delay_category"])  # Remove original target
y = df_train["delay_category"]  # Use categorical delay as target

undersampler = RandomUnderSampler(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# ✅ Step 7: Convert Back to Regression Format
# Replace delay categories with median delay values
category_medians = df_train.groupby("delay_category")["target"].median()
y_resampled_continuous = y_resampled.map(category_medians)

# ✅ Step 8: Save the Balanced Dataset
X_train_balanced = pd.DataFrame(X_resampled, columns=X.columns)
y_train_balanced = pd.Series(y_resampled_continuous, name="target")

X_train_balanced.to_csv("../data/X_train_balanced.csv", index=False)
y_train_balanced.to_csv("../data/y_train_balanced.csv", index=False)

print("✅ Balanced dataset saved: 'X_train_balanced.csv' and 'y_train_balanced.csv'.")


✅ Grouped 79 rare categories into 'Other'.


  category_medians = df_train.groupby("delay_category")["target"].median()


✅ Balanced dataset saved: 'X_train_balanced.csv' and 'y_train_balanced.csv'.
