In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ===============================
# 1. LOAD DATASET
# ===============================
# Read the CSV into the working frame that every later step builds on.
DATASET_PATH = "ev_charging_dataset_50000.csv"
df = pd.read_csv(DATASET_PATH)
print("Dataset Loaded!")

# ===============================
# 2. FEATURE ENGINEERING
# ===============================
# Weekend flag: 1 where day_of_week is 6 or 7, otherwise 0.
# The vectorised `isin` replaces a per-row Python lambda and produces the
# same 0/1 integer column.
# NOTE(review): assumes day_of_week is coded so that 6 and 7 are the weekend
# days — confirm against the dataset's encoding.
df['is_weekend'] = df['day_of_week'].isin([6, 7]).astype('int64')

# Interaction features
# Queue length expressed relative to the station's capacity.
# NOTE(review): a station_capacity of 0 would produce inf/NaN in this column —
# confirm the data contains no zero-capacity stations.
df['queue_capacity_ratio'] = df['queue_length'] / df['station_capacity']
# Sum of the travel time and the queue length.
df['time_pressure'] = df['travel_time'] + df['queue_length']

# ===============================
# 3. FEATURE ENCODING
# ===============================
# Map each distinct time_category label to an integer code and store the
# codes in a new column; the fitted encoder stays available as `le`.
le = LabelEncoder()
time_category_codes = le.fit_transform(df['time_category'])
df['time_category_encoded'] = time_category_codes

# ===============================
# 4. DEFINE FEATURES & TARGET
# ===============================
# Columns taken directly from the loaded frame.
raw_feature_cols = [
    'travel_time',
    'station_number',
    'time_of_day',
    'queue_length',
    'charging_time',
    'day_of_week',
    'is_peak_hours',
    'station_capacity',
]
# Columns created by the feature-engineering and encoding steps.
derived_feature_cols = [
    'is_weekend',
    'queue_capacity_ratio',
    'time_pressure',
    'time_category_encoded',
]
# The concatenation order fixes the column order of X.
feature_cols = raw_feature_cols + derived_feature_cols

# Model inputs and the value to predict.
X = df.loc[:, feature_cols]
y = df.loc[:, 'total_waiting_time']

# ===============================
# 5. TRAIN/TEST SPLIT
# ===============================
# Split BEFORE scaling so the held-out rows play no part in fitting the
# scaler. The same test_size and random_state as before are used, so the
# row assignment to train/test is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===============================
# 6. SCALING (fit on training rows only)
# ===============================
# The scaler learns its mean/std from the training rows alone and then
# applies that same transform to the test rows, avoiding test-set leakage.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Retained so any code that still refers to X_scaled keeps working; it is
# now produced with the training-fitted scaler.
X_scaled = scaler.transform(X)

# ===============================
# 7. MODEL (Memory Friendly RandomForest)
# ===============================
# Hyper-parameters are collected in one mapping; the small forest and the
# single worker are the memory-saving choices noted by the author.
rf_settings = {
    'n_estimators': 50,   # fewer trees → less memory
    'max_depth': None,    # no explicit depth limit
    'random_state': 42,   # fixed seed for repeatable fits
    'n_jobs': 1,          # single core → less memory spike
}
model = RandomForestRegressor(**rf_settings).fit(X_train, y_train)
print("Model Trained Successfully!")

# ===============================
# 8. EVALUATION
# ===============================
# Predict on the held-out split and score those predictions.
preds = model.predict(X_test)

r2 = r2_score(y_test, preds)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error

# Report each metric to three decimal places.
print("\n===== MODEL PERFORMANCE =====")
print(f"MAE  : {mae:.3f}")
print(f"RMSE : {rmse:.3f}")
print(f"R²   : {r2:.3f}")

# ===============================
# 9. FEATURE IMPORTANCE
# ===============================
# Pair every feature name with the importance the fitted forest assigned to
# it, then rank from most to least important.
importance_by_feature = {
    'feature': feature_cols,
    'importance': model.feature_importances_,
}
fi = pd.DataFrame(importance_by_feature)
fi = fi.sort_values(by='importance', ascending=False)

print("\n===== FEATURE IMPORTANCE =====")
print(fi)


Dataset Loaded!
Model Trained Successfully!

===== MODEL PERFORMANCE =====
MAE  : 6.384
RMSE : 7.413
R²   : 0.697

===== FEATURE IMPORTANCE =====
                  feature  importance
9    queue_capacity_ratio    0.736417
4           charging_time    0.058825
10          time_pressure    0.047912
0             travel_time    0.041826
2             time_of_day    0.033782
1          station_number    0.028657
5             day_of_week    0.022168
7        station_capacity    0.009581
11  time_category_encoded    0.008654
3            queue_length    0.007867
8              is_weekend    0.002182
6           is_peak_hours    0.002130
