In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

# Name of the column the model learns to predict
target_column = "output_electricity_generation"

# Read the train and test CSVs (paths as uploaded to the Colab session)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train (1).csv", "/content/test (1).csv")
)

# The training frame must carry the target; the test frame must not.
if target_column not in train.columns:
    raise ValueError(f"Column '{target_column}' is missing from training data!")
test = test.drop(columns=[target_column]) if target_column in test.columns else test

# Build the feature lists by dtype. The target is left out of the numeric
# features and the 'uid' identifier is left out of both lists.
numerical_cols = [
    name
    for name in train.select_dtypes(include=["int64", "float64"]).columns
    if name not in (target_column, "uid")
]
categorical_cols = [
    name
    for name in train.select_dtypes(include=["object"]).columns
    if name != "uid"
]

# Handle missing values. Each transformer is fitted on the training frame only
# and then re-applied to the test frame, so no test statistics leak into it.
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

train[numerical_cols] = num_imputer.fit_transform(train[numerical_cols])
test[numerical_cols] = num_imputer.transform(test[numerical_cols])

# The categorical steps are skipped when there are no object columns:
# scikit-learn transformers raise on a zero-column input.
if categorical_cols:
    train[categorical_cols] = cat_imputer.fit_transform(train[categorical_cols])
    test[categorical_cols] = cat_imputer.transform(test[categorical_cols])

    # Convert categorical columns to integer codes.
    # OrdinalEncoder is the feature-side encoder (LabelEncoder is intended for
    # target values) and, unlike LabelEncoder, it does not raise when the test
    # set contains a category never seen in train: such values become -1.
    cat_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    train[categorical_cols] = cat_encoder.fit_transform(train[categorical_cols])
    test[categorical_cols] = cat_encoder.transform(test[categorical_cols])

# Scale Numerical Features (statistics come from the training frame)
scaler = StandardScaler()
train[numerical_cols] = scaler.fit_transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Separate the label from the features ('uid' is dropped if present)
y = train[target_column]
X = train.drop(columns=[target_column, "uid"], errors="ignore")

# Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest configuration
rf_settings = {
    "n_estimators": 100,     # number of trees
    "max_depth": 15,         # cap on tree depth
    "max_features": "sqrt",  # features considered at each split
    "n_jobs": -1,            # use all CPU cores
    "random_state": 42,
}
rf_model = RandomForestRegressor(**rf_settings)

# Fit on the training fold
rf_model.fit(X_train, y_train)

# Predict on the training fold, the validation fold and the test features
y_train_pred = rf_model.predict(X_train)
y_val_pred = rf_model.predict(X_val)
test_features = test.drop(columns=["uid"], errors="ignore")
y_test_pred = rf_model.predict(test_features)

# Root-mean-squared error on both folds
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"🚀 Optimized Train RMSE: {train_rmse:.4f}")
print(f"🚀 Optimized Validation RMSE: {val_rmse:.4f}")

# Build the submission: use the test 'uid' column when it exists,
# otherwise fall back to a running row number.
if "uid" in test.columns:
    submission_ids = test["uid"]
else:
    submission_ids = range(len(y_test_pred))

submission = pd.DataFrame({
    "uid": submission_ids,
    "output_electricity_generation": y_test_pred,
})
submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved as 'submission.csv'")


🚀 Optimized Train RMSE: 2.0935
🚀 Optimized Validation RMSE: 3.5367
✅ Submission file saved as 'submission.csv'


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

# Read the raw CSVs and name the prediction target
train = pd.read_csv("/content/train (1).csv")
test = pd.read_csv("/content/test (1).csv")
target_column = "output_electricity_generation"

# Fail fast when the label is absent from train; strip it from test if present
if target_column not in train.columns:
    raise ValueError(f"Column '{target_column}' is missing from training data!")
if target_column in test.columns:
    test = test.drop(columns=[target_column])

# Candidate columns by dtype, taken from the training frame
numeric_candidates = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
object_candidates = train.select_dtypes(include=["object"]).columns.tolist()

# Features exclude the target (numeric list) and the 'uid' identifier (both lists)
numerical_cols = [c for c in numeric_candidates if c != target_column and c != "uid"]
categorical_cols = [c for c in object_candidates if c != "uid"]

# Fill gaps: column mean for numeric features, most frequent value for
# categorical ones. Imputers are fitted on train and re-applied to test.
num_imputer = SimpleImputer(strategy="mean")
train[numerical_cols] = num_imputer.fit_transform(train[numerical_cols])
test[numerical_cols] = num_imputer.transform(test[numerical_cols])

cat_imputer = SimpleImputer(strategy="most_frequent")
train[categorical_cols] = cat_imputer.fit_transform(train[categorical_cols])
test[categorical_cols] = cat_imputer.transform(test[categorical_cols])

# One LabelEncoder per categorical column, fitted on the training values and
# kept in `label_encoders` keyed by column name.
label_encoders = {col: LabelEncoder().fit(train[col]) for col in categorical_cols}
for col, fitted_encoder in label_encoders.items():
    train[col] = fitted_encoder.transform(train[col])
    test[col] = fitted_encoder.transform(test[col])

# Split train into features (X) and target (y)
X = train.drop(columns=[target_column, "uid"], errors="ignore")
y = train[target_column]

# Split train into train/validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure test columns match training columns
X_test = test.drop(columns=["uid"], errors="ignore")

missing_cols = set(X_train.columns) - set(X_test.columns)
extra_cols = set(X_test.columns) - set(X_train.columns)

if missing_cols:
    raise ValueError(f"Test data is missing columns: {missing_cols}")

# Select the training columns in training order. This drops any extra test
# columns and also fixes the column ORDER, which a set comparison alone does
# not check (estimators fitted on a DataFrame expect the same feature order
# at predict time).
X_test = X_test[X_train.columns.tolist()]

# Model Training Function
def train_and_evaluate(model, model_name):
    """Fit ``model`` on the training fold and print its RMSE on both folds.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place.
    model_name : str
        Label used in the printed summary line.

    Returns
    -------
    tuple
        ``(validation RMSE, fitted model)``.
    """
    model.fit(X_train, y_train)

    train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    val_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))

    print(f"{model_name} - Train RMSE: {train_rmse:.4f}, Validation RMSE: {val_rmse:.4f}")
    return val_rmse, model

# Candidate models, keyed by the display name used in the printed results
models = {
    "RandomForest": RandomForestRegressor(n_estimators=300, max_depth=20, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=10, subsample=0.8, colsample_bytree=0.8, random_state=42),
    # LightGBM only applies row subsampling when `subsample_freq` is positive;
    # with the default of 0, `subsample=0.8` is silently ignored.
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=10,
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8, random_state=42,
    ),
}

# Track the model with the lowest validation RMSE
best_rmse = float("inf")
best_model = None
best_model_name = ""

for name, model in models.items():
    val_rmse, trained_model = train_and_evaluate(model, name)
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_model = trained_model
        best_model_name = name

print(f"\n✅ Best Model: {best_model_name} with Validation RMSE: {best_rmse:.4f}")

# Score the test features with the selected model
y_test_pred = best_model.predict(X_test)

# Identifier column: the test 'uid' when available, else a running row number
uid_values = test["uid"] if "uid" in test.columns else range(len(y_test_pred))

submission2 = pd.DataFrame({
    "uid": uid_values,
    "output_electricity_generation": y_test_pred,
})

# The submission is expected to contain exactly 21600 rows
assert len(submission2) == 21600, f"Error: Expected 21600 rows, but got {submission2.shape[0]}"

submission2.to_csv("submission2.csv", index=False)
print("✅ Submission file saved as 'submission2.csv'")


RandomForest - Train RMSE: 1.4712, Validation RMSE: 3.0785
XGBoost - Train RMSE: 0.2847, Validation RMSE: 2.8129
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2383
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 12
[LightGBM] [Info] Start training from score 832.300201
LightGBM - Train RMSE: 1.7875, Validation RMSE: 3.1896

✅ Best Model: XGBoost with Validation RMSE: 2.8129
✅ Submission file saved as 'submission2.csv'


In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# Load datasets
train = pd.read_csv("/content/train (1).csv")
test = pd.read_csv("/content/test (1).csv")

# Identify target variable
target_column = "output_electricity_generation"

# Same guard as the other modelling cells: the target must exist in train and
# must not be carried into the test features.
if target_column not in train.columns:
    raise ValueError(f"Column '{target_column}' is missing from training data!")
test = test.drop(columns=[target_column], errors="ignore")

# Identify numerical and categorical columns
numerical_cols = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = train.select_dtypes(include=["object"]).columns.tolist()

# Remove target and 'uid' from the feature lists.
# 'uid' has to be excluded from the categorical list as well: if it is stored
# as text it would otherwise be label-encoded, and encoding the test ids would
# fail because they were never seen during fitting.
numerical_cols = [col for col in numerical_cols if col not in [target_column, "uid"]]
categorical_cols = [col for col in categorical_cols if col != "uid"]

# Numeric gaps -> column mean (statistics learned on train, reused on test)
num_imputer = SimpleImputer(strategy="mean")
train[numerical_cols] = num_imputer.fit_transform(train[numerical_cols])
test[numerical_cols] = num_imputer.transform(test[numerical_cols])

# Categorical gaps -> most frequent value (learned on train, reused on test)
cat_imputer = SimpleImputer(strategy="most_frequent")
train[categorical_cols] = cat_imputer.fit_transform(train[categorical_cols])
test[categorical_cols] = cat_imputer.transform(test[categorical_cols])

# Integer-encode every categorical column; the fitted encoder for each column
# is stored in `label_encoders` under the column name.
label_encoders = {}
for col in categorical_cols:
    column_encoder = label_encoders[col] = LabelEncoder()
    train[col] = column_encoder.fit_transform(train[col])
    test[col] = column_encoder.transform(test[col])

# Remove outliers using IQR method
def remove_outliers(df, col):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    with the quartiles computed from ``df[col]`` itself. ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
    col : label of the numeric column to filter on

    Returns
    -------
    pandas.DataFrame
        The retained rows, keeping their original index labels.
    """
    lower_quartile, upper_quartile = df[col].quantile(0.25), df[col].quantile(0.75)
    spread = upper_quartile - lower_quartile
    inside_fences = df[col].between(lower_quartile - 1.5 * spread, upper_quartile + 1.5 * spread)
    return df[inside_fences]

# Split BEFORE filtering, so the validation fold keeps the unfiltered target
# distribution. The filter is based on the target itself; applying it before
# the split (as was done previously) also filters the validation rows by their
# label, which makes the validation RMSE optimistic and not comparable with
# runs that do no filtering. The recorded logs show the filter discarding
# roughly 23% of the rows (50400 -> 38982), so the effect is large.
X_all = train.drop(columns=[target_column, "uid"], errors="ignore")
y_all = train[target_column]

# Split train into train/validation sets
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Apply the IQR filter to the training fold only (fences come from that fold)
train_fold = remove_outliers(train.loc[X_train.index], target_column)
X_train = X_train.loc[train_fold.index]
y_train = y_train.loc[train_fold.index]

# Data for the final refit: the full training set with target outliers removed
train = remove_outliers(train, target_column)

# Split train into features (X) and target (y)
X = train.drop(columns=[target_column, "uid"], errors="ignore")
y = train[target_column]

# Ensure test data matches training columns
X_test = test.drop(columns=["uid"], errors="ignore")

# Hyperparameter Tuning with Optuna
def objective(trial):
    """Optuna objective: validation RMSE of one XGBoost configuration.

    Samples five hyperparameters from ``trial``, fits an ``XGBRegressor`` on
    the notebook-level ``X_train``/``y_train`` and scores it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna trial object used to draw the hyperparameters

    Returns
    -------
    float
        RMSE on the validation fold (lower is better).
    """
    params = dict(
        n_estimators=trial.suggest_int("n_estimators", 300, 1000),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
        max_depth=trial.suggest_int("max_depth", 3, 15),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
    )

    candidate = xgb.XGBRegressor(random_state=42, **params)
    candidate.fit(X_train, y_train)
    return np.sqrt(mean_squared_error(y_val, candidate.predict(X_val)))

# Seed the sampler so the 30-trial search proposes the same configurations on
# every run; without a seed the selected parameters change between executions.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Parameters of the trial with the lowest validation RMSE
best_params = study.best_params
print(f"Best XGBoost Params: {best_params}")

# Train XGBoost with best params
xgb_model = xgb.XGBRegressor(**best_params, random_state=42)
xgb_model.fit(X_train, y_train)
y_xgb_val = xgb_model.predict(X_val)
xgb_rmse = np.sqrt(mean_squared_error(y_val, y_xgb_val))

# Train LightGBM.
# `subsample_freq=1` is required for `subsample=0.8` to take effect: LightGBM
# performs no row subsampling while the frequency is left at its default of 0.
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000, learning_rate=0.05, max_depth=10,
    subsample=0.8, subsample_freq=1, colsample_bytree=0.8, random_state=42,
)
lgb_model.fit(X_train, y_train)
y_lgb_val = lgb_model.predict(X_val)
lgb_rmse = np.sqrt(mean_squared_error(y_val, y_lgb_val))

print(f"XGBoost RMSE: {xgb_rmse:.4f}, LightGBM RMSE: {lgb_rmse:.4f}")

# Ensemble: weighted blend of the two models (0.6 XGBoost, 0.4 LightGBM)
y_val_pred_ensemble = (y_xgb_val * 0.6) + (y_lgb_val * 0.4)
ensemble_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_ensemble))
print(f"Ensemble Model RMSE: {ensemble_rmse:.4f}")

# Refit both models on the complete X / y before scoring the test set
for final_model in (xgb_model, lgb_model):
    final_model.fit(X, y)

y_test_xgb = xgb_model.predict(X_test)
y_test_lgb = lgb_model.predict(X_test)

# Blend the test predictions: 0.6 XGBoost + 0.4 LightGBM
y_test_pred = (y_test_xgb * 0.6) + (y_test_lgb * 0.4)

# Write the submission file
submission5 = pd.DataFrame(
    {
        "uid": test["uid"],
        "output_electricity_generation": y_test_pred,
    }
)
submission5.to_csv("submission5.csv", index=False)
print("✅ Submission file saved as 'submission5.csv'")


[I 2025-02-09 13:30:11,085] A new study created in memory with name: no-name-c33585c3-a701-4f39-a1f5-94fad52bab36
[I 2025-02-09 13:30:33,445] Trial 0 finished with value: 2.3460826859974295 and parameters: {'n_estimators': 337, 'learning_rate': 0.10538995244811226, 'max_depth': 8, 'subsample': 0.6224248745853117, 'colsample_bytree': 0.984706641670051}. Best is trial 0 with value: 2.3460826859974295.
[I 2025-02-09 13:30:45,127] Trial 1 finished with value: 2.2653929909592856 and parameters: {'n_estimators': 551, 'learning_rate': 0.1622431750093771, 'max_depth': 8, 'subsample': 0.9665068283279222, 'colsample_bytree': 0.6967476392733617}. Best is trial 1 with value: 2.2653929909592856.
[I 2025-02-09 13:31:06,622] Trial 2 finished with value: 1.985266307322321 and parameters: {'n_estimators': 792, 'learning_rate': 0.08234996281643882, 'max_depth': 11, 'subsample': 0.7197753878674782, 'colsample_bytree': 0.6029289913015466}. Best is trial 2 with value: 1.985266307322321.
[I 2025-02-09 13:31

Best XGBoost Params: {'n_estimators': 985, 'learning_rate': 0.03681042213065991, 'max_depth': 15, 'subsample': 0.6623745913511481, 'colsample_bytree': 0.7093572523471423}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2383
[LightGBM] [Info] Number of data points in the train set: 31185, number of used features: 12
[LightGBM] [Info] Start training from score 920.688998
XGBoost RMSE: 1.8639, LightGBM RMSE: 2.3818
Ensemble Model RMSE: 1.8832
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2383
[LightGBM] [Info] Number of data points in the train set: 38982, number of used features: 12
[LightGBM] [Info] Start training from score 920.908883
✅ Submission file saved as 'submission5.csv'


In [None]:
!pip install optuna


Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Enable IterativeImputer before importing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


# Read the train/test CSVs
train = pd.read_csv("/content/train (1).csv")
test = pd.read_csv("/content/test (1).csv")

# Column to predict
target_column = "output_electricity_generation"

# Validate: the target is mandatory in train and is removed from test
train_has_target = target_column in train.columns
if not train_has_target:
    raise ValueError(f"Column '{target_column}' is missing from training data!")
if target_column in test.columns:
    test = test.drop(columns=[target_column], errors="ignore")

# Feature lists by dtype; the target never counts as a numeric feature and
# 'uid' never counts as a feature of either kind.
numerical_cols = list(
    filter(
        lambda name: name != target_column and name != "uid",
        train.select_dtypes(include=["int64", "float64"]).columns.tolist(),
    )
)
categorical_cols = list(
    filter(
        lambda name: name != "uid",
        train.select_dtypes(include=["object"]).columns.tolist(),
    )
)

# Missing values: iterative imputation for numeric columns, most-frequent
# value for categorical columns. Everything is fitted on train only.
num_imputer = IterativeImputer(max_iter=10, random_state=42)
cat_imputer = SimpleImputer(strategy="most_frequent")

train[numerical_cols] = num_imputer.fit_transform(train[numerical_cols])
test[numerical_cols] = num_imputer.transform(test[numerical_cols])

train[categorical_cols] = cat_imputer.fit_transform(train[categorical_cols])
test[categorical_cols] = cat_imputer.transform(test[categorical_cols])

# One-hot encode the categorical columns; categories unseen during fitting
# are ignored (all-zero row) instead of raising.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoder.fit(train[categorical_cols])
onehot_names = encoder.get_feature_names_out(categorical_cols)

# Wrap the encoded arrays in DataFrames that share the source frame's index
train_encoded = pd.DataFrame(
    encoder.transform(train[categorical_cols]), columns=onehot_names, index=train.index
)
test_encoded = pd.DataFrame(
    encoder.transform(test[categorical_cols]), columns=onehot_names, index=test.index
)

# Swap the raw categorical columns for their one-hot counterparts
train = train.drop(columns=categorical_cols).join(train_encoded)
test = test.drop(columns=categorical_cols).join(test_encoded)

# Scale numeric features with RobustScaler (fitted on train, applied to both)
scaler = RobustScaler()
scaler.fit(train[numerical_cols])
train[numerical_cols] = scaler.transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Split train into features (X) and target (y)
X = train.drop(columns=[target_column, "uid"], errors="ignore")
y = train[target_column]

# Split train data into train and validation sets for RMSE evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure 'uid' is dropped from test before prediction
X_test = test.drop(columns=["uid"], errors="ignore")

# Ensure test columns match training columns
missing_cols = set(X_train.columns) - set(X_test.columns)
extra_cols = set(X_test.columns) - set(X_train.columns)

if missing_cols:
    raise ValueError(f"Test data is missing columns: {missing_cols}")

# Keep exactly the training columns, in training order. Besides dropping
# unexpected columns this also guarantees the feature ORDER matches what the
# model is fitted on, which comparing sets does not verify.
X_test = X_test[X_train.columns.tolist()]

# Define Base Models
rf_model = RandomForestRegressor(n_estimators=500, max_depth=15, min_samples_split=5,
                                 min_samples_leaf=2, max_features="sqrt", random_state=42)

xgb_model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=10,
                         colsample_bytree=0.8, subsample=0.8, random_state=42)

# `subsample_freq=1` makes `subsample=0.8` effective: LightGBM does no row
# subsampling while the frequency stays at its default of 0.
lgbm_model = LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=10,
                           num_leaves=31, subsample=0.8, subsample_freq=1,
                           colsample_bytree=0.8, random_state=42)

# Define Stacking Regressor: the three base models feed a gradient-boosting
# meta-learner; passthrough=True also hands the original features to it.
stacking_model = StackingRegressor(
    estimators=[("rf", rf_model), ("xgb", xgb_model), ("lgbm", lgbm_model)],
    final_estimator=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    passthrough=True, n_jobs=-1
)

# Train Model
# NOTE(review): the recorded output of this cell ends in
# "AttributeError: 'super' object has no attribute '__sklearn_tags__'".
# That looks like a version mismatch between scikit-learn and the installed
# xgboost/lightgbm wrappers rather than a defect in this code; confirm and
# pin mutually compatible versions.
stacking_model.fit(X_train, y_train)

# Predictions for the training fold, the validation fold and the test set
y_train_pred, y_val_pred, y_test_pred = (
    stacking_model.predict(features) for features in (X_train, X_val, X_test)
)

# RMSE on the two labelled folds
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print(f"🚀 Optimized Train RMSE: {train_rmse:.4f}")
print(f"🚀 Optimized Validation RMSE: {val_rmse:.4f}")

# Assemble the submission; fall back to a running row number without 'uid'
if "uid" in test.columns:
    id_column = test["uid"]
else:
    id_column = range(len(y_test_pred))
submission = pd.DataFrame({
    "uid": id_column,
    "output_electricity_generation": y_test_pred,
})

# The submission is expected to contain exactly 21600 rows
assert len(submission) == 21600, f"Error: Expected 21600 rows, but got {submission.shape[0]}"

submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved as 'submission.csv'")




AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.experimental import enable_iterative_imputer  # Fix IterativeImputer Import
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.metrics import mean_absolute_error

# Sample Dataset (Replace with your dataset)
# 100 synthetic rows: two numeric features, one 3-level categorical feature
# and a random target. The seed fixes the generated values.
np.random.seed(42)
data = pd.DataFrame({
    'feature1': np.random.randn(100),
    'feature2': np.random.rand(100) * 100,
    'feature3': np.random.choice(['A', 'B', 'C'], 100),
    'target': np.random.randn(100) * 10
})

# Splitting Data
# The two parts are copied so the column assignments below write to
# independent frames instead of slices of `data` (avoids pandas'
# SettingWithCopyWarning).
train, test = (part.copy() for part in train_test_split(data, test_size=0.2, random_state=42))

# Identify categorical & numerical columns
categorical_cols = ['feature3']
numerical_cols = ['feature1', 'feature2']

# Impute Missing Values (imputer fitted on train, re-applied to test)
num_imputer = IterativeImputer(max_iter=10, random_state=42)
train[numerical_cols] = num_imputer.fit_transform(train[numerical_cols])
test[numerical_cols] = num_imputer.transform(test[numerical_cols])

# One-hot encode the categorical column(s); dense output, unknown categories
# are ignored at transform time.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
train_encoded = encoder.fit_transform(train[categorical_cols])
test_encoded = encoder.transform(test[categorical_cols])

# Wrap the encoded arrays in DataFrames carrying the source frames' index so
# they line up row-for-row when concatenated below.
encoded_names = encoder.get_feature_names_out(categorical_cols)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_names, index=train.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_names, index=test.index)

# Numeric features + one-hot features + target, side by side
train_final = pd.concat([train[numerical_cols], train_encoded_df, train['target']], axis=1)
test_final = pd.concat([test[numerical_cols], test_encoded_df, test['target']], axis=1)

# Separate features from the target for both parts
X_train = train_final.drop(columns=['target'])
y_train = train_final['target']
X_test = test_final.drop(columns=['target'])
y_test = test_final['target']

# Define Base Models
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Define Stacking Regressor.
# The meta-learner is seeded as well; left unseeded, the reported error
# changes from run to run.
stacking_model = StackingRegressor(
    estimators=[('rf', rf), ('gb', gb)],
    final_estimator=RandomForestRegressor(random_state=42),
)

# The former `StackingRegressor.__sklearn_tags__ = lambda self: {}` patch was
# removed: it only took effect when the attribute did not exist, and the
# placeholder it installed returned an empty dict rather than real tags.
# NOTE(review): the '__sklearn_tags__' AttributeError recorded for the previous
# cell presumably came from the xgboost/lightgbm estimators, which are not
# used here; confirm before re-introducing any workaround.

# Train Model
stacking_model.fit(X_train, y_train)

# Make Predictions
y_pred = stacking_model.predict(X_test)

# Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")


Mean Absolute Error: 11.58


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

# Input files and the prediction target
target_column = "output_electricity_generation"
train = pd.read_csv("/content/train (1).csv")
test = pd.read_csv("/content/test (1).csv")

# The target has to be in train; if test carries it, drop it there
if target_column not in train.columns:
    raise ValueError(f"Column '{target_column}' is missing from training data!")
if target_column in test.columns:
    test = test.drop(columns=[target_column])

# Collect feature names by dtype, skipping the target (numeric list only)
# and the 'uid' identifier (both lists)
numerical_cols = []
for name in train.select_dtypes(include=["int64", "float64"]).columns.tolist():
    if name == target_column or name == "uid":
        continue
    numerical_cols.append(name)

categorical_cols = []
for name in train.select_dtypes(include=["object"]).columns.tolist():
    if name == "uid":
        continue
    categorical_cols.append(name)

# Handle missing values: iterative imputation for numeric columns, most
# frequent value for categorical columns (both fitted on train only)
num_imputer = IterativeImputer(max_iter=10, random_state=42)
cat_imputer = SimpleImputer(strategy="most_frequent")

train[numerical_cols] = num_imputer.fit_transform(train[numerical_cols])
test[numerical_cols] = num_imputer.transform(test[numerical_cols])

train[categorical_cols] = cat_imputer.fit_transform(train[categorical_cols])
test[categorical_cols] = cat_imputer.transform(test[categorical_cols])

# One-Hot Encode Categorical Variables
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
train_encoded = encoder.fit_transform(train[categorical_cols])
test_encoded = encoder.transform(test[categorical_cols])

# Convert to DataFrame and merge with numerical data.
# The encoded frames take the index of their source frame explicitly:
# pd.concat(axis=1) aligns rows on index labels, so relying on a fresh
# 0..n-1 index would misalign rows as soon as `train`/`test` were filtered
# or re-indexed earlier in the cell.
encoded_names = encoder.get_feature_names_out(categorical_cols)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_names, index=train.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_names, index=test.index)

train_final = pd.concat([train[numerical_cols], train_encoded_df], axis=1)
test_final = pd.concat([test[numerical_cols], test_encoded_df], axis=1)

# Scale Numerical Features using RobustScaler (fitted on train, applied to both)
scaler = RobustScaler()
train_final[numerical_cols] = scaler.fit_transform(train_final[numerical_cols])
test_final[numerical_cols] = scaler.transform(test_final[numerical_cols])

# Features are the preprocessed frame; the label comes from the raw train frame
X, y = train_final, train[target_column]

# 80/20 train/validation split for evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Search space for the random forest
param_dist = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[15, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt"],
)

# Randomized search: 30 sampled configurations, 5-fold CV, scored by
# negative RMSE, run on all cores
rf_model = RandomForestRegressor(random_state=42)
search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train)

# Best configuration found by the search
best_rf = search.best_estimator_

# Second base learner: gradient boosting, fitted on the training fold
gb_model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)
gb_model.fit(X_train, y_train)
# NOTE(review): the stacker below is not created with cv="prefit", so it
# presumably fits its own copies of the base estimators; confirm whether the
# standalone fit above is still needed by anything else.

# Stacked ensemble: tuned random forest + gradient boosting feeding a
# gradient-boosting meta-learner, with 5-fold CV inside the stacker
meta_learner = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
base_learners = [("rf", best_rf), ("gb", gb_model)]
stacking_model = StackingRegressor(estimators=base_learners, final_estimator=meta_learner, cv=5)

stacking_model.fit(X_train, y_train)

# Predict on the training fold, the validation fold and the processed test set
y_train_pred = stacking_model.predict(X_train)
y_val_pred = stacking_model.predict(X_val)
y_test_pred = stacking_model.predict(test_final)

# RMSE for the two labelled folds
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"🚀 Optimized Train RMSE: {train_rmse:.4f}")
print(f"🚀 Optimized Validation RMSE: {val_rmse:.4f}")

# Submission frame: 'uid' from test when present, else a running row number
submission_columns = {
    "uid": test["uid"] if "uid" in test.columns else range(len(y_test_pred)),
    "output_electricity_generation": y_test_pred,
}
submission = pd.DataFrame(submission_columns)

# The submission is expected to contain exactly 21600 rows
assert len(submission) == 21600, f"Error: Expected 21600 rows, but got {submission.shape[0]}"

submission.to_csv("submission.csv", index=False)
print("✅ Submission file saved as 'submission.csv'")


