In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Load data
train_df = pd.read_csv('C:/Users/laava/Desktop/sem 6/AOML/train (2).csv')
test_df = pd.read_csv('C:/Users/laava/Desktop/sem 6/AOML/test (1).csv')

# Separate features and target
X_train = train_df.drop(columns=['output_electricity_generation'])
y_train = train_df['output_electricity_generation']
X_test = test_df.copy()  # No target column here

# Identify numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['number']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Impute numerical columns with median
num_imputer = SimpleImputer(strategy="median")
X_train[numerical_cols] = num_imputer.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = num_imputer.transform(X_test[numerical_cols])

# Encode categorical columns for KNN
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])

# Impute categorical columns using KNN
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")
X_train[categorical_cols] = knn_imputer.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = knn_imputer.transform(X_test[categorical_cols])

# Convert back to original categorical labels
X_train[categorical_cols] = encoder.inverse_transform(X_train[categorical_cols])
X_test[categorical_cols] = encoder.inverse_transform(X_test[categorical_cols])

print("Missing values handled successfully!")
print(X_train.isnull().sum())  # Check if nulls are removed

Missing values handled successfully!
uid                            0
day                            0
hour                           0
minute                         0
C_motion                       0
feed_water_motion              0
faucet_hole                    0
vapour_pressure                0
vapour_enthalpy                0
vapour_pressure_at_division    0
vapour_motion                  0
feed_water_enth                0
vapour_temperature             0
dtype: int64


In [3]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Convert to DataFrame
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(categorical_cols))
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(categorical_cols))

# Reset index to match original
X_train_encoded.index = X_train.index
X_test_encoded.index = X_test.index

# Drop original categorical columns and merge encoded ones
X_train = X_train.drop(columns=categorical_cols).join(X_train_encoded)
X_test = X_test.drop(columns=categorical_cols).join(X_test_encoded)

print("One-hot encoding completed! Shape after encoding:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")

One-hot encoding completed! Shape after encoding:
X_train: (50400, 14), X_test: (21600, 14)


In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Ensure 'day' column is included in X_train and X_test
X_train['day'] = train_df['day']
X_test['day'] = test_df['day']

# One-Hot Encoding for categorical columns
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_ohe = ohe.fit_transform(X_train[categorical_cols])
X_test_ohe = ohe.transform(X_test[categorical_cols])

# Convert to DataFrame
X_train_ohe = pd.DataFrame(X_train_ohe, columns=ohe.get_feature_names_out(categorical_cols))
X_test_ohe = pd.DataFrame(X_test_ohe, columns=ohe.get_feature_names_out(categorical_cols))

# Drop original categorical columns and add one-hot encoded columns
X_train = X_train.drop(columns=categorical_cols).reset_index(drop=True)
X_test = X_test.drop(columns=categorical_cols).reset_index(drop=True)
X_train = pd.concat([X_train, X_train_ohe], axis=1)
X_test = pd.concat([X_test, X_test_ohe], axis=1)

# Feature Scaling
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("Preprocessing completed: One-Hot Encoding and Scaling done!")
print(X_train.head())

Preprocessing completed: One-Hot Encoding and Scaling done!
        uid      hour    minute  C_motion  feed_water_motion  faucet_hole  \
0 -1.732016 -0.860939 -0.012667  0.698866           0.875207     0.606628   
1 -1.731948  0.017723 -0.073905 -1.175604          -1.645046     0.732016   
2 -1.731879  0.544921 -0.992477  0.082036           0.213797    -0.818066   
3 -1.731810  0.369188 -0.012667 -0.972840          -1.608024     0.757067   
4 -1.731742  1.072118 -0.808762 -1.614805          -1.922442     1.346672   

   vapour_pressure  vapour_enthalpy  vapour_pressure_at_division  \
0         0.579672        -0.728635                     0.593817   
1        -1.270916         2.688440                     0.257985   
2         0.207561        -0.176404                     0.204875   
3        -1.306035         0.400432                    -1.319171   
4        -1.700604         0.350202                    -1.719726   

   vapour_motion  feed_water_enth  vapour_temperature  day_Friday  \

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# 1️⃣ Split data into train (80%) and validation (20%)
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Ensure the input data is in the correct shape
X_train_split = X_train_split.values
X_val = X_val.values

# 2️⃣ Initialize models
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.05, random_state=42)
lgb_model = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05, random_state=42)
cat_model = cb.CatBoostRegressor(iterations=500, learning_rate=0.05, depth=6, verbose=0, random_state=42)

# 3️⃣ Train models on training split
xgb_model.fit(X_train_split, y_train_split)
lgb_model.fit(X_train_split, y_train_split)
cat_model.fit(X_train_split, y_train_split)

# 4️⃣ Predict on validation set
xgb_val_pred = xgb_model.predict(X_val)
lgb_val_pred = lgb_model.predict(X_val)
cat_val_pred = cat_model.predict(X_val)

# 5️⃣ Compute RMSE for validation set
xgb_val_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_pred))
lgb_val_rmse = np.sqrt(mean_squared_error(y_val, lgb_val_pred))
cat_val_rmse = np.sqrt(mean_squared_error(y_val, cat_val_pred))

# 6️⃣ Voting Regressor (Blended Model)
voting_reg = VotingRegressor(estimators=[
    ('xgb', xgb_model),
    ('lgb', lgb_model),
    ('cat', cat_model)
])
voting_reg.fit(X_train_split, y_train_split)

# 7️⃣ Get validation predictions from Voting Regressor
ensemble_val_pred = voting_reg.predict(X_val)

# 8️⃣ Compute RMSE for Voting Regressor on validation set
ensemble_val_rmse = np.sqrt(mean_squared_error(y_val, ensemble_val_pred))

# 🔥 Print RMSE for validation set
print(f"Validation RMSE for XGBoost: {xgb_val_rmse:.4f}")
print(f"Validation RMSE for LightGBM: {lgb_val_rmse:.4f}")
print(f"Validation RMSE for CatBoost: {cat_val_rmse:.4f}")
print(f"Validation RMSE for Voting Regressor: {ensemble_val_rmse:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2646
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 17
[LightGBM] [Info] Start training from score 832.300201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2646
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 17
[LightGBM] [Info] Start training from score 832.300201
Validation RMSE for XGBoost: 4.4274
Validation RMSE for LightGBM: 3.5104
Validation RMSE for CatBoost: 4.6362
Validation RMSE for Voting Regressor: 3.4747


In [16]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor

# 1️⃣ Split train-validation set
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Ensure the input data is in the correct format
X_train_split = X_train_split.values
X_val = X_val.values

# 2️⃣ Best Parameters from Optuna
best_xgb_params = {
    "n_estimators": 790,
    "max_depth": 11,
    "learning_rate": 0.0879951300409359,
    "subsample": 0.8168097123116692,
    "colsample_bytree": 0.8391529700118769,
    "reg_lambda": 0.0905254702059362,
    "reg_alpha": 8.71874092645817,
    "objective": "reg:squarederror",
    "random_state": 42
}

best_lgb_params = {
    "n_estimators": 641,
    "num_leaves": 64,
    "learning_rate": 0.16236033385267262,
    "subsample": 0.9985436628526456,
    "colsample_bytree": 0.9516585974076679,
    "reg_lambda": 5.480182346492727,
    "reg_alpha": 0.041685890523880054,
    "random_state": 42
}

# 3️⃣ Train Final Models with Best Params
xgb_model = xgb.XGBRegressor(**best_xgb_params)
lgb_model = lgb.LGBMRegressor(**best_lgb_params)

xgb_model.fit(X_train_split, y_train_split)
lgb_model.fit(X_train_split, y_train_split)

# 4️⃣ Evaluate Final Models on Validation Set
xgb_val_pred = xgb_model.predict(X_val)
lgb_val_pred = lgb_model.predict(X_val)

xgb_val_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_pred))
lgb_val_rmse = np.sqrt(mean_squared_error(y_val, lgb_val_pred))

print(f"Final XGBoost Validation RMSE: {xgb_val_rmse:.4f}")
print(f"Final LightGBM Validation RMSE: {lgb_val_rmse:.4f}")

# 5️⃣ Train Updated Voting Regressor
voting_reg = VotingRegressor(estimators=[
    ('xgb', xgb_model),
    ('lgb', lgb_model)
])
voting_reg.fit(X_train_split, y_train_split)

ensemble_val_pred = voting_reg.predict(X_val)
ensemble_val_rmse = np.sqrt(mean_squared_error(y_val, ensemble_val_pred))

print(f"Final Voting Regressor Validation RMSE: {ensemble_val_rmse:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000743 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2646
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 17
[LightGBM] [Info] Start training from score 832.300201
Final XGBoost Validation RMSE: 2.2135
Final LightGBM Validation RMSE: 2.8311
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2646
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 17
[LightGBM] [Info] Start training from score 832.300201
Final Voting Regressor Validation RMSE: 2.1904


In [18]:
# Ensure the input data is in the correct shape
X_train_values = X_train.values
X_test_values = X_test.values

# Train the final models on FULL training data
xgb_model.fit(X_train_values, y_train)
lgb_model.fit(X_train_values, y_train)
cat_model.fit(X_train_values, y_train)

# Train the Voting Regressor on full training data
voting_reg.fit(X_train_values, y_train)

# Predict on test set
test_preds = voting_reg.predict(X_test_values)

# Create submission dataframe with uid and predictions
submission_df = test_df[["uid"]].copy()
submission_df["output_electricity_generation"] = test_preds

# Save submission file
submission_df.to_csv("submissions3.csv", index=False)

print("✅ Predictions saved with uid as 'final_predictions.csv'")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000956 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2646
[LightGBM] [Info] Number of data points in the train set: 50400, number of used features: 17
[LightGBM] [Info] Start training from score 831.808573
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2646
[LightGBM] [Info] Number of data points in the train set: 50400, number of used features: 17
[LightGBM] [Info] Start training from score 831.808573
✅ Predictions saved with uid as 'final_predictions.csv'
