In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load data
train_df = pd.read_csv('train (2).csv')
test_df = pd.read_csv('test (1).csv')

# Handle missing values
# Every fill statistic is computed on the training frame only and then applied
# to both frames, so nothing about the test distribution leaks into training.
turbine_cols = ['C_motion', 'feed_water_motion', 'faucet_hole', 'vapour_pressure', 
                'vapour_enthalpy', 'vapour_pressure_at_division', 'vapour_motion', 
                'feed_water_enth', 'vapour_temperature']

# Time-like columns are filled with the most frequent training value,
# turbine measurements with the training median.
fill_values = {col: train_df[col].mode()[0] for col in ['day', 'hour', 'minute']}
fill_values.update({col: train_df[col].median() for col in turbine_cols})

for col, fill_value in fill_values.items():
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary Series, raises a
    # FutureWarning on pandas 2.x and does not modify the frame at all under
    # Copy-on-Write.
    train_df[col] = train_df[col].fillna(fill_value)
    test_df[col] = test_df[col].fillna(fill_value)

# Feature Engineering
# The same two derived columns are added to both frames, in the same order,
# so train and test keep an identical feature schema.
for frame in (train_df, test_df):
    # Explicit pressure x temperature interaction term
    frame['pressure_temp_interaction'] = frame['vapour_pressure'] * frame['vapour_temperature']
    # log(1 + x) transform of the vapour pressure
    frame['log_vapour_pressure'] = np.log1p(frame['vapour_pressure'])

# One-Hot Encoding
categorical_columns = ['day']

# The dense-output keyword of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Keeping the default (sparse) output and densifying with `.toarray()` works
# on every version. Categories are learned from the training frame only;
# unseen test categories encode as all-zero rows (handle_unknown='ignore').
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_df[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)

# Reuse each source frame's index so the column-wise concat below aligns
# row-for-row even if the frames do not carry a default RangeIndex.
encoded_train = pd.DataFrame(
    encoder.transform(train_df[categorical_columns]).toarray(),
    columns=encoded_columns,
    index=train_df.index,
)
encoded_test = pd.DataFrame(
    encoder.transform(test_df[categorical_columns]).toarray(),
    columns=encoded_columns,
    index=test_df.index,
)

# Swap the raw categorical column(s) for their one-hot expansion.
train_df = pd.concat([train_df.drop(columns=categorical_columns), encoded_train], axis=1)
test_df = pd.concat([test_df.drop(columns=categorical_columns), encoded_test], axis=1)

# Ensure both train and test have the same columns
# Columns present in train but absent from test (kept for inspection).
missing_cols = [col for col in train_df.columns if col not in test_df.columns]

# A single reindex adds the missing columns filled with 0, drops columns that
# exist only in test, and puts the result in train's column order. Iterating
# over a set instead would append new columns in hash-dependent order.
# Note: this includes every train-only column, so a train-only target column
# also appears in test_df as an all-zero placeholder.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# Define features and target variable
target_column = 'output_electricity_generation'
y = train_df[target_column]
X = train_df.drop(columns=['uid', target_column])

# Train-test split (80/20 hold-out with a fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Selection using RandomForest Importance
# A throw-away forest is fitted on the training split only; features whose
# importance is at or below the 20th percentile are discarded.
rf_temp = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_temp.fit(X_train, y_train)

feature_importance = rf_temp.feature_importances_
importance_cutoff = np.percentile(feature_importance, 20)
keep_mask = feature_importance > importance_cutoff
important_features = np.array(X.columns)[keep_mask]

# Restrict both splits to the retained features.
X_train, X_valid = X_train[important_features], X_valid[important_features]

# Scale data
# The scaler is fitted on the training split only and then applied to both
# splits; from here on X_train / X_valid hold the scaler's transformed output.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

# Train models
rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
lgbm_model = LGBMRegressor(n_estimators=400, learning_rate=0.03, max_depth=7, random_state=42)
xgb_model = XGBRegressor(n_estimators=300, learning_rate=0.03, max_depth=7, random_state=42)
cat_model = CatBoostRegressor(iterations=300, learning_rate=0.03, depth=7, verbose=0, random_state=42)

# Fit every regressor on the same scaled training split, in a fixed order.
for model in (rf_model, lgbm_model, xgb_model, cat_model):
    model.fit(X_train, y_train)

# Predictions & RMSE
# Score each fitted model on the hold-out split.
rf_pred, lgbm_pred, xgb_pred, cat_pred = (
    model.predict(X_valid)
    for model in (rf_model, lgbm_model, xgb_model, cat_model)
)

# Root of the mean squared error against the hold-out targets.
rf_rmse, lgbm_rmse, xgb_rmse, cat_rmse = (
    np.sqrt(mean_squared_error(y_valid, pred))
    for pred in (rf_pred, lgbm_pred, xgb_pred, cat_pred)
)

print(f"RandomForest RMSE: {rf_rmse}")
print(f"LightGBM RMSE: {lgbm_rmse}")
print(f"XGBoost RMSE: {xgb_rmse}")
print(f"CatBoost RMSE: {cat_rmse}")






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 40320, number of used features: 12
[LightGBM] [Info] Start training from score 832.300201
RandomForest RMSE: 3.4565630728710897
LightGBM RMSE: 4.916022364203782
XGBoost RMSE: 5.465190129500503
CatBoost RMSE: 6.989862713768145


In [6]:
# Weighted Ensemble
# The blend weights are defined once so the validation score and the
# submission are guaranteed to use the same mix.
# NOTE(review): in the recorded run this blend scored 3.4817 RMSE on the
# hold-out split versus 3.4566 for RandomForest alone, so the LightGBM share
# did not help there; revisit the weights before relying on the blend.
LGBM_WEIGHT, RF_WEIGHT = 0.1, 0.9

ensemble_pred = LGBM_WEIGHT * lgbm_pred + RF_WEIGHT * rf_pred
ensemble_rmse = np.sqrt(mean_squared_error(y_valid, ensemble_pred))
print(f"Ensemble RMSE: {ensemble_rmse}")

# Prepare test data
# Same feature subset and the same fitted scaler that the models were trained with.
test_df_features = test_df[important_features]
test_df_features = scaler.transform(test_df_features)

# Final predictions
final_predictions = (LGBM_WEIGHT * lgbm_model.predict(test_df_features) +
                     RF_WEIGHT * rf_model.predict(test_df_features))

# Save submission
submission = pd.DataFrame({
    'uid': test_df['uid'],
    'output_electricity_generation': final_predictions
})
# In a raw string backslashes are not escapes, so they must not be doubled;
# the previous literal contained "\\" between every path component.
submission_path = r"C:\Users\HP\Desktop\AOML\attempt8.csv"
submission.to_csv(submission_path, index=False)
print("Submission file 'attempt8.csv' has been saved.")


Ensemble RMSE: 3.4817120377556976
Submission file 'attempt8.csv' has been saved.
