In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import joblib

# Frequency Encoder Transformer
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode one categorical column as each value's training-set frequency.

    ``fit`` learns the relative frequency (share of rows) of every distinct
    value; ``transform`` replaces each value with that share and returns an
    ``(n_samples, 1)`` float array. Values that were not seen during ``fit``
    (including missing values) are encoded as ``default_``.

    Attributes created by ``fit``:
        freq_map_: dict mapping each observed value to its relative frequency.
        default_: frequency used for values absent from ``freq_map_`` (0.0).
    """

    def __init__(self):
        # No hyper-parameters. Learned state (trailing-underscore attributes)
        # is created only in fit(), so an unfitted encoder is detectable
        # instead of silently encoding everything as 0.0.
        pass

    @staticmethod
    def _to_series(X):
        """Return the single input column as a flat pandas Series.

        Accepts a 1-D sequence, an ``(n, 1)`` array, a Series, or a
        one-column DataFrame.

        Raises:
            ValueError: if ``X`` is 2-D with more than one column. Flattening
                such input would pool the columns and change the row count.
        """
        # dtype=object keeps mixed Python values intact for non-array input.
        values = X if isinstance(X, np.ndarray) else np.asarray(X, dtype=object)
        if values.ndim == 2 and values.shape[1] > 1:
            raise ValueError(
                "FrequencyEncoder expects a single column, got "
                f"{values.shape[1]} columns"
            )
        return pd.Series(values.reshape(-1))

    def fit(self, X, y=None):
        """Learn the relative frequency of every value in ``X``.

        Args:
            X: single-column input (see ``_to_series``).
            y: ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        column = self._to_series(X)
        self.freq_map_ = column.value_counts(normalize=True).to_dict()
        self.default_ = 0.0
        return self

    def transform(self, X):
        """Map every value in ``X`` to its learned frequency.

        Args:
            X: single-column input (see ``_to_series``).

        Returns:
            Float array of shape ``(n_samples, 1)``.

        Raises:
            sklearn.exceptions.NotFittedError: if ``fit`` has not been called.
            ValueError: if ``X`` has more than one column.
        """
        # Local import keeps this class self-contained within the file.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "freq_map_")
        column = self._to_series(X)
        encoded = column.map(self.freq_map_).fillna(self.default_)
        return encoded.to_numpy(dtype=float).reshape(-1, 1)

# 1. Read the dataset and derive the binary target:
#    1 when a row's production is at or above the dataset median, else 0.
df = pd.read_csv('../artifacts/global_monthly_electricity_production_final.csv')
median_production = df['electricity_production_gwh'].median()
df['production_level'] = (
    df['electricity_production_gwh'].ge(median_production).astype(int)
)

# 2. Separate the target from the features. The raw production column is
#    removed as well, because the target is computed directly from it.
y = df['production_level']
X = df.drop(columns=['production_level', 'electricity_production_gwh'])

# 3. Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Column groups, one per preprocessing strategy
num_cols = ['year', 'month', 'quarter']
binary_cols = ['is_renewable', 'is_fossil', 'is_developed', 'is_zero']
freq_cols = ['country_clean']
onehot_cols = [
    'season',
    'energy_source_clean',
    'energy_type',
    'source_category',
    'year_category',
]
ordinal_cols = ['energy_intensity_category']

# 5a. Named, module-level function (replaces a lambda). A lambda wrapped in
# FunctionTransformer cannot be pickled, which would break saving the pipeline.
def convert_bool_to_int(X):
    """Cast boolean flag columns to numbers (True -> 1, False -> 0).

    Args:
        X: NumPy array or pandas object exposing ``astype`` that holds boolean
            flags, possibly with missing entries.

    Returns:
        ``X`` cast to ``int`` when no entry is missing. When missing entries
        are present, ``X`` is cast to ``float`` instead so they survive as NaN
        (NaN has no integer representation and ``astype(int)`` would raise),
        leaving them for a downstream imputer to fill.
    """
    has_missing = bool(pd.isna(np.asarray(X, dtype=object)).any())
    return X.astype(float if has_missing else int)

# 5b. One preprocessing sub-pipeline per column group

# Numeric columns: fill gaps with the column median.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Boolean flags: cast with the named helper, then fill gaps with the mode.
binary_transformer = Pipeline(steps=[
    ('to_int', FunctionTransformer(func=convert_bool_to_int)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Frequency-encoded column: missing values become the literal 'Unknown'
# category before encoding.
freq_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('freq', FrequencyEncoder()),
])

# Nominal categories: 'Unknown' for gaps, dense one-hot output; categories
# unseen at fit time are ignored (all-zero row) rather than raising.
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Ordered category: gaps are filled with 'Low', then levels are mapped to
# their position in the explicit order below.
intensity_order = [['Low', 'Medium', 'High', 'Very High']]
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Low')),
    ('ord', OrdinalEncoder(categories=intensity_order)),
])

# 6. Route each column group to its sub-pipeline; columns not listed in any
#    group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('bin', binary_transformer, binary_cols),
        ('freq', freq_transformer, freq_cols),
        ('onehot', onehot_transformer, onehot_cols),
        ('ord', ordinal_transformer, ordinal_cols),
    ],
    remainder='drop',
)

# 7. Random forest classifier with the tuned hyper-parameters
best_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    random_state=42,
)

# 8. End-to-end pipeline: preprocessing followed by the classifier
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_rf),
])

def _predict_and_report(model, sample):
    """Print and return the predicted class and class-1 probability for sample."""
    predicted = model.predict(sample)
    print("Predicted class:", predicted)
    probability = model.predict_proba(sample)[:, 1]
    print("Predicted probability of high production:", probability)
    return predicted, probability


# 9. Train the whole pipeline on the raw (unprocessed) training rows
print("Fitting the pipeline...")
full_pipeline.fit(X_train_raw, y_train)
print("Pipeline fitting complete.")

# 10. Persist the fitted pipeline.
# NOTE: pickling stores custom classes/functions by reference, so code that
# loads this file must have FrequencyEncoder and convert_bool_to_int available.
print("\nSaving pipeline to 'full_pipeline.pkl'...")
joblib.dump(full_pipeline, "full_pipeline.pkl")
print("✅ Full pipeline saved successfully as full_pipeline.pkl")

# 11. Score the held-out test rows
print("\n--- Model Evaluation ---")
y_pred = full_pipeline.predict(X_test_raw)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# 12. Single hand-written record expected to land in the "high production" class
# NOTE(review): year 2024 is paired with year_category 'Late 2010s' — confirm
# this combination is intended.
print("\n--- Prediction on High Production Example ---")
high_record = {
    'year': 2024,
    'month': 11,
    'quarter': 4,
    'is_renewable': True,
    'is_fossil': False,
    'is_developed': True,
    'is_zero': False,
    'country_clean': 'Sri Lanka',
    'season': 'Winter',
    'energy_source_clean': 'Hydro',
    'energy_type': 'Renewable',
    'source_category': 'Primary',
    'year_category': 'Late 2010s',
    'energy_intensity_category': 'Medium',
}
new_raw_high = pd.DataFrame([high_record])
pred_high, proba_high = _predict_and_report(full_pipeline, new_raw_high)

# 13. Single hand-written record expected to land in the "low production" class
print("\n--- Prediction on Low Production Example ---")
low_record = {
    'year': 2005,
    'month': 2,
    'quarter': 1,
    'is_renewable': False,
    'is_fossil': True,
    'is_developed': False,
    'is_zero': False,
    'country_clean': 'Nepal',
    'season': 'Winter',
    'energy_source_clean': 'Oil',
    'energy_type': 'Conventional',
    'source_category': 'Fossil Fuel',
    'year_category': 'Before 2010',
    'energy_intensity_category': 'Low',
}
new_raw_low = pd.DataFrame([low_record])
pred_low, proba_low = _predict_and_report(full_pipeline, new_raw_low)

Fitting the pipeline...
Pipeline fitting complete.

Saving pipeline to 'full_pipeline.pkl'...
✅ Full pipeline saved successfully as full_pipeline.pkl

--- Model Evaluation ---
Test Accuracy: 0.9322033898305084

--- Prediction on High Production Example ---
Predicted class: [1]
Predicted probability of high production: [0.56708135]

--- Prediction on Low Production Example ---
Predicted class: [0]
Predicted probability of high production: [0.25897024]
