In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv("C:/Users/Hiremath/OneDrive/Desktop/database_IND.csv")

# Step 1: Remove irrelevant columns
irrelevant_columns = ["name", "gppd_idnr", "latitude", "longitude", "other_fuel1", "other_fuel2", "other_fuel3", "owner", "source", "url", "geolocation_source", "wepp_id", "generation_data_source", "estimated_generation_gwh"]
data.drop(columns=irrelevant_columns, inplace=True)

# Step 2: Handle missing values
data.fillna(value={'other_fuel1': 'None', 'other_fuel2': 'None', 'other_fuel3': 'None'}, inplace=True)

# Step 3: Feature Engineering
data['age_of_plant'] = pd.to_datetime(data['year_of_capacity_data'], format='%Y').dt.year - data['commissioning_year']
data['total_generation'] = data[['generation_gwh_2013', 'generation_gwh_2014', 'generation_gwh_2015',
                                 'generation_gwh_2016', 'generation_gwh_2017', 'generation_gwh_2018',
                                 'generation_gwh_2019']].sum(axis=1)

# Step 4: Split data into features and targets
X = data.drop(['capacity_mw'], axis=1)
y_primary = data['primary_fuel']
y_capacity = data['capacity_mw']

# Step 5: Encode categorical variables using LabelEncoder
encoder = LabelEncoder()
X_encoded = X.copy()
X_encoded['country'] = encoder.fit_transform(X['country'])
X_encoded['country_long'] = encoder.fit_transform(X['country_long'])
X_encoded['primary_fuel'] = encoder.fit_transform(X['primary_fuel'])

# Step 6: Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_encoded)

# Step 7: Split data into training and testing sets
X_train, X_test, y_train_primary, y_test_primary, y_train_capacity, y_test_capacity = train_test_split(X_imputed, y_primary, y_capacity, test_size=0.2, random_state=42)

# Step 8: Train models
primary_model = RandomForestClassifier(random_state=42)
capacity_model = RandomForestRegressor(random_state=42)

primary_model.fit(X_train, y_train_primary)
capacity_model.fit(X_train, y_train_capacity)

# Step 9: Predictions
y_pred_primary = primary_model.predict(X_test)
y_pred_capacity = capacity_model.predict(X_test)

# Step 10: Evaluate models
primary_accuracy = accuracy_score(y_test_primary, y_pred_primary)
capacity_rmse = mean_squared_error(y_test_capacity, y_pred_capacity, squared=False)

print("Primary Fuel Classification Accuracy:", primary_accuracy)
print("Capacity Prediction RMSE:", capacity_rmse)


Primary Fuel Classification Accuracy: 0.978021978021978
Capacity Prediction RMSE: 201.43911877723625
