In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# Load the dataset.
# The path is relative, so the CSV is resolved against the kernel's current
# working directory. Later code reads the 'month' and 'Load' columns plus the
# columns named in `columns_to_clean`, so the file must provide all of them.
file_path = 'seasonal_data_updated.csv'
data = pd.read_csv(file_path)

# Month numbers grouped under the season they are assigned to
seasons = {
    'Spring': [3, 4, 5],
    'Summer': [6, 7, 8],
    'Autumn': [9, 10, 11],
    'Winter': [12, 1, 2],
}


# Map a single month value to its season name
def get_season(month):
    """Return the name of the season whose month list contains ``month``.

    The seasons are scanned in the order they appear in ``seasons``. When
    ``month`` is found in none of the lists, 'Unknown' is returned instead.
    """
    matching_labels = (
        label
        for label, month_list in seasons.items()
        if month in month_list
    )
    return next(matching_labels, 'Unknown')

# Label every row with its season; a 'month' value that matches no season
# list is labelled 'Unknown' by get_season.
data['Season'] = data['month'].apply(get_season)

# Remove noise and outliers using the IQR method
def remove_outliers(df, column, whisker=1.5):
    """Drop the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : str
        Name of the column whose values are tested against the fences.
    whisker : float, default 1.5
        Multiple of the IQR used to place the fences. The default reproduces
        the conventional 1.5 * IQR rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not flagged as outliers, with their
        original index labels. Rows where ``column`` is NaN are kept, because
        NaN compares False against both fences.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr

    # Flag outliers and negate (rather than selecting the in-range rows
    # directly) so that NaN entries are retained.
    is_outlier = (values < lower_fence) | (values > upper_fence)
    return df[~is_outlier]

# Apply the outlier removal function to relevant columns.
# The filters run one after another on the same frame: each pass computes its
# quartiles on the rows that survived the previous columns, so the final row
# set depends on the order of this list. The target column 'Load' is included,
# so rows with extreme load values are dropped before the train/test split.
columns_to_clean = ['Temperature', 'Humidity', 'ALLSKY_SFC_UVB', 'ALLSKY_SFC_UV_INDEX', 'ALLSKY_SFC_UVA', 'QV2M', 'RH2M', 'PRECTOTCORR', 'WS10M', 'Inflation', 'Load']
for column in columns_to_clean:
    data = remove_outliers(data, column)

# Prepare the data.
# Every column except the target ('Load') and the derived 'Season' label is
# used as a model input, so 'month' and any other column present in the CSV
# remain in `features`.
# NOTE(review): this assumes all remaining columns are numeric — confirm
# against the CSV schema.
features = data.drop(columns=['Load', 'Season'])
target = data['Load']

# Train-test split: 20% of the rows are held out for testing, with a fixed
# seed so the split is repeatable.
# NOTE(review): train_test_split shuffles rows by default; if the rows are
# time-ordered, confirm that a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize models.
# Keys are the display names used in the results table. Estimators use their
# library defaults apart from the arguments shown; the tree-based models are
# seeded with random_state=42.
# NOTE(review): KNN and SVR receive the raw, unscaled features (no scaler is
# applied anywhere in this notebook) and the recorded SVM R2 is negative —
# consider standardizing the inputs for these two models.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'KNN': KNeighborsRegressor(),
    'SVM': SVR(),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
}

# Train and evaluate models.
# Each estimator is fitted on the training split and scored on the held-out
# split; one dict per model is collected in `results`.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
    # raises a TypeError.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # MAPE is kept as a fraction here (0.10 means 10%).
    mape = mean_absolute_percentage_error(y_test, y_pred)

    results.append({'Model': name, 'RMSE': rmse, 'R2': r2, 'MAPE': mape})

# Convert results to a DataFrame and display
results_df = pd.DataFrame(results)
print(results_df)

# Save the model performance metrics to a CSV file
# (relative path: written to the kernel's current working directory)
results_path = 'model_performance_metrics.csv'
results_df.to_csv(results_path, index=False)

# Show the whole table as the cell's rich output. `.head()` stops after five
# rows and would hide the sixth model (XGBoost) from the rendered table.
results_df


               Model         RMSE        R2      MAPE
0  Linear Regression  4810.255796  0.502850  0.103378
1      Decision Tree  1237.525483  0.967095  0.010723
2      Random Forest  1086.553027  0.974634  0.017333
3                KNN  2513.862990  0.864220  0.045709
4                SVM  7114.930867 -0.087658  0.138871
5            XGBoost  1167.087152  0.970734  0.023017


Unnamed: 0,Model,RMSE,R2,MAPE
0,Linear Regression,4810.255796,0.50285,0.103378
1,Decision Tree,1237.525483,0.967095,0.010723
2,Random Forest,1086.553027,0.974634,0.017333
3,KNN,2513.86299,0.86422,0.045709
4,SVM,7114.930867,-0.087658,0.138871


In [2]:

# Helper function to calculate and return the evaluation metrics
def evaluate_model(model, X_test, y_test):
    """Score an already-fitted regressor on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Feature rows passed to ``model.predict``.
    y_test : array-like
        True target values corresponding to ``X_test``.

    Returns
    -------
    tuple
        ``(rmse, r2, mape)``, where ``rmse`` is the root mean squared error,
        ``r2`` the coefficient of determination and ``mape`` the mean
        absolute percentage error expressed in percent.
    """
    y_pred = model.predict(X_test)
    # Take the square root of the MSE explicitly: the `squared=False` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
    # raises a TypeError.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100  # Convert MAPE to percentage
    return rmse, r2, mape

# Train and evaluate models.
# Every estimator in `models` is fitted again here, although the first cell
# already fitted the same objects on the same split. `results` is rebuilt from
# scratch, this time with MAPE in percent under the 'MAPE (%)' key.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    rmse, r2, mape = evaluate_model(model, X_test, y_test)
    results.append({'Model': name, 'RMSE': rmse, 'R2': r2, 'MAPE (%)': mape})


In [3]:
# Convert results to a DataFrame and display.
# This rebinds `results_df` to the percent-MAPE table. Nothing is written to
# disk here, so the CSV saved by the first cell still holds the fractional
# 'MAPE' column.
results_df = pd.DataFrame(results)
print(results_df)

               Model         RMSE        R2   MAPE (%)
0  Linear Regression  4810.255796  0.502850  10.337822
1      Decision Tree  1237.525483  0.967095   1.072338
2      Random Forest  1086.553027  0.974634   1.733261
3                KNN  2513.862990  0.864220   4.570888
4                SVM  7114.930867 -0.087658  13.887150
5            XGBoost  1167.087152  0.970734   2.301658
