In [155]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.impute import SimpleImputer

In [156]:
df=pd.read_csv('Energy Consumption/train.csv')
df.isnull().sum()

Id                                         0
date                                       0
Lagging_Current_Reactive.Power_kVarh       0
Leading_Current_Reactive_Power_kVarh     119
CO2(tCO2)                                  0
Lagging_Current_Power_Factor               0
Leading_Current_Power_Factor              76
NSM                                        0
WeekStatus                              1774
Day_of_week                             1774
Load_Type                                  0
Usage_kWh                                  0
dtype: int64

In [157]:
X = df.drop(['Usage_kWh','Id','date'], axis=1)
y=df['Usage_kWh']

In [158]:
# Impute Missing Values

df.drop_duplicates(inplace=True)
numerical_features = ['Leading_Current_Reactive_Power_kVarh','Leading_Current_Power_Factor']
categorical_features = ['WeekStatus', 'Day_of_week']

numerical_imputer=SimpleImputer(strategy='mean')
X[numerical_features] = numerical_imputer.fit_transform(X[numerical_features])

categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])

In [159]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# Encode Categorical Features

df_X=X
ct=ColumnTransformer(transformers=[('encoder', OneHotEncoder(dtype=int),['WeekStatus', 'Day_of_week'])],remainder="passthrough")
X=ct.fit_transform(X) 
feature_names = ct.get_feature_names_out(df_X.columns) 
X = pd.DataFrame(X, columns=feature_names)

X.columns = X.columns.str.replace('remainder__', '')
X.columns = X.columns.str.replace('encoder__', '')

label_encoder = LabelEncoder()
X['Load_Type'] = label_encoder.fit_transform(X['Load_Type'])


print(X)

      WeekStatus_Weekday WeekStatus_Weekend Day_of_week_Friday  \
0                      1                  0                  0   
1                      1                  0                  0   
2                      1                  0                  0   
3                      1                  0                  0   
4                      1                  0                  0   
...                  ...                ...                ...   
27994                  1                  0                  1   
27995                  1                  0                  1   
27996                  1                  0                  1   
27997                  1                  0                  1   
27998                  1                  0                  1   

      Day_of_week_Monday Day_of_week_Saturday Day_of_week_Sunday  \
0                      1                    0                  0   
1                      1                    0                  0   
2  

In [160]:
from sklearn.feature_selection import mutual_info_regression

# Apply Feature Engineering Using Mutual Info  Regression (MIR) to reduce dimensionality 

mi = mutual_info_regression(X, y)
selected_features = X.columns[mi > 0.1]  
X = X[selected_features]
print(X)

      Lagging_Current_Reactive.Power_kVarh  \
0                                     2.95   
1                                     4.46   
2                                     3.28   
3                                     3.56   
4                                      4.5   
...                                    ...   
27994                                32.62   
27995                                35.46   
27996                                30.92   
27997                                58.18   
27998                                40.46   

      Leading_Current_Reactive_Power_kVarh CO2(tCO2)  \
0                                      0.0       0.0   
1                                      0.0       0.0   
2                                      0.0       0.0   
3                                      0.0       0.0   
4                                      0.0       0.0   
...                                    ...       ...   
27994                                  0.0      0.02   

In [161]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_validate,learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
import matplotlib.pyplot as plt

In [162]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Define the number of folds for cross-validation
n_splits = 5
degree = 3  # Best degree determined from previous analysis

# Initialize lists to store performance metrics for each fold
r2_scores = []
rmse_scores = []

# Initialize KFold with 5 folds
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Create polynomial features
    poly_features = PolynomialFeatures(degree=degree)
    X_train_poly = poly_features.fit_transform(X_train)
    X_val_poly = poly_features.transform(X_val)

    # Scale features
    best_scaler = StandardScaler()
    X_train_scaled = best_scaler.fit_transform(X_train_poly)
    X_val_scaled = best_scaler.transform(X_val_poly)

    # Train Linear Regression model
    best_model = LinearRegression()
    best_model.fit(X_train_scaled, y_train)

    # Predict on the validation set
    y_val_pred = best_model.predict(X_val_scaled)

    # Calculate R2 score and RMSE
    r2 = r2_score(y_val, y_val_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    # Append scores to lists
    r2_scores.append(r2)
    rmse_scores.append(rmse)

# Calculate average R2 score and RMSE across all folds
avg_r2_score = np.mean(r2_scores)
avg_rmse_score = np.mean(rmse_scores)

# Print average R2 score and RMSE
print(f"Average R2 Score: {avg_r2_score}")
print(f"Average RMSE: {avg_rmse_score}")

Average R2 Score: 0.9983869122965088
Average RMSE: 1.3617035848439438


In [133]:
# # Plot Learning Curve of Best Degree
# from sklearn.linear_model import LinearRegression
# # Polynomial Linear Regression Model

# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.pipeline import make_pipeline

# k = 5
# kf = KFold(n_splits=k, shuffle=True, random_state=42)


# degree=2



# print(f"\nPolynomial Degree: {degree}")

# rmse_val, mae_val, r2_val = [], [], []
# rmse_test, mae_test, r2_test = [], [], []

# for fold, (train_index, test_index) in enumerate(kf.split(X)):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     train_size = int(len(X_train) * 0.7)
#     val_size = test_size = int(len(X_train) * 0.15)

#     X_train, X_val, X_test = X_train.iloc[:train_size], X_train.iloc[train_size:train_size + val_size], X_train.iloc[train_size + val_size:]
#     y_train, y_val, y_test = y_train.iloc[:train_size], y_train.iloc[train_size:train_size + val_size], y_train.iloc[train_size + val_size:]

#     best_scaler = StandardScaler()
#     X_train_scaled = best_scaler.fit_transform(X_train)
#     X_val_scaled = best_scaler.transform(X_val)
#     X_test_scaled = best_scaler.transform(X_test)

#     best_model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
#     best_model.fit(X_train_scaled, y_train)

#     y_val_pred = best_model.predict(X_val_scaled)

#     rmse_val.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
#     mae_val.append(mean_absolute_error(y_val, y_val_pred))
#     r2_val.append(r2_score(y_val, y_val_pred))

#     y_test_pred = best_model.predict(X_test_scaled)

#     rmse_test.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))
#     mae_test.append(mean_absolute_error(y_test, y_test_pred))
#     r2_test.append(r2_score(y_test, y_test_pred))

#     # avg_rmse_val.append(np.mean(rmse_val))
#     # avg_mae_val.append(np.mean(mae_val))
#     # avg_r2_val.append(np.mean(r2_val))

#     # avg_rmse_test.append(np.mean(rmse_test))
#     # avg_mae_test.append(np.mean(mae_test))
#     # avg_r2_test.append(np.mean(r2_test))

# print("\nAverage Metrics:")
# print("Validation Set:")
# print(f"Average RMSE: {np.mean(rmse_val):.4f}")
# print(f"Average MAE: {np.mean(mae_val):.4f}")
# print(f"Average R-squared: {np.mean(r2_val):.4f}")

# print("\nTest Set:")
# print(f"Average RMSE: {np.mean(rmse_test):.4f}")
# print(f"Average MAE: {np.mean(mae_test):.4f}")
# print(f"Average R-squared: {np.mean(r2_test):.4f}")


# # best_degree = degrees_to_try[np.argmin(avg_rmse_val)]
# # print(f"\nBest Polynomial Degree based on Average RMSE : {best_degree}")

# # print("\nAverage Performance Metrics for the Best Degree on Validation Set:")
# # print(f"Average RMSE: {avg_rmse_val[degrees_to_try.index(best_degree)]:.4f}")
# # print(f"Average MAE: {avg_mae_val[degrees_to_try.index(best_degree)]:.4f}")
# # print(f"Average R-squared: {avg_r2_val[degrees_to_try.index(best_degree)]:.4f}")

# # print("\nAverage Performance Metrics for the Best Degree on Test Set:")
# # print(f"Average RMSE: {avg_rmse_test[degrees_to_try.index(best_degree)]:.4f}")
# # print(f"Average MAE: {avg_mae_test[degrees_to_try.index(best_degree)]:.4f}")
# # print(f"Average R-squared: {avg_r2_test[degrees_to_try.index(best_degree)]:.4f}")



Polynomial Degree: 2

Average Metrics:
Validation Set:
Average RMSE: 1.8359
Average MAE: 1.2170
Average R-squared: 0.9964

Test Set:
Average RMSE: 2.0291
Average MAE: 1.3469
Average R-squared: 0.9956


In [163]:
df_test=pd.read_csv('Energy Consumption/test.csv')
df_test.isna().sum()

Id                                        0
date                                      0
Lagging_Current_Reactive.Power_kVarh      0
Leading_Current_Reactive_Power_kVarh      0
CO2(tCO2)                                 0
Lagging_Current_Power_Factor              0
Leading_Current_Power_Factor              0
NSM                                       0
WeekStatus                              692
Day_of_week                             692
Load_Type                                 0
dtype: int64

In [164]:
X_testing = df_test.drop(['Id','date','WeekStatus','Day_of_week'], axis=1)
print(X_testing.shape)

(7041, 7)


In [165]:
X_testing['Load_Type'] = label_encoder.transform(X_testing['Load_Type'])

print(X_testing)

      Lagging_Current_Reactive.Power_kVarh  \
0                                    63.68   
1                                    54.18   
2                                    55.04   
3                                    36.40   
4                                    35.17   
...                                    ...   
7036                                  4.86   
7037                                  3.74   
7038                                  3.17   
7039                                  3.06   
7040                                  3.02   

      Leading_Current_Reactive_Power_kVarh  CO2(tCO2)  \
0                                     0.00       0.05   
1                                     0.00       0.04   
2                                     0.00       0.04   
3                                     0.00       0.03   
4                                     0.00       0.03   
...                                    ...        ...   
7036                                  0.00      

In [166]:
X_testing_poly = poly_features.transform(X_testing)

# Scale test data using the same scaler as used for training
X_testing_scaled = best_scaler.transform(X_testing_poly)

predictions=best_model.predict(X_testing_scaled)
print(X_testing.shape)
print(predictions)


(7041, 7)
[99.09531595 89.48389031 91.02740018 ...  4.29403783  4.56496796
  3.9509105 ]


In [167]:
predictions_df = pd.DataFrame({'Id': df_test['Id'], 'Usage_kWh': predictions})

predictions_df.shape



(7041, 2)

In [125]:
predictions_df.to_csv('predictions.csv', index=False)