In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve

In [4]:
# Visualization
def visualizeSCF(feature, name):  # SCF = Single Categorical feature
    """Draw a count plot for a single categorical feature.

    Parameters
    ----------
    feature : array-like
        Categorical values; forwarded to ``sns.countplot`` as ``x``.
    name : str
        Human-readable feature name used in the title and the x-axis label.
    """
    plt.figure(figsize=(8, 6))
    sns.countplot(x=feature)
    # The name has to be embedded in the title string: a second positional
    # argument to plt.title is interpreted as ``fontdict``, not as extra text.
    plt.title(f'Countplot for: {name}')
    plt.xlabel(name)
    plt.ylabel('Count')
    plt.show()


def visualizeSNF(feature, name):  # SNF = Single Numerical feature
    """Show a 100-bin histogram with a KDE overlay for one numerical feature.

    Parameters
    ----------
    feature : array-like
        Numerical values; forwarded to ``sns.histplot`` as ``x``.
    name : str
        Human-readable feature name shown in the figure title.
    """
    figureSize = (8, 6)
    plt.figure(figsize=figureSize)
    sns.histplot(x=feature, bins=100, kde=True)
    plt.ylabel('Count')
    plt.title(f'Histogram for: {name}')
    plt.show()


In [5]:
# Models & Transformers
# Shared, module-level transformer instances. encode() and scale() refit these
# same objects on every call, so after preprocessing each one only holds the
# fit for the last column it was applied to.
encoder = LabelEncoder()
scaler = MinMaxScaler()
# Baseline estimator.
# NOTE(review): not referenced by the visible cells; presumably passed to
# simpleLinearRegression() later in the notebook - confirm.
model = LinearRegression()

In [6]:
def detectOutliersIqr(feature, threshold=1.5):
    """Flag outliers in a numeric pandas Series with the IQR rule.

    A value is an outlier when it lies below ``Q1 - threshold * IQR`` or above
    ``Q3 + threshold * IQR``. Missing values are ignored when the quartiles are
    computed and are never reported as outliers.

    Parameters
    ----------
    feature : pandas.Series
        Numeric column to inspect; its ``name`` and index are used in the result.
    threshold : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    dict
        ``featureName`` (the Series name), ``outlierIndices`` (index labels of
        the flagged rows) and ``outlierCount`` (how many rows were flagged).
    """
    print("Before detectOutliersIqr")
    # np.percentile propagates NaN: a single missing value would turn both
    # quartiles into NaN and make every comparison below False.
    Q1, Q3 = np.nanpercentile(feature, [25, 75])
    IQR = Q3 - Q1

    lowerBound = Q1 - threshold * IQR
    upperBound = Q3 + threshold * IQR

    # Find outliers
    outliers = feature[(feature < lowerBound) | (feature > upperBound)]
    print("After detectOutliersIqr")
    return {
        'featureName': feature.name,
        'outlierIndices': outliers.index.tolist(),
        'outlierCount': len(outliers)
    }



In [7]:
def transformDate(features):
    """Replace the raw ``date`` string column with calendar/time features.

    ``date`` is expected to look like ``'DD/MM/YYYY HH:MM'``. The function
    derives ``Month``, ``Hour``, ``Minute`` and ``Day`` (weekday name) from it
    and drops the original ``date`` and ``Day_of_week`` columns.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame containing at least the ``date`` and ``Day_of_week`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns appended after the remaining
        original columns.
    """
    print("Before transformDate")
    # Work on a copy so the caller's frame does not silently gain columns.
    features = features.copy()
    dateParts = features['date'].str.split(' ', n=1, expand=True)
    dates = pd.to_datetime(dateParts[0], format='%d/%m/%Y')
    # Parse the time-of-day once and reuse it for both Hour and Minute.
    times = pd.to_datetime(dateParts[1], format='%H:%M')
    features['Month'] = dates.dt.month
    features['Hour'] = times.dt.hour
    features['Minute'] = times.dt.minute
    features['Day'] = dates.dt.day_name()
    print("After transformDate")
    return features.drop(['date', 'Day_of_week'], axis=1)

def findWeekStatus(features):
    """Fill missing ``WeekStatus`` values from the weekday name in ``Day``.

    Rows whose ``WeekStatus`` is null become ``'Weekday'`` when ``Day`` is
    Monday-Friday and ``'Weekend'`` when it is Saturday or Sunday. Rows with
    any other ``Day`` value are left untouched, as are rows that already have
    a ``WeekStatus``.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame with ``Day`` and ``WeekStatus`` columns. It is updated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``features`` object, for call chaining.
    """
    print("Before findWeekStatus")
    weekdays = ['Friday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday']
    weekends = ['Saturday', 'Sunday']
    # Positional boolean masks replace the per-row .loc loop: they fill every
    # row in one pass and also work when index labels are not unique.
    missing = features['WeekStatus'].isnull().to_numpy()
    isWeekday = features['Day'].isin(weekdays).to_numpy()
    isWeekend = features['Day'].isin(weekends).to_numpy()
    features.loc[missing & isWeekday, 'WeekStatus'] = 'Weekday'
    features.loc[missing & isWeekend, 'WeekStatus'] = 'Weekend'
    print("After findWeekStatus")
    return features

In [9]:
# Variables
# Load the raw train/test CSVs (paths are relative to the working directory).
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Split predictors from the regression target; 'Id' is not used as a predictor.
features = data.drop(['Id', 'Usage_kWh'], axis=1)
target = data['Usage_kWh']
# Derive Month/Hour/Minute/Day from 'date', then fill missing 'WeekStatus'
# values from 'Day'. The same two steps are applied to the test frame.
features = transformDate(features)
features = findWeekStatus(features)
test = transformDate(test)
test = findWeekStatus(test)
# 70/30 train/validation split with a fixed seed so the split is repeatable.
xTrain, xVal, yTrain, yVal = train_test_split(features, target, test_size=0.3, random_state=42)
xTest = test.drop('Id', axis=1)
# NOTE(review): not used in the visible cells; presumably the first 'Id' of the
# test set (training Ids end at 27999) - confirm before relying on it.
START_ID = 28000

Before transformDate
After transformDate
Before findWeekStatus
After findWeekStatus
Before transformDate
After transformDate
Before findWeekStatus
After findWeekStatus


In [10]:
# Quick look at the raw training frame: its (rows, columns) shape followed by
# the summary statistics produced by DataFrame.describe().
print("Data Shape: ", data.shape)
print("Data Description: ",data.describe())

Data Shape:  (27999, 12)
Data Description:                  Id  Lagging_Current_Reactive.Power_kVarh  \
count  27999.00000                          27999.000000   
mean   14000.00000                             13.230912   
std     8082.75943                             16.469935   
min        1.00000                              0.000000   
25%     7000.50000                              2.740000   
50%    14000.00000                              5.040000   
75%    20999.50000                             22.820000   
max    27999.00000                             96.910000   

       Leading_Current_Reactive_Power_kVarh     CO2(tCO2)  \
count                          27880.000000  27999.000000   
mean                               3.881065      0.011790   
std                                7.495718      0.016354   
min                                0.000000      0.000000   
25%                                0.000000      0.000000   
50%                                0.000000      

In [11]:
# Normalization & Preprocessing

def encode(feature, xTrain, xVal, xTest):
    """Label-encode one column in place on the train, validation and test frames.

    The module-level ``encoder`` is fitted on the training column only; the
    mapping learned there is then applied to the validation and test columns.
    Because the shared ``encoder`` is refitted on every call, it only keeps the
    mapping of the most recently encoded column.

    Parameters
    ----------
    feature : str
        Name of the column to encode.
    xTrain, xVal, xTest : pandas.DataFrame
        Frames that each contain ``feature``; all three are modified in place.
    """
    print("Before encode")
    xTrain[feature] = encoder.fit_transform(xTrain[feature])
    # Validation and test reuse the mapping fitted on the training column.
    for split in (xVal, xTest):
        split[feature] = encoder.transform(split[feature])
    print("After encode")


In [13]:

def scale(feature, xTrain, xVal, xTest):
    """Rescale one column in place on the train, validation and test frames.

    The module-level ``scaler`` is fitted on the training column only and then
    applied unchanged to the validation and test columns. Because the shared
    ``scaler`` is refitted on every call, it only keeps the parameters of the
    most recently scaled column.

    Parameters
    ----------
    feature : str
        Name of the column to scale.
    xTrain, xVal, xTest : pandas.DataFrame
        Frames that each contain ``feature``; all three are modified in place.
    """
    print("Before scale")
    # The scaler works on 2-D input, hence the single-column reshape.
    trainColumn = xTrain[feature].values.reshape(-1, 1)
    xTrain[feature] = scaler.fit_transform(trainColumn)
    for split in (xVal, xTest):
        splitColumn = split[feature].values.reshape(-1, 1)
        split[feature] = scaler.transform(splitColumn)
    print("After scale")


In [14]:
def impute(feature, xTrain, xVal, xTest):
    """Fill missing values of one column with the training-set median.

    The median is taken from ``xTrain`` only and reused for the validation and
    test frames, so no statistic from those frames enters the imputation.

    Parameters
    ----------
    feature : str
        Name of the column to impute.
    xTrain, xVal, xTest : pandas.DataFrame
        Frames that each contain ``feature``; all three are modified in place.
    """
    print("Before impute")
    # Compute the statistic once, before any frame is modified, instead of
    # re-deriving it from the already-filled training column for each frame.
    trainMedian = xTrain[feature].median()
    for split in (xTrain, xVal, xTest):
        split[feature] = split[feature].fillna(trainMedian)
    print("After impute")


In [15]:
def learningCurve(model, xTrain, yTrain, degree=0):
    """Plot a 5-fold cross-validated R2 learning curve for ``model``.

    Ten training-set sizes from 10% to 100% of the available training data are
    evaluated. The mean score is drawn as a line and +/- one standard deviation
    across folds as a shaded band, for both training and cross-validation.

    Parameters
    ----------
    model : estimator
        scikit-learn compatible regressor handed to ``learning_curve``.
    xTrain, yTrain : array-like
        Training features and target.
    degree : int, default 0
        Polynomial degree used to build ``xTrain``. It only affects the figure
        title; ``0`` selects the plain linear-regression title.

    Returns
    -------
    estimator
        The ``model`` argument, returned as passed in.
    """
    # learning_curve translates the fractional sizes into absolute sample
    # counts and returns those, so they are plotted as-is. Multiplying them by
    # len(xTrain) again would inflate the x-axis by a factor of the data size.
    sampleCounts, trainScores, valScores = learning_curve(
        model, xTrain, yTrain, train_sizes=np.linspace(0.1, 1.0, 10), cv=5, scoring='r2'
    )
    trainScoresMean = np.mean(trainScores, axis=1)
    trainScoresStd = np.std(trainScores, axis=1)
    valScoresMean = np.mean(valScores, axis=1)
    valScoresStd = np.std(valScores, axis=1)

    # Plot learning curve
    plt.figure(figsize=(10, 6))
    plt.fill_between(sampleCounts, trainScoresMean - trainScoresStd,
                     trainScoresMean + trainScoresStd, alpha=0.1, color="r")
    plt.fill_between(sampleCounts, valScoresMean - valScoresStd,
                     valScoresMean + valScoresStd, alpha=0.1, color="g")
    plt.plot(sampleCounts, trainScoresMean, 'o-', color="r", label="Training score")
    plt.plot(sampleCounts, valScoresMean, 'o-', color="g", label="Cross-validation score")
    plt.xlabel("Number of training examples")
    plt.ylabel("R2 Score")
    if degree > 0:
        plt.title(f"Learning Curve for Polynomial Regression of Degree ({degree}) ")
    else:
        plt.title("Learning Curve for Linear Regression")
    plt.legend(loc="best")
    plt.grid(True)
    plt.show()

    return model

In [None]:
def simpleLinearRegression(model, xTrain, yTrain, xVal, yVal):
    """Fit ``model`` on the training split and print train/validation metrics.

    A learning curve is plotted first; the model is then fitted on the full
    training split, and R2 and mean squared error are printed for the training
    and validation data. Nothing is returned.

    Parameters
    ----------
    model : estimator
        scikit-learn compatible regressor; it is fitted in place.
    xTrain, yTrain : array-like
        Training features and target.
    xVal, yVal : array-like
        Validation features and target.
    """
    print("Simple Linear Regression")
    learningCurve(model, xTrain, yTrain)
    model.fit(xTrain, yTrain)

    # Training metrics
    trainPredictions = model.predict(xTrain)
    trainR2 = r2_score(yTrain, trainPredictions)
    print(f"Training R2 Score : {trainR2}")
    trainMSE = mean_squared_error(yTrain, trainPredictions)
    print(f"Training Mean Squared Error : {trainMSE}")

    # Validation metrics
    valPredictions = model.predict(xVal)
    valR2 = r2_score(yVal, valPredictions)
    print(f"Validation R2 Score : {valR2}")
    valMSE = mean_squared_error(yVal, valPredictions)
    print(f"Validation Mean Squared Error : {valMSE}")


In [None]:
def _searchRegularizationAlpha(estimatorClass, label, alphas, xTrain, yTrain, xVal, yVal):
    """Score ``estimatorClass`` for every alpha and return the best ``(alpha, R2, MSE)``.

    Each candidate is evaluated with ``comparePolynomialDegrees``; the alpha
    whose reported validation R2 is highest wins. ``label`` ("Ridge"/"Lasso")
    is only used in the printed messages.
    """
    bestR2 = -float('inf')
    bestMSE = float('inf')
    bestAlpha = None

    for alpha in alphas:
        candidate = estimatorClass(alpha=alpha, random_state=42)
        print(f"{label} Regression with Regularization Parameter: {alpha}")
        # The best polynomial degree is reported by the callee but not needed here.
        _, r2, MSE = comparePolynomialDegrees(candidate, xTrain, yTrain, xVal, yVal)

        # comparePolynomialDegrees can return None scores when it selected no
        # degree; skip those rather than failing on a None comparison.
        if r2 is not None and r2 > bestR2:
            bestR2 = r2
            bestMSE = MSE
            bestAlpha = alpha

    print(f"Best {label} Regularization Parameter (alpha): {bestAlpha}")
    print(f"{label} (alpha={bestAlpha}) R2 Score: {bestR2}")
    print(f"{label} (alpha={bestAlpha}) Mean Squared Error: {bestMSE}")
    return bestAlpha, bestR2, bestMSE


def ridgeRegularization(xTrain, yTrain, xVal, yVal, alphas=(0.01, 0.1, 1.0, 10.0)):
    """Pick the Ridge ``alpha`` with the highest validation R2.

    Parameters
    ----------
    xTrain, yTrain : array-like
        Training features and target.
    xVal, yVal : array-like
        Validation features and target.
    alphas : iterable of float, default (0.01, 0.1, 1.0, 10.0)
        Regularization strengths to try.

    Returns
    -------
    tuple
        ``(bestAlpha, bestR2, bestMSE)``; ``bestAlpha`` is ``None`` when no
        candidate produced a usable score.
    """
    return _searchRegularizationAlpha(Ridge, "Ridge", alphas, xTrain, yTrain, xVal, yVal)


def lassoRegularization(xTrain, yTrain, xVal, yVal, alphas=(0.01, 0.1, 1.0, 10.0)):
    """Pick the Lasso ``alpha`` with the highest validation R2.

    Parameters
    ----------
    xTrain, yTrain : array-like
        Training features and target.
    xVal, yVal : array-like
        Validation features and target.
    alphas : iterable of float, default (0.01, 0.1, 1.0, 10.0)
        Regularization strengths to try.

    Returns
    -------
    tuple
        ``(bestAlpha, bestR2, bestMSE)``; ``bestAlpha`` is ``None`` when no
        candidate produced a usable score.
    """
    return _searchRegularizationAlpha(Lasso, "Lasso", alphas, xTrain, yTrain, xVal, yVal)

In [16]:
def comparePolynomialDegrees(model, xTrain, yTrain, xVal, yVal, degrees=range(2, 5)):
    """Fit ``model`` on polynomial expansions of several degrees and compare them.

    For every degree the features are expanded with ``PolynomialFeatures``, a
    learning curve is plotted, the model is refitted on the expanded training
    data, and train/validation R2 and MSE are printed. A summary figure then
    shows both metrics against the degree.

    The selected degree is the one with the smallest absolute gap between
    training and validation R2, not the one with the highest validation R2.
    NOTE(review): this rule can favour a poorly fitting model whose two scores
    happen to agree; confirm it is the intended criterion.

    Parameters
    ----------
    model : estimator
        scikit-learn compatible regressor; it is refitted for every degree and
        is left fitted on the last degree tried.
    xTrain, yTrain : array-like
        Training features and target.
    xVal, yVal : array-like
        Validation features and target.
    degrees : iterable of int, default range(2, 5)
        Polynomial degrees to evaluate.

    Returns
    -------
    tuple
        ``(bestDegree, bestR2, bestMSE)`` where the scores are the validation
        metrics of the selected degree.

    Raises
    ------
    ValueError
        If ``degrees`` is empty.
    """
    degrees = list(degrees)
    if not degrees:
        raise ValueError("degrees must contain at least one polynomial degree")

    trainMetrics = []  # one [R2, MSE] row per degree
    valMetrics = []
    minDiff = float('inf')
    bestDegree = None
    bestR2 = None
    bestMSE = None

    for degree in degrees:
        poly = PolynomialFeatures(degree=degree)
        xPolyTrain = poly.fit_transform(xTrain)
        xPolyVal = poly.transform(xVal)
        learningCurve(model, xPolyTrain, yTrain, degree)
        model.fit(xPolyTrain, yTrain)

        yPredTrain = model.predict(xPolyTrain)
        trainR2 = r2_score(yTrain, yPredTrain)
        trainMSE = mean_squared_error(yTrain, yPredTrain)
        trainMetrics.append([trainR2, trainMSE])
        print(f"Training R2 Score (Degree {degree}): {trainR2}")
        print(f"Training MSE (Degree {degree}): {trainMSE}")

        yPredVal = model.predict(xPolyVal)
        valR2 = r2_score(yVal, yPredVal)
        valMSE = mean_squared_error(yVal, yPredVal)
        valMetrics.append([valR2, valMSE])
        print(f"Validation R2 Score (Degree {degree}): {valR2}")
        print(f"Validation MSE (Degree {degree}): {valMSE}")

        # Smallest train/validation R2 gap wins.
        diff = abs(valR2 - trainR2)
        if diff < minDiff:
            minDiff = diff
            bestDegree = degree
            bestR2 = valR2
            bestMSE = valMSE

    trainMetrics = np.array(trainMetrics)
    valMetrics = np.array(valMetrics)

    # R2 is a score that may be negative, so it is drawn untransformed on its
    # own axis; log1p(R2) is undefined for R2 <= -1. Only MSE, which is never
    # negative, is log-compressed.
    fig, (r2Axis, mseAxis) = plt.subplots(1, 2, figsize=(14, 6))
    r2Axis.plot(degrees, trainMetrics[:, 0], label='Train R2 Score', marker='o')
    r2Axis.plot(degrees, valMetrics[:, 0], label='Validation R2 Score', marker='o')
    r2Axis.set_xlabel('Degree of Polynomial Features')
    r2Axis.set_ylabel('R2 Score')
    r2Axis.legend()
    r2Axis.grid(True)

    mseAxis.plot(degrees, np.log1p(trainMetrics[:, 1]), label='Train MSE', marker='o')
    mseAxis.plot(degrees, np.log1p(valMetrics[:, 1]), label='Validation MSE', marker='o')
    mseAxis.set_xlabel('Degree of Polynomial Features')
    mseAxis.set_ylabel('Error (log scale)')
    mseAxis.legend()
    mseAxis.grid(True)

    fig.suptitle('Train and Validation Errors vs. Polynomial Model Complexity')
    plt.show()

    print(f"Best Degree: {bestDegree}, Minimum difference: {minDiff}")
    print(f"Degree ({bestDegree}) R2 Score: {bestR2}")
    print(f"Degree ({bestDegree}) Mean Squared Error: {bestMSE}")
    return bestDegree,bestR2,bestMSE


In [None]:
def compareRegularization(xTrain, yTrain, xVal, yVal):
    """Run the Ridge and Lasso searches and report whichever scores higher.

    Parameters
    ----------
    xTrain, yTrain : array-like
        Training features and target.
    xVal, yVal : array-like
        Validation features and target.

    Returns
    -------
    tuple
        ``(bestType, bestAlpha)`` where ``bestType`` is ``"Ridge"`` or
        ``"Lasso"``. Ridge is kept on an exact tie; both values are ``None``
        when neither search produced a score above negative infinity.
    """
    bestR2 = -float('inf')
    bestMSE = float('inf')
    bestAlpha = None
    bestType = None

    ridgeAlpha, ridgeR2, ridgeMSE = ridgeRegularization(xTrain, yTrain, xVal, yVal)
    lassoAlpha, lassoR2, lassoMSE = lassoRegularization(xTrain, yTrain, xVal, yVal)

    # Each candidate is compared against the running best. An if/elif chain
    # would stop at Ridge (any finite score beats -inf) and never look at Lasso.
    candidates = (
        ("Ridge", ridgeAlpha, ridgeR2, ridgeMSE),
        ("Lasso", lassoAlpha, lassoR2, lassoMSE),
    )
    for candidateType, candidateAlpha, candidateR2, candidateMSE in candidates:
        if candidateR2 > bestR2:
            bestR2 = candidateR2
            bestMSE = candidateMSE
            bestAlpha = candidateAlpha
            bestType = candidateType

    print("\nBest Model:", bestType)
    print(f"{bestType} R2 Score: {bestR2}")
    print(f"{bestType} Mean Squared Error: {bestMSE}")
    return bestType,bestAlpha

In [17]:
# Columns passed to encode() (label encoding fitted on the training split).
categoricalFeatures = ['WeekStatus', 'Load_Type', 'Day']
# Columns passed to impute() (missing values filled with the training median).
featuresWithNan = ['Leading_Current_Reactive_Power_kVarh', 'Leading_Current_Power_Factor']
# Columns passed to scale(); 'CO2(tCO2)' is appended at the call site below.
skewedFeatures = ['Lagging_Current_Power_Factor', 'Lagging_Current_Reactive.Power_kVarh'] + featuresWithNan
# All three helpers modify xTrain, xVal and xTest in place.
for feature in categoricalFeatures:
    encode(feature, xTrain, xVal, xTest)
# Imputation runs before scaling, so the scaler is fitted on already-imputed columns.
for feature in featuresWithNan:
    impute(feature, xTrain, xVal, xTest)
for feature in skewedFeatures + ['CO2(tCO2)']:
    scale(feature, xTrain, xVal, xTest)

Before encode
After encode
Before encode
After encode
Before encode
After encode
Before impute
After impute
Before impute
After impute
Before scale
After scale
Before scale
After scale
Before scale
After scale
Before scale
After scale
Before scale
After scale
