<br>Table of Contents:
* [Import Libraries](#1)
* [Load Data](#2)
* [Data Preprocessing](#3)
    * [Data Cleaning](#3a)
    * [Data Transformation](#3b)
    * [Feature Selection/Extraction](#3c)
    * [Handling Imbalanced Data](#3d)
    * [Data Reduction](#3e)
* [Exploratory Data Analysis (EDA)](#4)
* [Selecting and Training the Model](#5) 
* [Model Evaluation](#6) 
* [Model Optimization](#7) 
* [Model Deployment](#8) 

<a id="1"></a> <br>
## Import Libraries

In [32]:
# Data Analysis     
import pandas as pd          # data analysis library for handling structured data   
from pandas import DataFrame            
import numpy as np           # mathematical library for working with numerical data
from pandas.plotting import parallel_coordinates 
from statsmodels.tsa.seasonal import seasonal_decompose # library for performing statistical analysis
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typing import List
import pickle
import calendar 
import scipy.stats as stats
import datetime 

# Visualization
import matplotlib.pyplot as plt     # data visualization library for creating graphs and charts
%matplotlib inline
import seaborn as sns        # data visualization library based on matplotlib for creating more attractive visualizations
import plotly.io as pio
import plotly.express as px   # interactive data visualization library
import plotly.graph_objects as go   # library for creating interactive graphs and charts
import matplotlib 
import kaleido 
import missingno as msno 

# Machine Learning/Time Series 
import tensorflow as tf
from prophet import Prophet
from xgboost import XGBRegressor 
#ML - Preprocessing data
from sklearn.preprocessing import FunctionTransformer, PowerTransformer #variable transformation 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder # Preprocessing feature scaling/categorical encoding
from sklearn.preprocessing import Normalizer, Binarizer 
from sklearn.model_selection import train_test_split    #split data into train and test 
#ML - Handling imbalanced data
from imblearn.over_sampling import RandomOverSampler 
from imblearn.under_sampling import RandomUnderSampler  
#ML - Create your model
from sklearn.linear_model import LinearRegression, LogisticRegression  #linear and logistics regression models
from sklearn.tree import DecisionTreeRegressor 
from sklearn.neighbors import KNeighborsRegressor
#ML - Evaluate model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix, accuracy_score, classification_report 
from sklearn.metrics import adjusted_rand_score, v_measure_score, homogeneity_score 
#ML - Tune your model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Ignore warnings
import warnings
warnings.filterwarnings("ignore") 

pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

<a id="2"></a> <br>
## Load Data

In [124]:
# Read the semicolon-separated file. The 'Date' and 'Time' columns are parsed
# together into a single 'Datetime' column, which is then used as the index.
df = pd.read_csv(
    "Dataset/household_power_consumption.txt",
    sep=";",
    parse_dates={"Datetime": ["Date", "Time"]},
    infer_datetime_format=True,
    low_memory=False,
    index_col="Datetime",
)

df.head(4)

>> Data Description

In [None]:
# info() prints its report itself, so it goes first; describe() only returns a
# DataFrame, which the notebook renders when it is the last expression of the cell.
df.info()
df.describe()

>> Visualize Dataframe

In [None]:
def visualize_df(df, title='PJM Energy Consumption', xaxis_title='Date',
                 yaxis_title='Energy Consumption (MW)', renderer='svg'):
    """
    Plots every column of the dataframe as a line against the dataframe index.

    Best suited to time series data whose index is a datetime index.

    Args:
        df: A Pandas DataFrame; each column becomes one trace, the index is the x-axis.
        title: Figure title. Defaults to the original hard-coded title.
        xaxis_title: Label of the x-axis.
        yaxis_title: Label of the y-axis.
        renderer: Plotly renderer name passed to ``fig.show``.

    Returns:
        Whatever ``fig.show`` returns; the figure is displayed as a side effect.

    Example:
        >>> visualize_df(my_df, title='Household power consumption', yaxis_title='kW')
    """
    fig = go.Figure(layout=go.Layout(height=500, width=800))

    # One trace per column, all sharing the index as x values
    for col in df.columns:
        fig.add_trace(go.Scatter(x=df.index, y=df[col], name=col))

    fig.update_layout(
        title={
            'text': title,
            'font': {'size': 25, 'family': 'Arial', 'color': 'black'}
        },
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title
    )

    return fig.show(renderer=renderer)

visualize_df(df)

>> Data Profiling

In [None]:
def do_data_profiling(df, filename):
    '''
    Function to do basic data profiling
    Required Input - 
        - df = Pandas DataFrame
        - filename = Path for output file with a .html extension
    Expected Output -
        - HTML file with data profiling summary written to `filename`
    '''
    # Imported here because the notebook's import cell does not import this package;
    # without it the function fails with a NameError.
    import pandas_profiling

    profile = pandas_profiling.ProfileReport(df)
    profile.to_file(output_file = filename)
    print("Data profiling done")

# The output path must be a string; the bare name data_profiling.html raised a NameError.
do_data_profiling(df, "data_profiling.html")

<a id="3"></a> <br>
## Data Pre-processing

<a id="3a"></a> <br>
### Data Cleaning

>> Basic formatting (renaming cols, duplicates detection, datetime etc.)

In [None]:
# Remove duplicates
# NOTE(review): drop_duplicates compares column values only, not the index, so rows with
# identical readings at different timestamps are removed as well - confirm this is intended.
df.drop_duplicates(inplace=True)

# Formatting columns
# TODO: this step was left unfinished (a dangling `df.` here was a SyntaxError that
# prevented the whole cell from running).

# Rename columns (labels that are not present in df are ignored by rename)
df.rename(columns={'price': 'selling_price', 'bedrooms': 'num_bedrooms'}, inplace=True)

#replace non numeric columns
def replace_non_numeric(df: pd.DataFrame, columns):
    """
    Converts the specified columns to numeric and drops every row whose value
    in one of those columns is missing or cannot be parsed as a number.

    The dataframe is modified in place and also returned.

    Parameters:
        df (pd.DataFrame): The dataframe to process (mutated in place).
        columns (list): A list of column names to convert.

    Returns:
        pd.DataFrame: The same dataframe, with the listed columns numeric and
        the rows holding missing / non-numeric values in them removed.
    """
    for col in columns:
        # Unparseable values become NaN here ...
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # ... and are dropped together with values that were already missing.
        # A list is passed to `subset` because older pandas releases reject a scalar label.
        df.dropna(subset=[col], inplace=True)
    return df

>> format datetime

In [None]:
def convert_timestamp(ts):
    """
    Converts a Unix timestamp to a timezone-naive pandas Timestamp in UTC.

    Args:
        ts (int | float): The Unix timestamp (seconds since the epoch) to convert.

    Returns:
        pd.Timestamp: The corresponding UTC date and time, truncated to whole
        seconds (it displays as 'YYYY-MM-DD HH:MM:SS').
    """
    # Build an aware UTC datetime (utcfromtimestamp is deprecated), then strip the
    # tz info and sub-second part so the result matches the previous
    # format-to-string-and-parse-back behaviour without the round trip.
    utc_datetime = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
    return pd.Timestamp(utc_datetime.replace(tzinfo=None, microsecond=0))

# NOTE(review): `ts` is not defined anywhere in the visible notebook - assign a Unix timestamp before running this cell.
convert_timestamp(ts) 

>> Remove unwanted Data

In [None]:
# Drop the columns that are not needed for the analysis
df.drop(columns=['id', 'date'], inplace=True)

>> Missing values

In [None]:
def missing_value_analysis(df):
    """
    Draws the two missingno overview charts for a dataframe.

    Required Input -
        - df = Pandas DataFrame
    Expected Output -
        - missingno matrix chart of the dataframe
        - missingno heatmap chart of the dataframe
    """
    # Same two charts, in the same order: matrix first, then heatmap
    for draw_chart in (msno.matrix, msno.heatmap):
        draw_chart(df)

def view_NaN(df):
    """
    Reports, column by column, how many NaN values a Pandas DataFrame holds.

    One line is printed per column: either the NaN count, or a note that the
    column has none.

    Parameters:
        - df: Pandas DataFrame

    Returns:
        - None
    """
    for name in df.columns:
        n_missing = df[name].isnull().sum()
        if n_missing > 0:
            print(f"there is {n_missing} NaN present in column:", name)
        else:
            print("No NaN present in column:", name)

# Charts first, then the per-column NaN summary
missing_value_analysis(df)
view_NaN(df)

In [None]:
# Replace missing values
# df.fillna(df.mean(), inplace=True)

def treat_missing_numeric(df,columns,how = 'mean', value = None):
    '''
    Function to treat missing values in numeric columns (the dataframe is modified in place)
    Required Input - 
        - df = Pandas DataFrame
        - columns = List input of all the columns need to be imputed
        - how = valid values are 'mean', 'mode', 'median', 'ffill', 'digit'
                (fill with `value`), or a number (used directly as the fill value)
        - value = constant used when how == 'digit'
    Expected Output -
        - First 5 rows of the dataframe after imputation (handy for display in a notebook);
          the full result is the mutated `df` itself
    Raises -
        - ValueError if how == 'digit' and no value is given
    '''
    import numbers

    # A bare number passed as `how` is treated as the constant to fill with
    if isinstance(how, numbers.Number) and not isinstance(how, bool):
        how, value = 'digit', how

    labels = {'mean': 'mean', 'mode': 'mode', 'median': 'median', 'ffill': 'forward fill'}
    if how == 'digit':
        if value is None:
            raise ValueError("how='digit' requires a numeric `value` to fill with")
        labels['digit'] = value

    if how not in labels:
        print("Missing value fill cannot be completed")
        return df.head(5)

    for col in columns:
        print("Filling missing values with {0} for columns - {1}".format(labels[how], col))
        series = df[col]
        if how == 'mean':
            df[col] = series.fillna(series.mean())
        elif how == 'median':
            df[col] = series.fillna(series.median())
        elif how == 'mode':
            # mode() returns a Series; passing it to fillna would align on the index
            # and leave most gaps unfilled, so use its first (smallest) value.
            modes = series.mode()
            if not modes.empty:   # an all-NaN column has no mode; leave it untouched
                df[col] = series.fillna(modes.iloc[0])
        elif how == 'ffill':
            df[col] = series.ffill()
        else:
            # Fill with the number itself; a string would turn the column into object dtype
            df[col] = series.fillna(value)

    return df.head(5)


# NOTE(review): `smart_home` is not defined anywhere in the visible notebook (the frame loaded above is `df`) - confirm the intended dataframe and column.
treat_missing_numeric(smart_home, ["cloudCover"], how="digit", value = 0.1) 

>> Outliers

In [None]:
#visualize outliers
def visualize_outlier(df: pd.DataFrame):
    """
    Shows one boxplot per float64/int64 column of the dataframe in a single figure.

    Args:
        df: Pandas DataFrame; columns of other dtypes are left out of the plot.

    Returns:
        None. The figure is displayed.
    """
    numeric_frame = df.select_dtypes(include=["float64", "int64"])

    # The figure ends up 10x6 inches, so it is created at that size directly
    fig, ax = plt.subplots(figsize=(10, 6))
    numeric_frame.boxplot(ax=ax, rot=90)
    ax.set_xlabel("Numeric Columns")

    # Leave room at the bottom so the rotated column labels are not cut off
    fig.subplots_adjust(bottom=0.4)
    plt.show()

visualize_outlier(df)

<a id="3b"></a> <br>
### Data Transformation (scaling, encoding categorical data)

>> Categorical Variable Encoding (data transformation)

In [None]:
# Label Encoding: replaces each category with an integer label. In scikit-learn, LabelEncoder is intended for the 
#     target (y); the integers follow the sorted order of the labels, so they do not express any ranking of the categories.

# Ordinal encoding: maps each category of a feature to an integer following an order you specify. This technique is 
#     suitable for data where the categories have an intrinsic order, such as "low," "medium," and "high."
#     Linear models read the integers as magnitudes; tree-based (non-linear) models are less sensitive to the chosen order.
#     (Assigning values based on how frequent each category is, is a separate technique: count/frequency encoding.)

# One-hot encoding: creates a binary variable for each category, indicating its presence or absence. 
#     This technique is suitable for data where the categories do not have an intrinsic order and the 
#     number of categories is small



>> Data Splitting 

In [None]:
def holdout_cv(X,y,size = 0.3, seed = 1):
    """
    Splits features and target into a training and a hold-out test set.

    Args:
        X: Feature matrix (pandas DataFrame or array-like).
        y: Target values (pandas Series/DataFrame or array-like).
        size: Fraction of the samples placed in the test set.
        seed: Random state passed to train_test_split for reproducibility.

    Returns:
        X_train, X_test, y_train, y_test. Pandas objects are re-indexed from 0;
        features and target are re-indexed together so they stay aligned.

    Note:
        train_test_split shuffles rows by default, which is usually not what you
        want for time series data.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = size, random_state = seed)

    def _renumber(part):
        # numpy arrays carry no index, so only pandas objects are re-indexed
        return part.reset_index(drop=True) if hasattr(part, "reset_index") else part

    # Resetting X but not y would leave the two with different index labels
    return _renumber(X_train), _renumber(X_test), _renumber(y_train), _renumber(y_test)

X_train, X_test, y_train, y_test = holdout_cv(X, y, size=0.3, seed=1)

In [None]:
def plot_data_splitting(train, test, column='PJME_MW', split_date='2015-09-01'):
    """
    Plots the training and test sets of a time series on one figure.

    Args:
    train (pandas.DataFrame): Training set; its index is used as the x-axis.
    test (pandas.DataFrame): Test set; its index is used as the x-axis.
    column (str): Name of the column to plot from both frames.
    split_date: x position of the dashed vertical line marking the split;
        pass None to omit the line.

    Returns:
    None
    """
    plt.figure(figsize=(20,8))

    plt.plot(train.index, train[column], label='Training Set')
    plt.plot(test.index, test[column], label='Test Set')

    plt.title('Data Splitting', weight='bold', fontsize=25, loc= "center", pad=20)
    if split_date is not None:
        plt.axvline(split_date, color='black', ls='--', lw=3)
    plt.legend()
    plt.show()

plot_data_splitting(train, test)

>> Feature Scaling (data transformation) - apply to train, and then to test

In [None]:
#Z-score standardized feature scaling (most common)
#min-max scaling (common) 

def standardize_data(X_train, X_test):
    """
    Standardizes the training and testing data using the mean and standard deviation
    learned from the training set only.

    Args:
    - X_train: numpy array or pandas dataframe, training data
    - X_test: numpy array or pandas dataframe, testing data

    Returns:
    - X_train_scaled: standardized training data
    - X_test_scaled: standardized testing data
      (numpy arrays under scikit-learn's default output configuration)
    """
    scaler = StandardScaler()

    # Learn mean/std from the training set and scale it in one step
    X_train_scaled = scaler.fit_transform(X_train)
    # Reuse the training statistics on the test set to avoid leaking test information
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled

# The scaler must be fitted on the training split, and the result kept for later use
X_train_scaled, X_test_scaled = standardize_data(X_train, X_test)

>> Variable Transformation - apply to train, and then to test

In [None]:
#Variable transformation involves transforming the values of variables to make them more suitable for analysis.
#The idea is to make the variables (approximately) normally/Gaussian distributed.

#Hence, the first step is to assess normality using a histogram or Q-Q plot (to explore the variable distribution)

def diagnostic_plots(df, variable, skew_tol=0.5):
    """
    Plots a histogram and a normal Q-Q plot side by side for one variable and
    prints a short description of its skewness.

    Args:
        df: Pandas DataFrame holding the variable.
        variable: Name of a single column of `df`.
        skew_tol: Absolute skewness at or below which the variable is reported as
            approximately symmetric (0.5 is a common rule of thumb). Use 0 to
            report any non-zero skewness as skewed.

    Returns:
        None. The figure is displayed and the skewness message printed.
    """
    # Missing values would make the Q-Q plot fit meaningless, so they are left out
    values = df[variable].dropna()

    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(15, 6))

    # histogram
    values.hist(bins=30, ax=ax_hist)
    ax_hist.set_title(f"Histogram of {variable}")

    # q-q plot against the normal distribution
    stats.probplot(values, dist="norm", plot=ax_qq)
    ax_qq.set_title(f"Q-Q plot of {variable}")

    # check for skewness; sample skewness is practically never exactly 0,
    # hence the tolerance band around it
    skewness = values.skew()
    if skewness > skew_tol:
        skew_type = "positively skewed"
    elif skewness < -skew_tol:
        skew_type = "negatively skewed"
    else:
        skew_type = "approximately symmetric"

    # print message indicating skewness type
    print(f"The variable {variable} is {skew_type} (skewness = {skewness:.2f})")

    plt.show()

# Check function output
diagnostic_plots(X, "MedInc")

In [None]:
#If the variables are NOT normally distributed, we then transform it. It is necessary to test several variable 
# transformation methods, and choose the best for that feature. One variable transformation method is log_transform

#log transform 
def log_transform(df, columns):
    """
    Transforms specified columns of a pandas DataFrame with log(1 + x) (numpy.log1p).

    The input DataFrame is not modified.

    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame to transform.
    columns : list or str
        Column name(s) to transform. Names that are not columns of `df` are ignored.

    Returns:
    --------
    pandas DataFrame
        A copy of `df` with the selected columns transformed (as floats); all
        other columns are left unchanged.
    """
    if isinstance(columns, str):
        columns = [columns]
    wanted = set(columns)

    X_log = df.copy()
    for col in df.columns:
        if col in wanted:
            # Cast to float first so integer columns do not truncate the result
            X_log[col] = np.log1p(X_log[col].astype(float))
    return X_log

# NOTE(review): `columns` (the list of column names to transform) must be defined before this cell runs.
df_log = log_transform(df, columns)

# diagnostic_plots handles one column name per call, so check each transformed column in turn
for col in columns:
    diagnostic_plots(df_log, col)

>> Discretization (data transformation - apply to train, and then to test)

In [None]:
# Discretization in machine learning is the process of transforming continuous variables into discrete or 
# categorical variables. This process involves dividing the range of a continuous variable into a finite number of 
# intervals or bins, and then assigning each observation to a particular bin based on the value of the continuous 
# variable. 

#Discretization approaches: equal width, equal frequency, K means, Decision Trees



<a id="3c"></a> <br>
### Feature Selection/Extraction

In [None]:
# apply to train, and then to test

<a id="3d"></a> <br>
### Handling Imbalanced Data and Biases

In [None]:
# apply to train, and then to test

#Imbalanced data refers to a situation where the number of observations in one class or category is much larger 
# or smaller than the number of observations in other classes or categories. Imbalanced data can pose challenges 
# in machine learning because it can lead to biased models that perform poorly on the minority class.

#we typically look at the 'target data' when checking for imbalanced data. However, it is also important 
# to consider the features

def check_imbalance(dataset, columns=None, threshold=10):
    """
    Plots the class distribution of one or more columns and reports which of them
    are imbalanced.

    A column is considered imbalanced if its least frequent class accounts for
    less than `threshold` percent of the rows of `dataset`. Every requested
    column is plotted and checked, even after an imbalanced one has been found.

    Args:
        dataset: Pandas DataFrame.
        columns: Column names to check. If None, all columns except the last one.
        threshold: Minimum percentage the least frequent class must reach.

    Returns:
        True if any of the checked columns is imbalanced, False otherwise.
    """
    # If no columns are specified, use all columns except for the last one
    if columns is None:
        columns = dataset.columns[:-1]

    found_imbalance = False
    for col in columns:
        # value_counts sorts by frequency, most frequent first
        class_counts = dataset[col].value_counts()
        if class_counts.empty:
            # Nothing to plot or compare (e.g. a column with only missing values)
            print(f'{col} has no non-null values; skipped.')
            continue

        # Percentage of all rows (including rows where the column is missing)
        class_percentages = class_counts / len(dataset) * 100

        # Plot the class percentages
        plt.bar(class_counts.index, class_percentages)
        plt.xlabel(col)
        plt.ylabel('Percentage')
        plt.title(f'{col} Distribution')
        plt.show()

        # The last entry is the least frequent class
        minority_class = class_counts.index[-1]
        minority_class_percentage = class_percentages.iloc[-1]
        if minority_class_percentage < threshold:
            print(f'{col} is imbalanced. Minority class: {minority_class}, Percentage: {minority_class_percentage:.2f}%')
            found_imbalance = True

    if not found_imbalance:
        print('No imbalance found.')
    return found_imbalance

check_imbalance(df, columns=['target'], threshold=10)


<a id="3e"></a> <br>
### Data Reduction

<a id="4"></a> <br>
## Exploratory Data Analysis (EDA)

In [None]:
#visualize with individual plots 
def visualize_subplots_boxplots(df: DataFrame, columns: List[str], nrows: int, ncols: int,
                                x: str = 'Hour',
                                title: str = 'Hourly Average Energy Consumption',
                                save_path: str = "Images/xxx.png") -> None:
    """
    Creates a grid of subplots with one boxplot per column, grouped by `x`.

    Args:
        df: A Pandas DataFrame containing the `x` column and the columns to plot.
        columns: A list of column names to include in the boxplots.
        nrows: The number of rows in the subplot grid.
        ncols: The number of columns in the subplot grid.
        x: Name of the column used for grouping on the x-axis.
        title: Title shown above the grid.
        save_path: File the figure is saved to (missing parent folders are
            created); pass None or '' to skip saving.

    Returns:
        None. Displays the grid of boxplots.

    Raises:
        ValueError: If the grid has fewer cells than there are columns to plot.

    Example:
        >>> visualize_subplots_boxplots(my_df, ['Consumption', 'Generation'], 3, 4)
    """
    from pathlib import Path

    if len(columns) > nrows * ncols:
        raise ValueError(
            f"{len(columns)} columns do not fit in a {nrows}x{ncols} grid"
        )

    # squeeze=False keeps `axes` two-dimensional even for single-row/column grids
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 12), squeeze=False)
    fig.suptitle(title, weight='bold', fontsize=25)
    flat_axes = axes.flatten()

    for ax, col in zip(flat_axes, columns):
        sns.boxplot(data=df, x=x, y=col, ax=ax, color='#cc444b')

    # Remove every grid cell that did not receive a plot, not just the last one
    for unused_ax in flat_axes[len(columns):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    if save_path:
        target = Path(save_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(target, dpi=300, bbox_inches='tight')
    plt.show()

visualize_subplots_boxplots(df=result_2, columns=['AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM_MW', 'DUQ_MW',
        'EKPC_MW', 'FE_MW', 'NI_MW', 'PJME_MW', 'PJMW_MW'], nrows=6, ncols=2)

<a id="5"></a> <br>
## Selecting and Training the Model 

<a id="6"></a> <br>
## Model Evaluation

In [None]:
#Regression
    # Error = Actual value - Predicted value

    # MSE (Mean Squared Error)
        # The average of the squared errors over all samples is called the Mean Squared Error (MSE).
        # MSE = SUM(SQUARE(Actual value - Predicted value)) / Number of Samples
    # RMSE (Root Mean Squared Error)
        # RMSE = SQRT(MSE)
    # MAE (Mean Absolute Error)
        # MAE = SUM(ABSOLUTE(Actual value - Predicted value)) / Number of Samples


#Classification
    #Accuracy
    #Precision
    #Recall
    #F1 score 

# True Positives(TP): Number of samples that are correctly classified as positive, and their actual label is positive.

# False Positives (FP): Number of samples that are incorrectly classified as positive, when in fact their actual label 
#     is negative.

# True Negatives (TN): Number of samples that are correctly classified as negative, and their actual label is negative.

# False Negatives (FN): Number of samples that are incorrectly classified as negative, when in fact their actual label 
#     is positive.


<a id="7"></a> <br>
## Model Optimization

<a id="8"></a> <br>
## Model Deployment 