# Electricity consumtpion prediction

## Sections
1. Read and prepare datasets
2. Linear regression model
    - Define the model
    - Define the Cross-validation plan
    - Train the model
    - Evaluate the model
    - If necessary, refine the model
3. Random forest model
    - Define the model
    - Define the Cross-validation plan
    - Train the model
    - Evaluate the model
    - If necessary, refine the model
4. Compare performances of both models
5. Interpret the results

### 1. Read and prepare datasets

##### Import libraries

In [3]:
# Import libraries for plotting, reading and wrangling data
library(tidyverse)
library(scales)
library(here)

##### Read datasets

In [6]:
# Read electricity datasets: Hourly, Daily, Monthly
ElecHourDF  <-  read_csv(here::here("curated/electricity", "gold_hourly_electricity.csv"),   show_col_types = FALSE)
ElecDayDF   <-  read_csv(here::here("curated/electricity", "gold_daily_electricity.csv"),    show_col_types = FALSE)
ElecMonthDF <-  read_csv(here::here("curated/electricity", "gold_monthly_electricity.csv"),  show_col_types = FALSE) %>%
                mutate(year_month = factor(paste(year, month, sep = "-")))

# Read weather datasets: Hourly, Daily, Monthly
WeatherHourDF  <-   read_csv(here::here("curated/weather", "gold_hourly_weather.csv"),   show_col_types = FALSE)
WeatherDayDF   <-   read_csv(here::here("curated/weather", "gold_daily_weather.csv"),    show_col_types = FALSE)
WeatherMonthDF <-   read_csv(here::here("curated/weather", "gold_monthly_weather.csv"),  show_col_types = FALSE) %>%
                    mutate(year_month = factor(paste(year, month, sep = "-")))

##### Prepare dataframes

In [7]:
# Join hourly weather and electricity data
WEHourDF <- WeatherHourDF %>%
    left_join(ElecHourDF,   by = c("year", "month", "hour"), suffix = c("", "_elec")) %>%
    select(year, month, hour, avg_temp, avg_dewpt_temp, avg_rel_hum_pct, avg_wind_dir, avg_wind_spd, avg_visib, avg_stn_press, avg_hmdx, avg_wind_chill, consumption)
# Join daily weather and electricity data
WEDayDF <- WeatherDayDF %>%
    left_join(ElecDayDF,   by = c("date", "year", "month", "day"), suffix = c("", "_elec")) %>%
    select(date, year, month, day, avg_temp, avg_dewpt_temp, avg_rel_hum_pct, avg_wind_dir, avg_wind_spd, avg_visib, avg_stn_press, avg_hmdx, avg_wind_chill, consumption)
# Join monthly weather and electricity data
WEMonthDF <- WeatherMonthDF %>%
    left_join(ElecMonthDF,   by = c("year_month", "year", "month"), suffix = c("", "_elec")) %>%
    select(year_month, year, month, avg_temp, avg_dewpt_temp, avg_rel_hum_pct, avg_wind_dir, avg_wind_spd, avg_visib, avg_stn_press, avg_hmdx, avg_wind_chill, consumption)

### 2. Linear regression model

##### Define the linear regression model

In [9]:
#=========================
# FORMULAS
#=========================
# Define the model formula with no interactions between independant variables
fmla_no_inter <- as.formula("consumption ~ avg_temp + avg_dewpt_temp + avg_rel_hum_pct + avg_wind_dir + avg_visib")

# Define the model formula with interactions between independant variables
fmla_inter <- as.formula("consumption ~ avg_temp * avg_dewpt_temp * avg_rel_hum_pct * avg_wind_dir * avg_visib")

#=========================
# LINEAR REGRESSION MODELS
#=========================
# Define the linear regression model based on hourly consumption
lin_reg_mdl_hour_no_inter   <- lm(fmla_no_inter,    data = WEHourDF)
lin_reg_mdl_hour_inter      <- lm(fmla_inter,       data = WEHourDF)

# Define the linear regression model based on daily consumption
lin_reg_mdl_day_no_inter   <- lm(fmla_no_inter,     data = WEDayDF)
lin_reg_mdl_day_inter      <- lm(fmla_inter,        data = WEDayDF)

# Define the linear regression model based on monthly consumption
lin_reg_mdl_month_no_inter   <- lm(fmla_no_inter,     data = WEMonthDF)
lin_reg_mdl_month_inter      <- lm(fmla_inter,        data = WEMonthDF)

In [21]:
print("R-Squared")
print(summary(lin_reg_mdl_hour_no_inter)$r.squared)
print(summary(lin_reg_mdl_hour_inter)$r.squared)
print(summary(lin_reg_mdl_day_no_inter)$r.squared)
print(summary(lin_reg_mdl_day_inter)$r.squared)
print(summary(lin_reg_mdl_month_no_inter)$r.squared)
print(summary(lin_reg_mdl_month_inter)$r.squared)
print("Adjusted R-Squared")
print(summary(lin_reg_mdl_hour_no_inter)$adj.r.squared)
print(summary(lin_reg_mdl_hour_inter)$adj.r.squared)
print(summary(lin_reg_mdl_day_no_inter)$adj.r.squared)
print(summary(lin_reg_mdl_day_inter)$adj.r.squared)
print(summary(lin_reg_mdl_month_no_inter)$adj.r.squared)
print(summary(lin_reg_mdl_month_inter)$adj.r.squared)

[1] "R-Squared"
[1] 0.0322931
[1] 0.0370133
[1] 0.2404551
[1] 0.3702803
[1] 0.6798885
[1] 1
[1] "Adjusted R-Squared"
[1] 0.03228402
[1] 0.03695727
[1] 0.2352097
[1] 0.3423128
[1] 0.5909687
[1] NaN
