In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm


In [2]:
# Load the cleaned BANES energy readings, parsing the "time" column as datetimes.
df_banes = pd.read_csv(
    filepath_or_buffer="BANES_cleaned_final.csv",
    parse_dates=["time"],
)
df_banes.head()


Unnamed: 0,time,location,postcode,energy
0,2006-10-01 00:30:00+00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,32.8
1,2006-10-01 01:00:00+00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,33.0
2,2006-10-01 01:30:00+00:00,Bath Central Library Electricity (HH),BA1 5AL,5.9
3,2006-10-01 02:00:00+00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,32.2
4,2006-10-01 02:30:00+00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,31.3


In [3]:
# Load the 30-minute weather readings, parse "time" as datetimes and discard
# the leftover "Unnamed: 0" column stored in the CSV.
df_weather = pd.read_csv(
    "Bath_hourly_cleaned_30min.csv",
    parse_dates=["time"],
).drop(columns="Unnamed: 0")

df_weather.head()


Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco
0,2006-10-01 00:00:00,15.1,14.1,94.0,0.0,0.0,170.0,13.0,,1001.9,
1,2006-10-01 00:30:00,14.95,13.8,93.0,0.0,0.0,175.0,13.9,,1001.8,
2,2006-10-01 01:00:00,14.8,13.5,92.0,0.0,0.0,180.0,14.8,,1001.7,
3,2006-10-01 01:30:00,14.75,13.35,91.5,0.0,0.0,180.0,13.9,,1001.7,
4,2006-10-01 02:00:00,14.7,13.2,91.0,0.0,0.0,180.0,13.0,,1001.7,


In [4]:
# Summarise the BANES frame: row count, column names, dtypes and memory use.
df_banes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7980288 entries, 0 to 7980287
Data columns (total 4 columns):
 #   Column    Dtype              
---  ------    -----              
 0   time      datetime64[ns, UTC]
 1   location  object             
 2   postcode  object             
 3   energy    float64            
dtypes: datetime64[ns, UTC](1), float64(1), object(2)
memory usage: 243.5+ MB


In [5]:
# Summarise the weather frame: row count, non-null counts, dtypes and memory use.
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234146 entries, 0 to 234145
Data columns (total 11 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   time    234146 non-null  datetime64[ns]
 1   temp    234146 non-null  float64       
 2   dwpt    234146 non-null  float64       
 3   rhum    234146 non-null  float64       
 4   prcp    234146 non-null  float64       
 5   snow    234146 non-null  float64       
 6   wdir    234146 non-null  float64       
 7   wspd    234146 non-null  float64       
 8   wpgt    12339 non-null   float64       
 9   pres    234146 non-null  float64       
 10  coco    16926 non-null   float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 19.7 MB


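The BANES `time` column is timezone-aware (`datetime64[ns, UTC]`), while the weather `time` column is timezone-naive (`datetime64[ns]`). Drop the timezone from the BANES column so that both have type `datetime64[ns]` and can be merged on `time`.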

In [6]:
# Drop the UTC timezone from the BANES timestamps
# (datetime64[ns, UTC] -> datetime64[ns]); the clock values stay the same.
df_banes["time"] = df_banes["time"].dt.tz_localize(tz=None)

df_banes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7980288 entries, 0 to 7980287
Data columns (total 4 columns):
 #   Column    Dtype         
---  ------    -----         
 0   time      datetime64[ns]
 1   location  object        
 2   postcode  object        
 3   energy    float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 243.5+ MB


In [7]:
# Attach the weather readings to every energy reading by joining on the
# timezone-naive "time" column.
# NOTE(review): this treats the weather timestamps as UTC, like the BANES
# ones were before their timezone was dropped -- confirm against the source.
# validate="many_to_one" makes pandas raise a MergeError if a timestamp occurs
# more than once in df_weather, which would otherwise silently duplicate
# energy rows.
df_merged = pd.merge(df_banes, df_weather, on="time", validate="many_to_one")

# The default inner join silently drops energy readings that have no weather
# match; fail loudly here instead of exporting an incomplete dataset.
assert len(df_merged) == len(df_banes), (
    f"{len(df_banes) - len(df_merged)} energy rows have no matching weather row"
)

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7980288 entries, 0 to 7980287
Data columns (total 14 columns):
 #   Column    Dtype         
---  ------    -----         
 0   time      datetime64[ns]
 1   location  object        
 2   postcode  object        
 3   energy    float64       
 4   temp      float64       
 5   dwpt      float64       
 6   rhum      float64       
 7   prcp      float64       
 8   snow      float64       
 9   wdir      float64       
 10  wspd      float64       
 11  wpgt      float64       
 12  pres      float64       
 13  coco      float64       
dtypes: datetime64[ns](1), float64(11), object(2)
memory usage: 913.3+ MB


In [8]:
# Count the missing values in each column of the merged frame.
df_merged.isna().sum()

time              0
location          0
postcode          0
energy            0
temp              0
dwpt              0
rhum              0
prcp              0
snow              0
wdir              0
wspd              0
wpgt        7381489
pres              0
coco        7173801
dtype: int64

In [9]:
# Preview the first five rows of the merged frame.
df_merged.head()

Unnamed: 0,time,location,postcode,energy,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco
0,2006-10-01 00:30:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,32.8,14.95,13.8,93.0,0.0,0.0,175.0,13.9,,1001.8,
1,2006-10-01 00:30:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,32.8,14.95,13.8,93.0,0.0,0.0,175.0,13.9,,1001.8,
2,2006-10-01 01:00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,33.0,14.8,13.5,92.0,0.0,0.0,180.0,14.8,,1001.7,
3,2006-10-01 01:00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,33.0,14.8,13.5,92.0,0.0,0.0,180.0,14.8,,1001.7,
4,2006-10-01 01:30:00,Bath Central Library Electricity (HH),BA1 5AL,5.9,14.75,13.35,91.5,0.0,0.0,180.0,13.9,,1001.7,


Some rows appear twice at the beginning of the merged dataset. Are these duplicates already present in the BANES dataset, or were they introduced by the merge?

In [10]:
# Order the BANES readings chronologically (in place) so that rows sharing a
# timestamp end up next to each other, then preview the first rows.
df_banes.sort_values("time", axis=0, ascending=True, inplace=True)
df_banes.head()


Unnamed: 0,time,location,postcode,energy
0,2006-10-01 00:30:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,32.8
48,2006-10-01 00:30:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,32.8
49,2006-10-01 01:00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,33.0
1,2006-10-01 01:00:00,Guildhall Electricity Supply 1 (HH),BA1 5AW,33.0
2,2006-10-01 01:30:00,Bath Central Library Electricity (HH),BA1 5AL,5.9


The duplicated rows are already present in the BANES dataset, so the merge did not introduce them. Everything is fine; let's export the merged dataset.

In [11]:
# Write the merged energy + weather data to CSV; index=False leaves the row
# index out of the file.
df_merged.to_csv("BANES_weather_merged.csv", index=False)


# Machine Learning

The rest of this notebook will use the state-of-the-art algorithms reviewed with the 