In [1]:
from datetime import datetime
from meteostat import Point, Hourly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('BANES_Energy_Data_Electricity_cleaned.csv', parse_dates=['date'])

df['date'].min(), df['date'].max()

(Timestamp('2006-10-01 00:00:00+0000', tz='UTC'),
 Timestamp('2020-02-07 00:00:00+0000', tz='UTC'))

We need to get the weather info from **01/10/2006** to **07/02/2020** (including).

In [3]:
start = datetime(2006, 10, 1)
end = datetime(2020, 2, 8) # plus one day, because the last day is not included in the data

# create point for Bath, UK
bath = Point(51.3751,-2.3617, 96)

# Get hourly data for Bath, UK
hourly = Hourly(bath, start, end)
data = hourly.fetch()

In [4]:
data.to_csv('Bath_hourly.csv')

We've downloaded the data, let's see if everything is alright.

In [19]:
df = pd.read_csv('Bath_hourly.csv', parse_dates=['time'])
df.head()


Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
0,2006-10-01 00:00:00,15.1,14.1,94.0,,,170.0,13.0,,1001.9,,
1,2006-10-01 01:00:00,14.8,13.5,92.0,,,180.0,14.8,,1001.7,,
2,2006-10-01 02:00:00,14.7,13.2,91.0,,,180.0,13.0,,1001.7,,
3,2006-10-01 03:00:00,14.4,13.1,92.0,,,180.0,9.4,,1001.5,,
4,2006-10-01 04:00:00,13.7,12.6,93.0,,,170.0,13.0,,1001.3,,


In [6]:
df.describe()

Unnamed: 0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
count,116978.0,116977.0,116977.0,13625.0,0.0,116887.0,116973.0,12339.0,115412.0,30.0,16926.0
mean,10.478574,7.674395,83.97791,0.104778,,190.760829,17.042215,26.050118,1015.157834,0.0,4.295522
std,5.350079,4.853305,12.993031,0.359381,,90.850028,8.586724,12.3298,11.192936,0.0,3.013802
min,-9.4,-12.3,15.0,0.0,,1.0,0.0,3.7,960.2,0.0,1.0
25%,6.6,4.5,77.0,0.0,,110.0,11.2,16.7,1008.6,0.0,2.0
50%,10.6,7.7,87.0,0.0,,210.0,16.6,24.1,1016.1,0.0,3.0
75%,14.6,11.6,93.0,0.0,,260.0,22.3,33.3,1022.7,0.0,7.0
max,31.6,22.7,100.0,7.5,,360.0,79.2,76.0,1048.9,0.0,25.0


In [7]:
# rows where tsun is not null
df[df['tsun'].notna()].head(3)

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco
106590,2018-11-28 06:00:00,10.8,10.3,97.0,0.4,,180.0,22.2,35.2,1004.0,0.0,7.0
106614,2018-11-29 06:00:00,12.0,10.6,91.0,0.3,,172.0,33.3,50.0,996.2,0.0,8.0
106662,2018-12-01 06:00:00,8.7,7.5,92.0,1.2,,154.0,18.5,31.5,1003.3,0.0,8.0


About the columns:

- ``temp``: Temperature (°C)
- ``dwpt``: Dew Point (°C)
- ``rhum``: Relative Humidity (%)
- ``prcp``: The one hour precipitation total (mm)
- ``snow``: Snow depth (mm)
- ``wdir``: Wind direction (°)
- ``wspd``: Wind Speed (km/h)
- ``wpgt``: Peak Gust (km/h)
- ``pres``: Air pressure (hPa)
- ``tsun``: Sunshine Duration (m)
- ``coco``: Weather condition Code

### Weather Condition Codes

<center>

| Code | Weather Condition |
| ---- | ----------------- |
1  |	Clear
2  |	Fair
3  |	Cloudy
4  |	Overcast
5  |	Fog
6  |	Freezing Fog
7  |	Light Rain
8  |	Rain
9  |	Heavy Rain
10 | 	Freezing Rain
11 | 	Heavy Freezing Rain
12 | 	Sleet
13 | 	Heavy Sleet
14 | 	Light Snowfall
15 | 	Snowfall
16 | 	Heavy Snowfall
17 | 	Rain Shower
18 | 	Heavy Rain Shower
19 | 	Sleet Shower
20 | 	Heavy Sleet Shower
21 | 	Snow Shower
22 | 	Heavy Snow Shower
23 | 	Lightning
24 | 	Hail
25 | 	Thunderstorm
26 | 	Heavy Thunderstorm
27 | 	Storm

</center>

In [8]:
df.isnull().sum()

time         0
temp        95
dwpt        96
rhum        96
prcp    103448
snow    117073
wdir       186
wspd       100
wpgt    104734
pres      1661
tsun    117043
coco    100147
dtype: int64

Lots of null values, they must be addressed.

In [9]:
# tsun is always null or zero, so drop it
df.drop(['tsun'], axis=1, inplace=True)

In [10]:
# when precipitation is null, it is 0, so we can replace NaN with 0
df['prcp'].fillna(0, inplace=True)
df['snow'].fillna(0, inplace=True)

In [11]:
# interpolate missing values, when it is admisible
df['temp'] = df['temp'].interpolate(method='linear', limit_direction='forward', axis=0)
df['dwpt'] = df['dwpt'].interpolate(method='linear', limit_direction='forward', axis=0)
df['rhum'] = df['rhum'].interpolate(method='linear', limit_direction='forward', axis=0)
df['wdir'] = df['wdir'].interpolate(method='linear', limit_direction='forward', axis=0)
df['wspd'] = df['wspd'].interpolate(method='linear', limit_direction='forward', axis=0)
df['pres'] = df['pres'].interpolate(method='linear', limit_direction='forward', axis=0)

In [12]:
df.isnull().sum()

time         0
temp         0
dwpt         0
rhum         0
prcp         0
snow         0
wdir         0
wspd         0
wpgt    104734
pres         0
coco    100147
dtype: int64

What we did was interpolate the values in the columns where there aren't too many values missing. ``Peak Gust`` and ``Weather Condition Code`` have too many missing values, but we'll leave as is, because they may be important. Lastly, ``Sunshine Duration`` is unnecessary and was droped.

We are going to check now, if there are missing rows.

In [13]:
# numpy array of dates
dates = np.array(df['time'])

# numpy array of a range of dates hourly from start to end
dates_range = np.arange(df['time'].min(), df['time'].max(), dtype='datetime64[h]')
dates_range[:10], dates_range[-10:]


(array(['2006-10-01T00', '2006-10-01T01', '2006-10-01T02', '2006-10-01T03',
        '2006-10-01T04', '2006-10-01T05', '2006-10-01T06', '2006-10-01T07',
        '2006-10-01T08', '2006-10-01T09'], dtype='datetime64[h]'),
 array(['2020-02-07T14', '2020-02-07T15', '2020-02-07T16', '2020-02-07T17',
        '2020-02-07T18', '2020-02-07T19', '2020-02-07T20', '2020-02-07T21',
        '2020-02-07T22', '2020-02-07T23'], dtype='datetime64[h]'))

In [14]:
# dates in dates_range that are not in dates
dates_range[~np.in1d(dates_range, dates)]

array([], dtype='datetime64[h]')

In [15]:
# export to csv
df.to_csv('Bath_hourly_cleaned.csv')

In [36]:
from tqdm import tqdm

df_copy = pd.read_csv('Bath_hourly_cleaned.csv', parse_dates=['time'])

# iterrows with progress bar
for index in tqdm(range(df_copy.shape[0])):
    row = df_copy.iloc[index]

    # add 30 minutes to the current time
    time = row['time'] + pd.Timedelta(minutes=30)
    # create a new row with only the time
    # the rest of the columns in the line will be NaN
    line = pd.Series({'time': time})
    # append the new row to the end of the dataframe
    df_copy = df_copy.append(line, ignore_index=True)
    
# sort the dataframe by time
df_copy.sort_values(by='time', inplace=True)
# reset the index
df_copy.reset_index(inplace=True, drop=True)

df_copy.head()

100%|██████████| 117073/117073 [1:05:22<00:00, 29.84it/s]


Unnamed: 0.1,Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco
0,0,2006-10-01 00:00:00,15.1,14.1,94.0,0.0,0.0,170.0,13.0,,1001.9,
1,NaT,2006-10-01 00:30:00,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,1,2006-10-01 01:00:00,14.8,13.5,92.0,0.0,0.0,180.0,14.8,,1001.7,
3,,2006-10-01 01:30:00,,,,,,,,,,
4,2,2006-10-01 02:00:00,14.7,13.2,91.0,0.0,0.0,180.0,13.0,,1001.7,


In [38]:
df_copy.drop(['Unnamed: 0'], axis=1, inplace=True)
df_copy.head()

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco
0,2006-10-01 00:00:00,15.1,14.1,94.0,0.0,0.0,170.0,13.0,,1001.9,
1,2006-10-01 00:30:00,NaT,NaT,NaT,0.0,0.0,NaT,NaT,NaT,NaT,NaT
2,2006-10-01 01:00:00,14.8,13.5,92.0,0.0,0.0,180.0,14.8,,1001.7,
3,2006-10-01 01:30:00,,,,0.0,0.0,,,,,
4,2006-10-01 02:00:00,14.7,13.2,91.0,0.0,0.0,180.0,13.0,,1001.7,


In [43]:
# fill NaT with NaN
df_copy['temp'] = df_copy['temp'].fillna(np.nan)
df_copy['dwpt'] = df_copy['dwpt'].fillna(np.nan)
df_copy['rhum'] = df_copy['rhum'].fillna(np.nan)
df_copy['wdir'] = df_copy['wdir'].fillna(np.nan)
df_copy['wspd'] = df_copy['wspd'].fillna(np.nan)
df_copy['pres'] = df_copy['wpgt'].fillna(np.nan)

df_copy.head()

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco
0,2006-10-01 00:00:00,15.1,14.1,94.0,0.0,0.0,170.0,13.0,,1001.9,
1,2006-10-01 00:30:00,,,,0.0,0.0,,,,,NaT
2,2006-10-01 01:00:00,14.8,13.5,92.0,0.0,0.0,180.0,14.8,,1001.7,
3,2006-10-01 01:30:00,,,,0.0,0.0,,,,,
4,2006-10-01 02:00:00,14.7,13.2,91.0,0.0,0.0,180.0,13.0,,1001.7,


In [44]:

# interpolate the missing values
df_copy['temp'] = df_copy['temp'].interpolate(method='linear', limit_direction='forward', axis=0)
df_copy['dwpt'] = df_copy['dwpt'].interpolate(method='linear', limit_direction='forward', axis=0)
df_copy['rhum'] = df_copy['rhum'].interpolate(method='linear', limit_direction='forward', axis=0)
df_copy['wdir'] = df_copy['wdir'].interpolate(method='linear', limit_direction='forward', axis=0)
df_copy['wspd'] = df_copy['wspd'].interpolate(method='linear', limit_direction='forward', axis=0)
df_copy['pres'] = df_copy['pres'].interpolate(method='linear', limit_direction='forward', axis=0)

# fill the missing values with 0
df_copy['prcp'].fillna(0, inplace=True)
df_copy['snow'].fillna(0, inplace=True)

df_copy.head()

Unnamed: 0,time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,coco
0,2006-10-01 00:00:00,15.1,14.1,94.0,0.0,0.0,170.0,13.0,,1001.9,
1,2006-10-01 00:30:00,14.95,13.8,93.0,0.0,0.0,175.0,13.9,,1001.8,NaT
2,2006-10-01 01:00:00,14.8,13.5,92.0,0.0,0.0,180.0,14.8,,1001.7,
3,2006-10-01 01:30:00,14.75,13.35,91.5,0.0,0.0,180.0,13.9,,1001.7,
4,2006-10-01 02:00:00,14.7,13.2,91.0,0.0,0.0,180.0,13.0,,1001.7,


In [45]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234146 entries, 0 to 234145
Data columns (total 11 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   time    234146 non-null  datetime64[ns]
 1   temp    234146 non-null  float64       
 2   dwpt    234146 non-null  float64       
 3   rhum    234146 non-null  float64       
 4   prcp    234146 non-null  float64       
 5   snow    234146 non-null  float64       
 6   wdir    234146 non-null  float64       
 7   wspd    234146 non-null  float64       
 8   wpgt    12339 non-null   float64       
 9   pres    234146 non-null  float64       
 10  coco    16926 non-null   object        
dtypes: datetime64[ns](1), float64(9), object(1)
memory usage: 19.7+ MB


In [46]:
df_copy.describe()

Unnamed: 0,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres
count,234146.0,234146.0,234146.0,234146.0,234146.0,234146.0,234146.0,12339.0,234146.0
mean,10.478138,7.673656,83.976062,0.006097,0.0,190.729026,17.041686,26.050118,1015.13083
std,5.341295,4.844895,12.849243,0.090093,0.0,88.848756,8.463624,12.3298,11.165056
min,-9.4,-12.3,15.0,0.0,0.0,1.0,0.0,3.7,960.2
25%,6.6,4.35,77.0,0.0,0.0,120.0,11.2,16.7,1008.5
50%,10.6,7.8,87.0,0.0,0.0,210.0,15.75,24.1,1016.1
75%,14.6,11.6,93.5,0.0,0.0,260.0,22.3,33.3,1022.65
max,31.6,22.7,100.0,7.5,0.0,360.0,79.2,76.0,1048.9


In [47]:
df_copy.isnull().sum()

time         0
temp         0
dwpt         0
rhum         0
prcp         0
snow         0
wdir         0
wspd         0
wpgt    221807
pres         0
coco    217220
dtype: int64

In [48]:
df_copy.to_csv('Bath_hourly_cleaned_30min.csv')

### Summary

- Download data using Meteostat's api.
    - Location is Bath, UK, with coordinates ``51.3751,-2.3617, 96``
- Missing data is interpolated, except Peak Gust and Weather Condition, which have too many missing values
- Data is hourly, added 30min. intervals with interpolated values.