In [12]:
import pandas as pd
import holidays

# Load the raw consumption/temperature data that every later cell builds on.
raw_csv_path = './data/consumption_temp.csv'
df = pd.read_csv(raw_csv_path)


In [13]:
### Some data cleaning...

# Remove Helsingfors as this is outside of Norway.
# .copy() turns the filtered result into an independent frame, so the column
# assignments below do not write into a slice of the loaded data
# (this is what triggers pandas' SettingWithCopyWarning).
df = df[df['location'] != 'helsingfors'].copy()
# Convert 'time' to datetime
df['time'] = pd.to_datetime(df['time'])


### Holiday binary encoding

# Extract date from 'time'
df['date'] = df['time'].dt.date

# Get Norwegian holidays for exactly the years present in the data.
# Without `years`, the calendar starts out empty and is only filled in when a
# specific date is looked up. Series.isin() merely iterates over the calendar,
# so it would see no dates at all and flag every row as a non-holiday.
years_in_data = sorted(df['time'].dt.year.dropna().astype(int).unique().tolist())
no_holidays = holidays.country_holidays('NO', years=years_in_data)

# Create a feature for whether or not the date is a holiday
df['is_holiday'] = df['date'].isin(list(no_holidays.keys()))


### Other time related features

# Extracting weekday feature (pandas convention: Monday=0 ... Sunday=6)
df['weekday'] = df['time'].dt.weekday

# Creating features for time of day
df['hour_of_day'] = df['time'].dt.hour


### Lag feature

# Lag feature for how the consumption was 5 days earlier.
# Within each (hour_of_day, location) group the rows are ordered by date, so
# shifting by 5 rows reaches back 5 days. The result is re-aligned to df by
# index on assignment.
# NOTE(review): this assumes one row per location and hour for every calendar
# day, with no gaps — otherwise 5 rows back is more than 5 days. TODO confirm.
df['consumption_lag_days_5'] = (
    df.sort_values('date')
      .groupby(['hour_of_day', 'location'])['consumption']
      .shift(5)
)


In [14]:
### Seasonality feature

# Assuming that:
# 1 represents spring (March, April, May),
# 2 represents summer (June, July, August),
# 3 represents fall (September, October, November),
# 4 represents winter (December, January, February)
# The codes are stored as strings, so 'season' is a categorical label rather
# than a numeric value.
seasons = {1: '4', 2: '4', 3: '1', 4: '1', 5: '1', 6: '2', 7: '2', 8: '2', 9: '3', 10: '3', 11: '3', 12: '4'}

# Map the month of each timestamp to its season code
df['season'] = df['time'].dt.month.map(seasons)


### Temperature difference from the previous day at the same time

# If direction of change matters, we should keep neg/pos values (this is usually
# the case when working with data involving temperature differences)
# If only magnitude of change matters, we should take the absolute value
df['temperature_diff_prev_day'] = (
    df.sort_values('date')
      .groupby(['hour_of_day', 'location'])['temperature']
      .diff()
)


### Moving averages for consumption over the past week

# Each row in the data is one hour, and each location has 24 rows per day, so
# a rolling window of 168 rows (24*7) covers one week of consumption for that
# location. transform() keeps the original row index, so the result lines up
# with df on assignment.
# NOTE(review): the windows follow row order within each location, so this
# assumes the rows are already in chronological order — TODO confirm.
# NOTE(review): each window (and the exponential average below) includes the
# current row's own consumption, which is also the prediction target in the
# modelling cell — confirm this is intended.
HOURS_PER_WEEK = 24 * 7
df['consumption_ma_week'] = df.groupby('location')['consumption'].transform(
    lambda x: x.rolling(HOURS_PER_WEEK).mean()
)


### Rolling window statistics for consumption over the past week

df['consumption_max_week'] = df.groupby('location')['consumption'].transform(
    lambda x: x.rolling(HOURS_PER_WEEK).max()
)
df['consumption_min_week'] = df.groupby('location')['consumption'].transform(
    lambda x: x.rolling(HOURS_PER_WEEK).min()
)


### Exponential smoothing for consumption

# x.ewm(alpha=0.5).mean() calculates the exponentially weighted moving average
# of consumption for each location. alpha=0.5 is the smoothing factor: a large
# alpha makes the average more responsive to recent values, while a small
# alpha gives more weight to older values. We can play around with this value.
# transform() is used (as for the rolling features above) because it always
# returns a result indexed like df. The previous apply(...).reset_index(level=0)
# relied on apply() prepending the group key to the index, which depends on
# the pandas version and could otherwise discard the real row index and
# misalign the feature.
df['consumption_ewm_alpha_0.5'] = df.groupby('location')['consumption'].transform(
    lambda x: x.ewm(alpha=0.5).mean()
)

# Save results to csv
df.to_csv('consumption_temp_w_features.csv', index=False)


In [15]:
# Prepare the modelling inputs: 'consumption' is the target, and every other
# column of df is kept as a candidate feature.
# Scaling, fitting the model, etc. would follow from here.
y = df['consumption']
X = df.drop(columns='consumption')
# ... and so on
