# Feature Engineering

### Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date, time 

In [2]:
# Options
pd.set_option('display.max_columns', None)

### Import Data

In [3]:
## For train-data
outputfile = "02_pp_sg_train_features.csv"
df = pd.read_csv("./01_pp_sg_train_cleaned.csv", sep=";")
df.head()

Unnamed: 0,P24,P44,P42,P33,P23,P25,P21,P31,P53,P32,P22,P52,P51,P43,datetime,ferien,feiertag,covid_19,olma_offa,temperature_2m_max,temperature_2m_min,rain_sum,snowfall_sum
0,206.0,253.0,0.0,89.0,127.0,198.0,221.0,130.0,0.0,63.0,74.0,0.0,57.0,32.0,02.10.2019 07:25,1,0,0,0,14.701,6.601,16.9,0.0
1,87.0,87.0,0.0,4.0,24.0,173.0,69.0,25.0,0.0,26.0,0.0,0.0,13.0,5.0,04.10.2019 15:28,1,0,0,0,12.301,3.551,8.0,0.0
2,99.0,106.0,0.0,3.0,38.0,175.0,68.0,21.0,0.0,27.0,3.0,0.0,14.0,11.0,04.10.2019 15:43,1,0,0,0,12.301,3.551,8.0,0.0
3,105.0,109.0,0.0,3.0,37.0,178.0,78.0,22.0,0.0,30.0,1.0,0.0,18.0,15.0,04.10.2019 15:58,1,0,0,0,12.301,3.551,8.0,0.0
4,104.0,133.0,0.0,8.0,44.0,183.0,91.0,22.0,0.0,32.0,1.0,0.0,24.0,21.0,04.10.2019 16:13,1,0,0,0,12.301,3.551,8.0,0.0


In [4]:
## For test-data
#outputfile = "pp_sg_test_features.csv"
#df = pd.read_csv("./pp_sg_test_cleaned.csv", sep=";")
#df.head()

### Extract Time Components

In [5]:
# Make Object to datetime
df['datetime'] = pd.to_datetime(df['datetime'], format='%d.%m.%Y %H:%M')

# Extract Date
df['date'] = df['datetime'].dt.date

# Extract Year
df['year'] = df['datetime'].dt.year

# Extract Month
df['month'] = df['datetime'].dt.month

# Extract Day
df['day'] = df['datetime'].dt.day

# Extract Weekday
df['weekdayname'] = df['datetime'].dt.day_name()
df['weekday'] = df['datetime'].dt.dayofweek

# Extract Time
df['time'] = df['datetime'].dt.strftime('%H:%M')

# Extract Hour
df['hour'] = df['datetime'].dt.hour

# Extract Minute
df['minute'] = df['datetime'].dt.minute

In [6]:
df.head()

Unnamed: 0,P24,P44,P42,P33,P23,P25,P21,P31,P53,P32,P22,P52,P51,P43,datetime,ferien,feiertag,covid_19,olma_offa,temperature_2m_max,temperature_2m_min,rain_sum,snowfall_sum,date,year,month,day,weekdayname,weekday,time,hour,minute
0,206.0,253.0,0.0,89.0,127.0,198.0,221.0,130.0,0.0,63.0,74.0,0.0,57.0,32.0,2019-10-02 07:25:00,1,0,0,0,14.701,6.601,16.9,0.0,2019-10-02,2019,10,2,Wednesday,2,07:25,7,25
1,87.0,87.0,0.0,4.0,24.0,173.0,69.0,25.0,0.0,26.0,0.0,0.0,13.0,5.0,2019-10-04 15:28:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,15:28,15,28
2,99.0,106.0,0.0,3.0,38.0,175.0,68.0,21.0,0.0,27.0,3.0,0.0,14.0,11.0,2019-10-04 15:43:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,15:43,15,43
3,105.0,109.0,0.0,3.0,37.0,178.0,78.0,22.0,0.0,30.0,1.0,0.0,18.0,15.0,2019-10-04 15:58:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,15:58,15,58
4,104.0,133.0,0.0,8.0,44.0,183.0,91.0,22.0,0.0,32.0,1.0,0.0,24.0,21.0,2019-10-04 16:13:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,16:13,16,13


### Decompose Time-Features in sine and cosine component

In [7]:
# Inspired by https://medium.com/mlearning-ai/transformer-implementation-for-time-series-forecasting-a9db2db5c820 
# (vgl. https://github.com/nok-halfspace/Transformer-Time-Series-Forecasting/blob/main/Preprocessing.py) 

minutes_in_hour = 60
hours_in_day = 24
days_in_week = 7
days_in_month = 30
month_in_year = 12


df['sin_minute'] = np.sin(2*np.pi*df['minute']/minutes_in_hour)
df['cos_minute'] = np.cos(2*np.pi*df['minute']/minutes_in_hour)
df['sin_hour'] = np.sin(2*np.pi*df['hour']/hours_in_day)
df['cos_hour'] = np.cos(2*np.pi*df['hour']/hours_in_day)
df['sin_weekday'] = np.sin(2*np.pi*df['weekday']/days_in_week)
df['cos_weekday'] = np.cos(2*np.pi*df['weekday']/days_in_week)
df['sin_day'] = np.sin(2*np.pi*df['day']/days_in_month)
df['cos_day'] = np.cos(2*np.pi*df['day']/days_in_month)
df['sin_month'] = np.sin(2*np.pi*df['month']/month_in_year)
df['cos_month'] = np.cos(2*np.pi*df['month']/month_in_year)

In [8]:
df.head()

Unnamed: 0,P24,P44,P42,P33,P23,P25,P21,P31,P53,P32,P22,P52,P51,P43,datetime,ferien,feiertag,covid_19,olma_offa,temperature_2m_max,temperature_2m_min,rain_sum,snowfall_sum,date,year,month,day,weekdayname,weekday,time,hour,minute,sin_minute,cos_minute,sin_hour,cos_hour,sin_weekday,cos_weekday,sin_day,cos_day,sin_month,cos_month
0,206.0,253.0,0.0,89.0,127.0,198.0,221.0,130.0,0.0,63.0,74.0,0.0,57.0,32.0,2019-10-02 07:25:00,1,0,0,0,14.701,6.601,16.9,0.0,2019-10-02,2019,10,2,Wednesday,2,07:25,7,25,0.5,-0.866025,0.965926,-0.258819,0.974928,-0.222521,0.406737,0.913545,-0.866025,0.5
1,87.0,87.0,0.0,4.0,24.0,173.0,69.0,25.0,0.0,26.0,0.0,0.0,13.0,5.0,2019-10-04 15:28:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,15:28,15,28,0.207912,-0.978148,-0.707107,-0.707107,-0.433884,-0.900969,0.743145,0.669131,-0.866025,0.5
2,99.0,106.0,0.0,3.0,38.0,175.0,68.0,21.0,0.0,27.0,3.0,0.0,14.0,11.0,2019-10-04 15:43:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,15:43,15,43,-0.978148,-0.207912,-0.707107,-0.707107,-0.433884,-0.900969,0.743145,0.669131,-0.866025,0.5
3,105.0,109.0,0.0,3.0,37.0,178.0,78.0,22.0,0.0,30.0,1.0,0.0,18.0,15.0,2019-10-04 15:58:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,15:58,15,58,-0.207912,0.978148,-0.707107,-0.707107,-0.433884,-0.900969,0.743145,0.669131,-0.866025,0.5
4,104.0,133.0,0.0,8.0,44.0,183.0,91.0,22.0,0.0,32.0,1.0,0.0,24.0,21.0,2019-10-04 16:13:00,1,0,0,0,12.301,3.551,8.0,0.0,2019-10-04,2019,10,4,Friday,4,16:13,16,13,0.978148,0.207912,-0.866025,-0.5,-0.433884,-0.900969,0.743145,0.669131,-0.866025,0.5


### Write CSV

In [9]:
df.to_csv(outputfile, sep=";", index=False)