In [1]:
import pandas as pd

from datetime import datetime, timedelta
from meteostat import Point, Daily, Hourly, Stations
from typing import Tuple, List

In [2]:
def find_nearby_weather_stations(amount: int = 1) -> pd.DataFrame:
    """Look up the weather stations closest to each traffic post.

    For every post the ``amount`` nearest stations are fetched with
    ``Stations().nearby(latitude, longitude).fetch(amount)`` and the rows are
    tagged with the post label in an extra ``post`` column.

    Args:
        amount: Number of nearest stations to fetch per post.

    Returns:
        A single DataFrame holding the station rows of all posts, stacked in
        the order p1, p3, p35.
    """
    # (latitude, longitude, label) of each post.
    posts: List[Tuple[float, float, str]] = [
        (40.6347, -8.66038, 'p1'),    # P1 - Instituto de Telecomunicações
        (40.64074, -8.65705, 'p3'),   # P3 - Ponte Dobadoura
        (40.63028, -8.65423, 'p35'),  # P35 - ISCA - UA
    ]

    # Collect the per-post frames and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies all previous rows on
    # every iteration and starts from an empty frame, which pandas deprecates.
    frames: List[pd.DataFrame] = []
    for latitude, longitude, label in posts:
        nearest: pd.DataFrame = Stations().nearby(latitude, longitude).fetch(amount)
        frames.append(nearest.assign(post=label))
    return pd.concat(frames)

In [3]:
def round_date(dt: datetime) -> datetime:
    """Round ``dt`` to the nearest full hour.

    Minutes 0-29 round down to the start of the current hour; minutes 30-59
    round up to the start of the next hour. Seconds and microseconds are
    discarded and do not influence the direction of rounding.
    """
    start_of_hour = dt.replace(minute=0, second=0, microsecond=0)
    # 0 for the first half of the hour, 1 for the second half.
    hours_to_add = 1 if dt.minute >= 30 else 0
    return start_of_hour + timedelta(hours=hours_to_add)

In [4]:
def add_weather_features(data: pd.DataFrame) -> pd.DataFrame:
    """Attach hourly weather observations to every row of ``data``.

    Hourly weather for a fixed point (40.64074, -8.65705) is fetched for the
    time span covered by ``data["time_index"]``. Each row is then matched to
    the observation of its nearest full hour (see ``round_date``).

    Args:
        data: Frame with a ``time_index`` column of timestamps. The frame is
            modified in place.

    Returns:
        The same ``data`` object with three added columns: ``temperature``
        (from ``temp``), ``humidity`` (from ``rhum``) and ``precipitation``
        (from ``prcp``).

    Raises:
        KeyError: If the fetched weather data has no row for one of the
            rounded timestamps.
    """
    plant: Point = Point(40.64074, -8.65705)
    start: datetime = round_date(min(data["time_index"]))
    end: datetime = round_date(max(data["time_index"]))
    weather_data: pd.DataFrame = Hourly(plant, start, end).fetch()

    # Round every timestamp once and reuse the result for all features,
    # instead of re-rounding and doing a scalar lookup per row and per column.
    rounded_hours: pd.Series = data["time_index"].apply(round_date)

    # Target column in `data` -> source column in the weather frame.
    # Wind speed (`wspd`) was considered but is currently not included.
    feature_sources = (
        ("temperature", "temp"),
        ("humidity", "rhum"),
        ("precipitation", "prcp"),
    )
    for feature, weather_column in feature_sources:
        # Batched label lookup; to_numpy() drops the weather index so the
        # values are assigned positionally, row by row.
        data[feature] = weather_data[weather_column].loc[rounded_hours].to_numpy()

    return data

In [5]:
def add_time_features(data: pd.DataFrame) -> pd.DataFrame:
    """Derive calendar features from the ``time_index`` column.

    Adds ``weekday``, ``hour`` and ``minute`` columns (in that order), each
    taken from the matching ``.dt`` accessor attribute of ``time_index``.
    The frame is modified in place and returned.
    """
    timestamps = data["time_index"].dt
    derived = (
        ("weekday", timestamps.weekday),
        ("hour", timestamps.hour),
        ("minute", timestamps.minute),
    )
    for column, values in derived:
        data[column] = values
    return data

In [6]:
# Check whether the nearest weather station is the same for all posts.
# With the default amount=1, the result has one station row per post,
# labelled in the `post` column.
find_nearby_weather_stations()

Unnamed: 0_level_0,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,distance,post
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
8547,Aveiro / São Jacinto,PT,AV,8547,LPAV,40.65,-8.7333,19.0,Europe/Lisbon,NaT,NaT,NaT,NaT,NaT,NaT,6383.408476,p1
8547,Aveiro / São Jacinto,PT,AV,8547,LPAV,40.65,-8.7333,19.0,Europe/Lisbon,NaT,NaT,NaT,NaT,NaT,NaT,6515.076682,p3
8547,Aveiro / São Jacinto,PT,AV,8547,LPAV,40.65,-8.7333,19.0,Europe/Lisbon,NaT,NaT,NaT,NaT,NaT,NaT,7022.74994,p35


In [7]:
# Load: read the preprocessed indicator dataset (parquet) from S3.
data: pd.DataFrame =  pd.read_parquet('s3://datalake-eu-central-1/ugiO-atchackathon/preprocessed/indicator.parquet')

In [8]:
# Preprocess: expose the index as columns, strip the timezone from
# `time_index`, add weather and calendar features, drop the raw
# vehicle/speed columns and index the result by `time_index` again.
columns_to_drop = [
    'vehiclelight', 'speedlight',
    'vehicleheavy', 'speedheavy',
    'vehicleothers', 'speedothers',
    'class_count',
]

data: pd.DataFrame = (
    data.reset_index()
    .assign(time_index=lambda frame: frame["time_index"].dt.tz_localize(None))
    .pipe(add_weather_features)
    .pipe(add_time_features)
    .drop(columns=columns_to_drop)
    .set_index('time_index')
)

In [9]:
# Verify: preview the first rows of the processed frame to confirm that the
# weather and time feature columns are present.
data.head()

Unnamed: 0_level_0,n_p,indicator,temperature,humidity,precipitation,weekday,hour,minute
time_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-09-01 00:00:00,p1,-0.352757,16.6,98.0,0.0,3,0,0
2022-09-01 00:15:00,p1,-0.352757,16.6,98.0,0.0,3,0,15
2022-09-01 00:30:00,p1,-0.352757,16.4,98.0,0.0,3,0,30
2022-09-01 00:45:00,p1,-0.352757,16.4,98.0,0.0,3,0,45
2022-09-01 01:00:00,p1,-0.352757,16.4,98.0,0.0,3,1,0


In [10]:
# Export: write the processed frame to S3 as a new parquet file
# (indicator_regression.parquet) next to the input file.
data.to_parquet('s3://datalake-eu-central-1/ugiO-atchackathon/preprocessed/indicator_regression.parquet')