In [1]:
import pandas as pd
import numpy as np

import warnings
# Installs a blanket "ignore" filter: warnings raised after this line
# (including pandas deprecation / chained-assignment warnings) are not shown.
# NOTE(review): consider narrowing this to specific warning categories so real
# problems are not hidden.
warnings.filterwarnings("ignore")

In [2]:
# Load the Skopje PM2.5 readings and prepare them in one pass:
#   1. parse the 'time' column and use it as the index (renamed to 'Date'),
#   2. aggregate to daily means,
#   3. keep rows from 2015-01-01 onwards and only the two stations of interest.
skopje_df = (
    pd.read_csv("./Data/Skopje/skopje_pm25.csv", index_col=0)
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index(['time'])
    .rename_axis("Date")
    .resample("D")
    .mean()
    .loc["2015-01-01":, ["Centar", "Karpos"]]
)

In [3]:
def impute(station_df, column="PM2.5", window_length=24):
    """Fill missing values with the value found ``window_length`` rows earlier.

    Rows are processed from first to last, so a value that was itself imputed
    can serve as the source for a later gap in the same column. A gap that has
    no row ``window_length`` positions before it is left missing; values are
    never taken from the end of the series.

    Parameters
    ----------
    station_df : pandas.DataFrame
        Frame holding the readings. ``column`` is overwritten in place.
    column : str, default "PM2.5"
        Name of the column to impute.
    window_length : int, default 24
        How many rows to look back for the replacement value. Must be >= 1.

    Returns
    -------
    pandas.DataFrame
        The same ``station_df`` object, with ``column`` imputed.

    Raises
    ------
    ValueError
        If ``window_length`` is smaller than 1.
    KeyError
        If ``column`` is not present in ``station_df``.
    """
    if window_length < 1:
        raise ValueError("window_length must be a positive integer")

    series = station_df[column]
    missing_positions = np.flatnonzero(series.isna().to_numpy())
    if missing_positions.size == 0:
        # Nothing to fill; leave the column (and its dtype) untouched.
        return station_df

    # Work on a positional NumPy copy: this avoids chained assignment on the
    # frame and label-vs-position ambiguity when the index is not a RangeIndex.
    values = series.to_numpy(copy=True)
    for position in missing_positions:
        source = position - window_length
        # A negative source would wrap around to the end of the array, so
        # gaps inside the first window are deliberately left as they are.
        if source >= 0:
            values[position] = values[source]

    station_df[column] = values
    return station_df

In [4]:
# Column labels of skopje_df, one per monitoring station; iterated over when
# the per-station dataset is built.
stations = skopje_df.columns

In [5]:
# Coordinates for each station, keyed by the station's column name.
# Each entry is listed as (name, (latitude, longitude)) and expanded into the
# nested {"latitude": ..., "longitude": ...} form.
lat_lng = {
    name: {"latitude": lat, "longitude": lng}
    for name, (lat, lng) in (
        ("Centar", (41.9954, 21.4246)),
        ("Karpos", (42.0030, 21.3978)),
    )
}

In [6]:
# Build one record per station: identifying metadata plus a single-column
# 'PM2.5' frame with gaps filled by impute().
dataset = []

for i, station in enumerate(stations):
    # Isolate this station's readings under a station-independent column name.
    df = skopje_df[[station]]
    df.columns = ['PM2.5']
    # NOTE(review): skopje_df holds daily means, so window_length=24 looks
    # back 24 rows (days) -- confirm this is the intended period.
    df = impute(df, column="PM2.5", window_length=24)

    # All stations handled here belong to the same city.
    city = "Skopje"

    coordinates = lat_lng[station]
    latitude = coordinates["latitude"]
    longitude = coordinates["longitude"]

    # Station ids are 1-based and zero-padded to three digits (SK001, SK002, ...).
    item = {
        "station_name": station,
        "station_id": f"SK{(i+1):03}",
        "city": city,
        "latitude": latitude,
        "longitude": longitude,
        "df": df,
    }
    dataset.append(item)

In [7]:
import pickle

# Persist the assembled dataset. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open("./Data/skopje_dataset.pkl", "wb") as dataset_file:
    pickle.dump(dataset, dataset_file, protocol=4)