In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Measurement table: later cells read its 'Station code', 'Measurement date'
# and 'PM2.5' columns.
seoul_pm25_df = pd.read_csv(filepath_or_buffer="./Data/Seoul/seoul_summary.csv")
# Station metadata table: later cells filter it on 'Station code' and unpack
# the remaining columns positionally.
seoul_stations_df = pd.read_csv(filepath_or_buffer="./Data/Seoul/seoul_station_info.csv")

In [3]:
# Distinct station codes, in order of first appearance in the metadata table.
station_codes = pd.unique(seoul_stations_df['Station code'])

In [4]:
# Accumulator for the per-station records appended by the processing loop.
dataset = list()

In [5]:
def impute(station_df, column="PM2.5", window_length=24):
    """Fill missing values with the value observed ``window_length`` rows earlier.

    Rows are processed in order, so a value that was itself imputed can be
    reused to fill a later gap exactly ``window_length`` rows further on.

    Parameters
    ----------
    station_df : pandas.DataFrame
        Frame containing ``column``. It is modified in place and also returned.
    column : str, default "PM2.5"
        Name of the numeric column to impute.
    window_length : int, default 24
        Number of rows to look back for the replacement value. Must be >= 1.

    Returns
    -------
    pandas.DataFrame
        ``station_df`` with gaps in ``column`` filled where possible. Gaps in
        the first ``window_length`` rows have no earlier reference and stay
        NaN, as do gaps whose reference value is itself NaN.

    Raises
    ------
    ValueError
        If ``window_length`` is smaller than 1.
    KeyError
        If ``column`` is not present in ``station_df``.
    """
    if window_length < 1:
        raise ValueError("window_length must be a positive integer")

    # Work on a plain positional array: this avoids chained-indexing assignment
    # (which does not write through under copy-on-write) and label-vs-position
    # ambiguity on frames with a non-integer index.
    values = station_df[column].to_numpy(dtype=float, copy=True)

    # Start at window_length so the look-back index is never negative; a
    # negative index would wrap around and copy data from the end of the series.
    for idx in range(window_length, len(values)):
        if np.isnan(values[idx]):
            values[idx] = values[idx - window_length]

    station_df[column] = values
    return station_df

In [6]:
# Stations that are deliberately left out of the exported dataset.
EXCLUDED_STATION_CODES = {106, 107, 109, 110}
# Every record produced by this loop carries the same city label.
city = "Seoul"

for station_code in station_codes:
    # Skip excluded stations up front so no time is spent resampling them.
    if station_code in EXCLUDED_STATION_CODES:
        continue

    # First metadata row for this station. Column 0 is skipped and the next
    # four are unpacked positionally (presumably name, address, latitude,
    # longitude -- verify against seoul_station_info.csv).
    station_row = seoul_stations_df[seoul_stations_df['Station code'] == station_code].values[0]
    station_name, _, latitude, longitude = station_row[1:]

    # Take an explicit copy so the datetime conversion below writes to an
    # independent frame rather than to a filtered slice of seoul_pm25_df.
    df = seoul_pm25_df[seoul_pm25_df['Station code'] == station_code].copy()
    df['Measurement date'] = pd.to_datetime(df['Measurement date'])
    # Put the readings on a regular hourly grid (missing hours become NaN).
    # An offset object is used instead of the 'H' string alias, which newer
    # pandas releases deprecate.
    df = df.set_index(['Measurement date']).asfreq(pd.offsets.Hour())
    df.index.names = ['Date']
    df = df[['PM2.5']]
    # Daily mean. resample() already yields one row per calendar day between
    # the first and last timestamp, so no further asfreq('D') is required.
    df = df.resample('D').mean()

    # Fill missing days with the value from 90 days earlier.
    df = impute(df, column="PM2.5", window_length=90)

    # Drop stations with any negative daily value (NaN compares False here,
    # so remaining gaps do not trigger the skip).
    if (df.values < 0).any():
        continue

    dataset.append({
        "station_name": station_name,
        "station_id": f"SL{station_code}",
        "city": city,
        "latitude": latitude,
        "longitude": longitude,
        "df": df,
    })

In [7]:
# Keep only the first MAX_STATIONS records that survived the filtering above.
MAX_STATIONS = 8
dataset = dataset[:MAX_STATIONS]

In [8]:
import pickle

# Persist the list of station records. The context manager guarantees the file
# handle is flushed and closed even if serialisation fails part-way.
with open("./Data/seoul_dataset.pkl", "wb") as pkl_file:
    pickle.dump(dataset, pkl_file, protocol=4)