In [1]:
import pandas as pd
import numpy as np

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation and
# chained-assignment warnings raised by later cells -- consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw measurements table. Later cells read its 'location',
# 'coordinates', 'parameter', 'date' and 'value' columns.
ulaanbaatar_df = pd.read_csv("./Data/Mongolia/ulaanbaatar.csv")

In [3]:
# Distinct values of the 'location' column, in order of first appearance.
station_names = ulaanbaatar_df['location'].unique()

In [4]:
def convert_string_to_datetime(dt_string):
    """Parse the leading timestamp field of a raw date string.

    Only the text before the first comma is considered, and its first five
    characters are discarded before parsing (presumably a ``{utc=``-style
    prefix -- TODO confirm against the CSV contents).

    Parameters
    ----------
    dt_string : str
        Raw string value, e.g. an entry of the ``date`` column.

    Returns
    -------
    The result of ``pd.to_datetime`` applied to the remaining text.
    """
    leading_field, _, _ = dt_string.partition(",")
    return pd.to_datetime(leading_field[5:])

In [5]:
# Build one [name, latitude, longitude] entry per station.
stations = []

for station_name in station_names:
    # Look up the coordinates of *this* station. Indexing with
    # station_names[0] here would assign the first station's position to
    # every station in the list.
    coordinates = ulaanbaatar_df.loc[
        ulaanbaatar_df['location'] == station_name, 'coordinates'
    ].iloc[0]

    # The coordinates string is split on ", " into two "key=value" parts.
    # One leading character is stripped from the first part and one trailing
    # character from the second (presumably the enclosing braces -- TODO
    # confirm against the CSV); the number is the text after the last "=".
    latitude, longitude = coordinates.split(", ")
    latitude = float(latitude[1:].split("=")[-1])
    longitude = float(longitude[:-1].split("=")[-1])

    stations.append([station_name, latitude, longitude])

In [6]:
def impute(station_df, column="PM2.5", window_length=24):
    """Fill gaps in ``column`` from the value one window earlier, then by the mean.

    A missing entry at position ``p`` is replaced by the entry at position
    ``p - window_length``; that source entry may itself have been filled
    earlier in the same pass. Entries with no in-range source, or whose
    source is also missing, are left untouched by this step. Finally every
    NaN still present anywhere in the frame is replaced by its column mean.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    station_df : pandas.DataFrame
        Frame containing ``column``. The lag is positional, so rows are
        assumed to be evenly spaced -- TODO confirm at each call site.
    column : str, default "PM2.5"
        Name of the numeric column to fill by lag.
    window_length : int, default 24
        Number of rows to look back for a replacement value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``station_df`` with the gaps filled as described above.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``station_df``.
    """
    imputed = station_df.copy()
    values = imputed[column].to_numpy(dtype=float, copy=True)
    gap_positions = np.flatnonzero(np.isnan(values))

    if gap_positions.size:
        n_rows = len(values)
        # Walk the gaps in ascending order so that a value filled earlier in
        # the pass can serve as the source for a later gap.
        for pos in gap_positions:
            source = pos - window_length
            # Only use a source that really exists: a negative index would
            # silently wrap around and pull a value from the end of the series.
            if 0 <= source < n_rows:
                values[pos] = values[source]
        # Whole-column assignment on the copy; element-wise chained indexing
        # (frame[col][i] = ...) is not guaranteed to write through.
        imputed[column] = values

    return imputed.fillna(imputed.mean())

In [7]:
# Positions (within `stations`) that are left out of the dataset.
# NOTE(review): the reason these two are excluded is not recorded here -- confirm.
EXCLUDED_STATION_POSITIONS = {6, 10}

city = "Ulaanbaatar"

# Filter to the "pm25" parameter once instead of re-scanning the full frame
# for every station.
pm25_rows = ulaanbaatar_df[ulaanbaatar_df['parameter'] == "pm25"]

dataset = []

for i, (station_name, latitude, longitude) in enumerate(stations):
    # Decide on exclusion first so no parsing/resampling/imputation work is
    # spent on a station that is dropped anyway.
    if i in EXCLUDED_STATION_POSITIONS:
        continue

    station_rows = pm25_rows[pm25_rows['location'] == station_name]
    if station_rows.empty:
        continue

    df = (
        station_rows[['date', 'value']]
        .assign(date=lambda frame: frame['date'].map(convert_string_to_datetime))
        .set_index('date')
        .rename_axis('Date')
        .rename(columns={'value': 'PM2.5'})
        .sort_index()
        # Daily mean on a complete daily grid; days without any reading
        # become NaN rows, so no separate asfreq() step is needed.
        .resample("D")
        .mean()
    )

    # Normalise to a timezone-naive datetime64[ns] index. If the parsed
    # timestamps carry a timezone, drop it explicitly first (tz_convert(None)
    # keeps the UTC wall time); newer pandas versions reject a direct astype()
    # from tz-aware to tz-naive.
    if df.index.tz is not None:
        df.index = df.index.tz_convert(None)
    df.index = df.index.astype('datetime64[ns]')

    # Fill gaps from the same position 365 rows (days) earlier, then by the mean.
    df = impute(df, column="PM2.5", window_length=365)

    dataset.append({
        "station_name": station_name,
        # The id is derived from the position in `stations`, so ids keep gaps
        # where stations were skipped.
        "station_id": f"UB{(i+1):03}",
        "city": city,
        "latitude": latitude,
        "longitude": longitude,
        "df": df,
    })

In [8]:
# Keep only the first eight station entries.
# NOTE(review): the cutoff of 8 is not explained here -- confirm why any
# remaining stations are dropped.
dataset = dataset[:8]

In [9]:
import pickle

# Persist the list of station dictionaries. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way through.
with open("./Data/ulaanbaatar_dataset.pkl", "wb") as dataset_file:
    pickle.dump(dataset, dataset_file, protocol=4)