In [1]:
import os
from os.path import expanduser
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
import shutil
from shapely import Point
from joblib import Parallel, delayed as delayed_joblib
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt

### Reference

In [2]:
# Reference copy of the processed table kept under NP_backup; loaded only
# to inspect its layout.
reference_csv_path = "/home/patel_zeel/STGNP/dataset/bjair/NP_backup/processed_raw.csv"
df = pd.read_csv(reference_csv_path)
df.head()

Unnamed: 0,time,station_id,v_speed,u_speed,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration,...,wind_direction_0.0,wind_direction_1.0,wind_direction_2.0,wind_direction_3.0,wind_direction_4.0,wind_direction_9.0,wind_direction_13.0,wind_direction_14.0,wind_direction_23.0,wind_direction_24.0
0,2014-05-01 00:00:00,1001,5.600286,-5.600286,138.0,159.4,56.3,0.9,50.8,17.2,...,0,0,0,0,0,0,1,0,0,0
1,2014-05-01 01:00:00,1001,5.345727,-5.345727,124.0,163.9,38.7,0.9,51.1,17.9,...,0,0,0,0,0,0,1,0,0,0
2,2014-05-01 02:00:00,1001,4.072935,-4.072935,127.0,148.4,55.6,1.0,27.2,16.6,...,0,0,0,0,0,0,1,0,0,0
3,2014-05-01 03:00:00,1001,4.327494,-4.327494,129.0,145.6,65.7,1.0,9.7,16.7,...,0,0,0,0,0,0,1,0,0,0
4,2014-05-01 04:00:00,1001,0.0,-4.68,119.0,119.3,66.9,1.0,2.0,16.5,...,0,1,0,0,0,0,0,0,0,0


In [3]:
# Reference station table from NP_backup (id, names, coordinates, district).
reference_stations_path = "/home/patel_zeel/STGNP/dataset/bjair/NP_backup/stations.csv"
station_df = pd.read_csv(reference_stations_path)
station_df.head()

Unnamed: 0,station_id,name_chinese,name_english,latitude,longitude,district_id
0,1001,海淀北部新区,HaiDianBeiBuXinQu,40.090679,116.173553,101
1,1002,海淀北京植物园,HaiDianBeiJingZhiWuYuan,40.00395,116.20531,101
2,1003,石景山古城,ShiJingShanGuCheng,39.914409,116.184239,102
3,1004,丰台云岗,FengTaiYunGang,39.815128,116.17115,103
4,1005,房山良乡,FangShanLiangXiang,39.742767,116.136045,104


In [4]:
# List every column of the reference table (the head() preview truncates them).
df.columns

Index(['time', 'station_id', 'v_speed', 'u_speed', 'PM25_Concentration',
       'PM10_Concentration', 'NO2_Concentration', 'CO_Concentration',
       'O3_Concentration', 'SO2_Concentration', 'latitude', 'longitude',
       'temperature', 'pressure', 'humidity', 'wind_speed', 'PM25_Missing',
       'PM10_Missing', 'NO2_Missing', 'CO_Missing', 'O3_Missing',
       'SO2_Missing', 'weather_0.0', 'weather_1.0', 'weather_2.0',
       'weather_3.0', 'weather_4.0', 'weather_5.0', 'weather_6.0',
       'weather_7.0', 'weather_8.0', 'weather_9.0', 'weather_11.0',
       'weather_12.0', 'weather_13.0', 'weather_14.0', 'weather_15.0',
       'weather_16.0', 'wind_direction_0.0', 'wind_direction_1.0',
       'wind_direction_2.0', 'wind_direction_3.0', 'wind_direction_4.0',
       'wind_direction_9.0', 'wind_direction_13.0', 'wind_direction_14.0',
       'wind_direction_23.0', 'wind_direction_24.0'],
      dtype='object')

In [5]:
# Reference metadata pickle; the displayed value is a dict with a single
# "cont_cols" entry.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
reference_metadata_path = "/home/patel_zeel/STGNP/dataset/bjair/NP_backup/meta_data.pkl"
metadata = pd.read_pickle(reference_metadata_path)
metadata

{'cont_cols': ['v_speed',
  'u_speed',
  'PM25_Concentration',
  'PM10_Concentration',
  'NO2_Concentration',
  'CO_Concentration',
  'O3_Concentration',
  'SO2_Concentration',
  'latitude',
  'longitude',
  'temperature',
  'pressure',
  'humidity',
  'wind_speed']}

In [6]:
# Reference array of test nodes stored next to the backup data.
# Presumably these are positional indices into the station table -- TODO confirm.
reference_test_nodes_path = "/home/patel_zeel/STGNP/dataset/bjair/NP_backup/test_nodes.npy"
locs = np.load(reference_test_nodes_path)
locs

array([ 0,  1,  3,  4,  7,  8, 13, 14, 18, 29, 31, 34])

### Prepare the PurpleAir data (`processed_raw.csv`)

In [8]:
# Open the PurpleAir "lov" dataset (plain string: there is nothing to interpolate).
pa_lov = xr.open_dataset("/opt/aqmsp_models/data/purpleair/lov/data.nc")

# Rename the variables/coordinates to the column names used by the reference
# processed_raw.csv ("time" already has the expected name, so it is left alone).
pa_lov = pa_lov.rename(
    {
        "station": "station_id",
        "value": "PM25_Concentration",
        "lat": "latitude",
        "lon": "longitude",
    }
)

# Boolean mask of missing PM2.5 readings. Assigning the DataArray returned by
# isnull() keeps the dimensions of PM25_Concentration itself, instead of
# relabelling a raw numpy array with a hard-coded ("time", "station_id") order
# that would silently mislabel the mask if the variable were stored transposed.
pa_lov["PM25_Missing"] = pa_lov["PM25_Concentration"].isnull()

pa_lov

In [9]:
# Flatten the dataset into a long table with the index levels as ordinary columns.
# NOTE: this relies on `pa_lov` still being the xarray Dataset from the
# preparation cell; a later cell rebinds the same name to a DataFrame.
save_df = pa_lov.to_dataframe().reset_index()

# "pressure" is filled with uniform random numbers in [0, 1), not real
# measurements (presumably a stand-in so the column exists -- TODO confirm).
# A seeded generator makes the saved CSV identical across re-runs; the original
# unseeded np.random.rand produced different values every time.
pressure_rng = np.random.default_rng(0)
save_df["pressure"] = pressure_rng.random(len(save_df))
save_df.head()

Unnamed: 0,time,station_id,PM25_Concentration,latitude,longitude,PM25_Missing,pressure
0,2021-06-01,64724,9.663158,34.017452,-118.436905,False,0.552573
1,2021-06-01,72255,16.125,34.15984,-119.18229,False,0.544259
2,2021-06-01,67566,15.448276,33.953278,-118.24304,False,0.018288
3,2021-06-01,70490,9.510345,34.1089,-118.4452,False,0.551038
4,2021-06-01,222949,12.683333,34.067818,-118.25429,False,0.631995


In [10]:
# Write the prepared table to the NP folder; index=False keeps the row index out of the CSV.
save_df.to_csv("/home/patel_zeel/STGNP/dataset/bjair/NP/processed_raw.csv", index=False)

### Create the stations table (`stations.csv`)

In [11]:
# Re-open the raw dataset (plain string: there is nothing to interpolate).
pa_lov = xr.open_dataset("/opt/aqmsp_models/data/purpleair/lov/data.nc")

# Keep only the first time step, rename to the reference stations.csv column
# names, and drop the columns that are not station attributes.
# NOTE: `pa_lov` is rebound from a Dataset to a DataFrame here.
pa_lov = (
    pa_lov.isel(time=0)
    .rename({"station": "station_id", "lat": "latitude", "lon": "longitude"})
    .to_dataframe()
    .reset_index()
    .drop(["time", "value"], axis=1)
)

pa_lov.to_csv("/home/patel_zeel/STGNP/dataset/bjair/NP/stations.csv", index=False)

# Preview and row count go last so the notebook actually displays them
# (as a mid-cell expression the result was silently discarded).
pa_lov.head(), len(pa_lov)

In [12]:
# Show the metadata loaded earlier, before it is modified in the next cell.
metadata

{'cont_cols': ['v_speed',
  'u_speed',
  'PM25_Concentration',
  'PM10_Concentration',
  'NO2_Concentration',
  'CO_Concentration',
  'O3_Concentration',
  'SO2_Concentration',
  'latitude',
  'longitude',
  'temperature',
  'pressure',
  'humidity',
  'wind_speed']}

In [13]:
# Replace the list of continuous columns with the ones present in the new
# PurpleAir table, then store the updated metadata in the NP folder.
metadata.update(
    cont_cols=["latitude", "longitude", "PM25_Concentration", "pressure"]
)
print(metadata)
pd.to_pickle(
    metadata,
    "/home/patel_zeel/STGNP/dataset/bjair/NP/meta_data.pkl",
)

{'cont_cols': ['latitude', 'longitude', 'PM25_Concentration', 'pressure']}


In [14]:
# 80% of the locations are used for testing

# NOTE(review): 1022 is assumed to equal the number of rows in stations.csv --
# confirm against the station table before relying on this split.
N_STATIONS = 1022
TEST_FRACTION = 0.8
SPLIT_SEED = 0  # fixed seed so the saved split is the same on every run

len_test_locs = int(N_STATIONS * TEST_FRACTION)

# Shuffle the station positions 0..N_STATIONS-1 and keep the first 80% as test nodes.
test_locs = np.random.default_rng(SPLIT_SEED).permutation(N_STATIONS)[:len_test_locs]
np.save("/home/patel_zeel/STGNP/dataset/bjair/NP/test_nodes.npy", test_locs)