In [None]:
import glob
import pandas as pd
import xarray as xr
import numpy as np

In [None]:
# Load the OpenAQ CSV export (the file name suggests 2015 data with duplicates
# removed) and use its "date.utc" column as the index; parse_dates=True asks
# pandas to parse that index as datetimes.
df_openaq = pd.read_csv(
    filepath_or_buffer='/nfs/b0122/Users/earlacoa/openaq/csv/openaq_data_2015_noduplicates.csv',
    parse_dates=True,
    index_col="date.utc",
)

In [None]:
# Convert the China measurement NetCDF files into OpenAQ-style rows and collect
# them per calendar year (by UTC timestamp) in `df_obs_summaries`.
years = ['2014', '2015', '2016', '2017', '2018', '2019', '2020']
# NetCDF variable name -> OpenAQ-style parameter name.
parameters = {'CO': 'co', 'NO2': 'no2', 'O3': 'o3', 'PM10': 'pm10', 'PM2.5': 'pm25', 'SO2': 'so2'}
# One list of DataFrames per year, derived from `years` so the two cannot drift apart.
df_obs_summaries = {year: [] for year in years}
# glob returns matches in arbitrary order; sort so the output row order is reproducible.
china_obs_files = sorted(glob.glob('/nfs/a68/earlacoa/china_measurements_corrected/*nc'))


def obs_to_openaq_frame(ds_obs, parameter, openaq_parameter):
    """Build an OpenAQ-style DataFrame for one variable of one observation dataset.

    Parameters
    ----------
    ds_obs : dataset-like
        Object exposing ``time.values``, ``city``, ``name``, ``lat``, ``lon`` and
        ``ds_obs[parameter].values``.
        Assumes ``ds_obs[parameter].values`` is 1-D and aligned with
        ``ds_obs.time.values`` -- TODO confirm against the NetCDF files.
    parameter : str
        Name of the variable to read from ``ds_obs`` (e.g. ``'PM2.5'``).
    openaq_parameter : str
        Value written to the ``parameter`` column (e.g. ``'pm25'``).

    Returns
    -------
    pandas.DataFrame
        One row per timestamp, indexed by ``date.utc``.
    """
    times_utc = ds_obs.time.values
    dict_obs = {
        'date.utc': times_utc,
        'city': ds_obs.city,
        # NOTE(review): every parameter, including CO, is labelled µg/m³ here --
        # confirm the source files really use that unit for CO.
        'unit': 'µg/m³',
        'value': ds_obs[parameter].values,
        'country': 'CN',
        'location': ds_obs.name,
        'parameter': openaq_parameter,
        'sourceName': 'China measurements',
        'sourceType': 'government',
        # Local time is derived as a fixed UTC+8 offset.
        'date.local': times_utc + np.timedelta64(8, 'h'),
        'coordinates.latitude': ds_obs.lat,
        'coordinates.longitude': ds_obs.lon,
        'averagingPeriod.unit': 'hours',
        'averagingPeriod.value': 1
    }
    return pd.DataFrame.from_dict(dict_obs).set_index('date.utc')


def rows_for_year(df_obs, year):
    """Return the rows of ``df_obs`` whose datetime index falls in ``year``.

    ``year`` may be a string or an int. A boolean mask is used instead of
    ``df_obs[year]``: string row-indexing through ``DataFrame.__getitem__`` was
    removed in pandas 2.0, and the mask yields an empty frame (rather than
    raising ``KeyError``) when the frame has no data for that year.
    """
    return df_obs[df_obs.index.year == int(year)]


for china_obs_file in china_obs_files:
    # The context manager closes the file even if building a frame fails.
    with xr.open_dataset(china_obs_file) as ds_obs:
        for parameter, openaq_parameter in parameters.items():
            df_obs = obs_to_openaq_frame(ds_obs, parameter, openaq_parameter)

            for year in years:
                df_obs_summaries[year].append(rows_for_year(df_obs, year))

In [None]:
# Write one combined CSV per year: stack every frame collected for that year
# and save it next to the source NetCDF files.
for year in years:
    yearly_frames = df_obs_summaries[year]
    df_obs_summaries_concat = pd.concat(yearly_frames)
    df_obs_summaries_concat.to_csv(
        f'/nfs/a68/earlacoa/china_measurements_corrected/df_obs_summary_{year}.csv'
    )