In [1]:
import pandas as pd
import numpy as np
import os
import re
import time
from datetime import datetime
from tqdm import tqdm_notebook

In [2]:
input_root = "../../data/raw/aqi/aqi_2015_2016/"
output_root = "../../data/csv/aqi_csv/"
# input_root = "../../data/raw/aqi/aqi_2014/"
# output_root = "../../data/csv/aqi_csv_test/"
files = [f for f in os.listdir(input_root) if f.startswith('china_sites_')]
print(files[:5])

['china_sites_20150730.csv', 'china_sites_20160225.csv', 'china_sites_20150102.csv', 'china_sites_20150103.csv', 'china_sites_20150104.csv']


In [3]:
# test case
# files = files[:5]

In [4]:
def convert(pm10, no2, so2):
    """Accept three 1d arrays of the same length"""
    
    api_breaks = np.array([0, 50, 100, 200, 300, 400, 500])
    
    # convert from mg/m3 to ug/m3
    pm10_breaks = np.array([0, 0.050, 0.150, 0.350, 0.420, 0.500, 0.600]) * 1000
    so2_breaks = np.array([0, 0.050, 0.150, 0.800, 1.600, 2.100, 2.620]) * 1000
    no2_breaks = np.array([0, 0.080, 0.120, 0.280, 0.565, 0.750, 0.940]) * 1000
    
    # piecewise linear functions
    pm10_idx = np.interp(pm10, pm10_breaks, api_breaks)
    so2_idx = np.interp(so2, so2_breaks, api_breaks)
    no2_idx = np.interp(no2, no2_breaks, api_breaks)
    
    return(np.amax([pm10_idx, so2_idx, no2_idx], axis=0))

In [5]:
for file in tqdm_notebook(files):
    # parse and format dates
    csv_date = re.findall('\d+', file)[0]
    csv_date = datetime.strptime(csv_date, "%Y%m%d").strftime("%Y-%m-%d")
    # load file and take means
    csv = pd.read_csv(os.path.join(input_root, file))
    # csv = csv[csv.hour.between(10, 14)]
    if csv.shape[0] > 0:
        csv = csv.drop(['date', 'hour'], axis=1)
        csv = csv.set_index('type')
        csv = csv.drop(['CO_24h', 'NO2_24h', 'O3_24h', 'O3_8h',
                        'O3_8h_24h', 'PM10_24h', 'PM2.5_24h', 'SO2_24h'], axis=0)
        csv = csv.groupby('type').agg(np.nanmean)
        csv = csv.transpose()
        csv.columns = ["target_" + s for s in csv.columns]
        csv = csv.assign(date = csv_date)
        csv.index.name = 'id'
        csv = csv.assign(target_API = convert(
            pm10=csv.target_PM10.values,
            no2=csv.target_NO2.values,
            so2=csv.target_SO2.values))
        csv.to_csv(os.path.join(output_root, csv_date + '_AQI.csv'))

  f = lambda x: func(x, *args, **kwargs)



