In [27]:
import pandas as pd
import numpy as np
from glob import glob
import os, re
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

# Apply matplotlib's built-in "ggplot" style sheet to any figures drawn later.
plt.style.use('ggplot')

In [28]:
# Data locations, all relative to this notebook's working directory.
# Model outputs: one sub-folder per target, holding the prediction and R^2 CSVs.
input_root = '../../data/output/xgbooster/'
# Monitor metadata folder (contains monitor_coords.csv).
monitor_root = '../../data/csv/monitor'
# Test-set CSV folder; not referenced by the cells shown in this part of the notebook.
test_root = '../../data/csv/aqi_csv_test/'

In [29]:
# Read the monitor coordinates table and key it by the monitor "id" column.
coords_path = os.path.join(monitor_root, "monitor_coords.csv")
monitor_coords = pd.read_csv(coords_path).set_index("id")

In [54]:
# Evaluation settings. Values this notebook has been run with:
#   freq      : "day", "7day", "14day"   (aggregation window for the R^2)
#   method    : "cv", "test"
#   main_file : "train_output.csv", "test_output.csv"
#               (presumably paired with "cv" and "test" respectively - confirm)
freq = 'day'
method = 'cv'
main_file = 'train_output.csv'
# Name of the station-level R^2 file inside each target folder, e.g. "day_r2_cv.csv".
r2_file = "{}_r2_{}.csv".format(freq, method)

In [55]:
# R^2 file used to choose which stations are kept: the summary loop keeps only
# stations whose r2 in this file is at or above the file's median.
drop_file = 'day_r2_cv.csv'

In [56]:
# Build the R^2 summary table: one row per target, NaN until filled in below.
targets = ["API", "PM10", "PM2.5", "O3", "SO2", "NO2", "CO"]
results = pd.DataFrame(
    np.nan, index=targets,
    columns=["median", "5p", "10p", "90p", "95p", "overall"])

# Columns used to aggregate the daily rows before scoring the overall R^2.
# None means the daily rows are scored exactly as they are.
group_keys_by_freq = {
    "day": None,
    "7day": ["iso_year", "week"],
    "14day": ["iso_year", "biweek"]}
# Validate the setting once, before any file is read.
if freq not in group_keys_by_freq:
    raise ValueError("Specify Frequency")
group_keys = group_keys_by_freq[freq]

# Station-level percentiles reported per target (column name <-> percentile).
percentile_names = ["5p", "10p", "median", "90p", "95p"]
percentile_points = [5, 10, 50, 90, 95]

for target_name in tqdm_notebook(targets):
    # ---- overall R^2 ----
    main = pd.read_csv(os.path.join(input_root, target_name, main_file))
    main['date'] = pd.to_datetime(main['date'])
    # ISO week bookkeeping, derived from the Thursday of each date's
    # Monday-based week: that Thursday's calendar year is the ISO year and its
    # day-of-year gives the ISO week number. This avoids `Series.dt.week`
    # (removed in pandas 2.0) and, unlike pairing the *calendar* year with the
    # ISO week, never merges late-December days with the first week of the
    # same calendar year.
    week_thursday = main.date.dt.normalize() + pd.to_timedelta(
        3 - main.date.dt.dayofweek, unit='D')
    iso_week = (week_thursday.dt.dayofyear - 1) // 7 + 1
    main = main.assign(
        year=main.date.dt.year,          # calendar year, kept for reference
        iso_year=week_thursday.dt.year,  # year the ISO week belongs to
        week=iso_week,
        biweek=(iso_week + 1) // 2)      # weeks (1,2)->1, (3,4)->2, ...
    if group_keys is None:
        scored = main
    else:
        # Average truth and prediction over each window, then score the means.
        scored = main.groupby(group_keys)[['true', 'pred']].mean()
    results.loc[target_name, 'overall'] = r2_score(
        scored['true'].values, scored['pred'].values)

    # ---- station-specific R^2 ----
    r2 = pd.read_csv(os.path.join(input_root, target_name, r2_file), index_col='id')
    # Keep only the stations whose r2 in `drop_file` is at or above its median.
    drop = pd.read_csv(os.path.join(input_root, target_name, drop_file))
    ids = set(drop.loc[drop.r2 >= np.percentile(drop.r2, q=50), 'id'])
    ids = ids.intersection(set(r2.index))
    # Boolean mask instead of `.loc[set, :]`: pandas 2.0 rejects set indexers.
    r2 = r2.loc[r2.index.isin(ids), :]
    results.loc[target_name, percentile_names] = np.percentile(
        r2.r2.values, q=percentile_points)




In [57]:
# Display the unrounded summary table (rows: targets; columns: overall R^2 and
# the station-level percentiles) as the cell output.
results

Unnamed: 0,10p,5p,90p,95p,median,overall
API,0.293208,0.280258,0.467399,0.514954,0.362494,0.547382
PM10,0.284773,0.275206,0.459421,0.500277,0.354984,0.508091
PM2.5,0.332238,0.322423,0.502587,0.522303,0.391452,0.503471
O3,0.444517,0.430429,0.688065,0.726127,0.545479,0.618097
SO2,0.187106,0.157242,0.586689,0.646528,0.358325,0.57289
NO2,0.305031,0.288977,0.527643,0.564381,0.411268,0.600298
CO,0.161985,0.142338,0.505114,0.562199,0.302467,0.529981


In [58]:
# Prepare the table for publication: round every statistic to two decimals,
# add a LaTeX-formatted label per target, and put the columns in print order.
display_names = [
    "API", "PM$_{10}$", "PM$_{2.5}$", "O$_3$", "SO$_2$", "NO$_2$", "CO"]
column_order = ['target', 'overall', '5p', '10p', 'median', '90p', '95p']
results = results.round(2).assign(target=display_names)[column_order]

In [59]:
# Replace the flat column names with a two-level header: the five percentile
# columns share one top-level label, the first two columns have a blank
# second level.
percentile_header = 'Station-Specific R$^2$ Percentiles'
header_tuples = [("Target Variable", " "), ("Overall R$^2$", " ")]
for percent_label in ("5\\%", "10\\%", "50\\%", "90\\%", "95\\%"):
    header_tuples.append((percentile_header, percent_label))
results.columns = pd.MultiIndex.from_tuples(header_tuples)

In [60]:
# Display the rounded table with its two-level header as the cell output.
results

Unnamed: 0_level_0,Target Variable,Overall R$^2$,Station-Specific R$^2$ Percentiles,Station-Specific R$^2$ Percentiles,Station-Specific R$^2$ Percentiles,Station-Specific R$^2$ Percentiles,Station-Specific R$^2$ Percentiles
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,5\%,10\%,50\%,90\%,95\%
API,API,0.55,0.28,0.29,0.36,0.47,0.51
PM10,PM$_{10}$,0.51,0.28,0.28,0.35,0.46,0.5
PM2.5,PM$_{2.5}$,0.5,0.32,0.33,0.39,0.5,0.52
O3,O$_3$,0.62,0.43,0.44,0.55,0.69,0.73
SO2,SO$_2$,0.57,0.16,0.19,0.36,0.59,0.65
NO2,NO$_2$,0.6,0.29,0.31,0.41,0.53,0.56
CO,CO,0.53,0.14,0.16,0.3,0.51,0.56


In [61]:
# Export the formatted table as LaTeX into the draft folder; the file name
# encodes the current `freq` and `method`.
tex_path = "../../draft/" + freq + "_r2_" + method + "_output.tex"
latex_options = dict(
    index=False,    # omit the DataFrame index from the exported table
    escape=False,   # keep the raw LaTeX in headers and labels ($...$, \%)
    na_rep="",
    # NOTE(review): L{..}/C{..} are not built-in LaTeX column types; presumably
    # the draft's preamble defines them - confirm.
    column_format="L{1.2in}C{0.8in}C{0.6in}C{0.6in}C{0.6in}C{0.6in}C{0.6in}")
results.to_latex(tex_path, **latex_options)

In [62]:
# Post-process an exported table in place with sed: switch the
# "Station-Specific R$^2$ Percentiles" \multicolumn{5} header from {l} to {c}
# alignment and append a \cmidrule{3-7} after it.
# The target file name is hard-coded (day / test), not built from `freq`/`method`.
! sed -i -e 's/multicolumn{5}{l}{Station-Specific R$^2$ Percentiles} \\\\/multicolumn{5}{c}{Station-Specific R$^2$ Percentiles}\\\\ \\cmidrule{3-7} \&/' ../../draft/day_r2_test_output.tex

In [63]:
# Post-process an exported table in place with sed: switch the
# "Station-Specific R$^2$ Percentiles" \multicolumn{5} header from {l} to {c}
# alignment and append a \cmidrule{3-7} after it.
# The target file name is hard-coded (7day / test), not built from `freq`/`method`.
! sed -i -e 's/multicolumn{5}{l}{Station-Specific R$^2$ Percentiles} \\\\/multicolumn{5}{c}{Station-Specific R$^2$ Percentiles}\\\\ \\cmidrule{3-7} \&/' ../../draft/7day_r2_test_output.tex

In [64]:
# Post-process an exported table in place with sed: switch the
# "Station-Specific R$^2$ Percentiles" \multicolumn{5} header from {l} to {c}
# alignment and append a \cmidrule{3-7} after it.
# The target file name is hard-coded (day / cv), not built from `freq`/`method`.
! sed -i -e 's/multicolumn{5}{l}{Station-Specific R$^2$ Percentiles} \\\\/multicolumn{5}{c}{Station-Specific R$^2$ Percentiles}\\\\ \\cmidrule{3-7} \&/' ../../draft/day_r2_cv_output.tex

In [65]:
# Post-process an exported table in place with sed: switch the
# "Station-Specific R$^2$ Percentiles" \multicolumn{5} header from {l} to {c}
# alignment and append a \cmidrule{3-7} after it.
# The target file name is hard-coded (7day / cv), not built from `freq`/`method`.
! sed -i -e 's/multicolumn{5}{l}{Station-Specific R$^2$ Percentiles} \\\\/multicolumn{5}{c}{Station-Specific R$^2$ Percentiles}\\\\ \\cmidrule{3-7} \&/' ../../draft/7day_r2_cv_output.tex