In [1]:
import pandas as pd
import numpy as np
from glob import glob
import os
import matplotlib.pyplot as plt

# Apply matplotlib's 'ggplot' style sheet to any figures drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Data directories, given relative to the notebook's working directory.
# monitor_root: location of monitor_processed.csv (monitor id -> city_cn lookup).
monitor_root = "../../data/csv/monitor"
# policy_root: location of PM2.5_reporting_date.csv (indexed by city_cn).
policy_root = "../../data/policy/"
# input_root: the per-pollutant prediction_output.csv files are read from here,
# and the merged regression_main.csv is written back into the same directory.
input_root = "../../data/output/xgbooster/"

In [3]:
# Monitor lookup table: indexed by monitor id, reduced to the city name column.
monitor_csv = os.path.join(monitor_root, "monitor_processed.csv")
monitor_coords = pd.read_csv(monitor_csv, index_col='id').loc[:, ['city_cn']]

In [4]:
# Policy table, indexed by city_cn so it can later be joined on that column.
policy_csv = os.path.join(policy_root, "PM2.5_reporting_date.csv")
policy = pd.read_csv(policy_csv, index_col='city_cn')

In [5]:
# Load one prediction file per pollutant and outer-join them on the (id, date)
# index, so every row carries one 'pred_<pollutant>' column per target.
# The single-name rename below requires each file to contain exactly one value
# column besides id/date; pandas raises a length-mismatch error otherwise.
frames = []
for target_name in ["PM2.5", "PM10", "SO2", "NO2", "CO", "O3"]:
    df = pd.read_csv(os.path.join(input_root, target_name, "prediction_output.csv"),
                     index_col=['id', 'date'])
    df.columns = ['pred_' + target_name]
    frames.append(df)

# Seed the join with the first frame explicitly rather than testing `pred.empty`
# against a placeholder DataFrame: `.empty` is also True for a frame that has
# columns but zero rows, so a row-less first file would have had its column
# silently replaced by the next pollutant instead of being joined.
pred = frames[0]
for df in frames[1:]:
    pred = pred.join(df, how='outer')

In [6]:
# Move the index levels back into ordinary columns (rebinding instead of mutating in place).
pred = pred.reset_index()

In [10]:
# Parse the date column once, then derive the calendar fields from the parsed values.
dates = pd.to_datetime(pred['date'])
pred['date'] = dates
pred['year'] = dates.dt.year
pred['month'] = dates.dt.month
# 7-day bucket within the year.
# NOTE(review): dayofyear is 1-based, so bucket 0 covers only Jan 1-6 and the
# last bucket (52) starts at day 364 -- confirm whether (dayofyear - 1) // 7 was
# intended before changing it, since other consumers may rely on this indexing.
pred['week'] = dates.dt.dayofyear // 7

In [15]:
# Collapse to one row per (year, week, id): each prediction column is averaged,
# and 'month' is reduced to the median of the months present in the group.
pollutant_cols = ['pred_PM2.5', 'pred_PM10', 'pred_SO2', 'pred_NO2', 'pred_CO', 'pred_O3']
weekly_agg = {col: 'mean' for col in pollutant_cols}
weekly_agg['month'] = 'median'
pred = pred.groupby(['year', 'week', 'id']).agg(weekly_agg)

In [16]:
# Attach city_cn from the monitor lookup; rows of pred are kept as-is (left join)
# and matched through the 'id' index level that both frames share.
pred = pred.join(monitor_coords, how='left')

In [17]:
# Attach the policy columns by looking up each row's city_cn value in policy's index
# (left join: rows whose city has no policy entry are kept with missing values).
pred = pred.join(policy, how='left', on='city_cn')

In [18]:
# Preview only the first rows instead of rendering the whole merged frame.
pred.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pred_PM10,pred_O3,pred_NO2,pred_PM2.5,pred_CO,month,pred_SO2,city_cn,start_date
year,week,id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2005,0,1001A,93.011356,28.608202,53.519597,63.725131,1.545789,1,28.616465,北京,2012-01-21
2005,0,1002A,100.247579,36.950301,45.941632,95.209747,1.789880,1,23.264679,北京,2012-01-21
2005,0,1003A,98.808754,29.325108,52.563054,75.921958,1.881769,1,31.146584,北京,2012-01-21
2005,0,1004A,89.367277,30.325090,56.883238,63.517094,1.544300,1,23.842271,北京,2012-01-21
2005,0,1005A,110.084605,31.851208,69.765810,97.002444,1.889709,1,30.963602,北京,2012-01-21
2005,0,1006A,90.303267,23.959218,59.433468,60.578987,1.479513,1,27.868104,北京,2012-01-21
2005,0,1007A,115.930890,18.711527,68.313146,83.160965,2.276866,1,28.928995,北京,2012-01-21
2005,0,1008A,,33.594915,48.745627,91.114088,1.619710,1,20.578966,北京,2012-01-21
2005,0,1009A,,31.012222,38.890380,79.014516,1.387865,1,16.324206,北京,2012-01-21
2005,0,1010A,123.840881,27.307504,65.695341,91.302496,1.989450,1,29.432250,北京,2012-01-21


In [19]:
# Persist the merged weekly table next to the prediction inputs.
output_csv = os.path.join(input_root, "regression_main.csv")
pred.to_csv(output_csv)