## Extract relevant information from WHOGMDatabase.csv - from Shaddick et al.

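* Find ground monitor locations which have reported PM2.5 (`PM25`) values later than 2015
* Keep only the most recent year's record for each monitoring station
* Create a categorical (integer) country code from the `ISO3` country code
* Add a binary indicator `is_urban` (1 for urban or industrial monitors, 0 otherwise)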

Creates two data files:

* `world_pm25.csv`
* `north_america_pm25.csv`


In [None]:
import json
import pandas as pd
import numpy as np


In [None]:
# Load the raw WHO ground-monitor database; the file is decoded as ISO-8859-1.
who_clean = pd.read_csv(
    'WHOGMDatabase.csv',
    encoding="ISO-8859-1",
)

In [None]:
who_clean.keys()

# Columns discarded here are not referenced by the remaining cells shown in
# this notebook; everything else in the raw table is carried over into `world`.
unused_columns = [
    'StationIDOrig', 'StationIDOldDatabase',
    'City', 'CityGiulia', 'CityClean',
    'PM25PercCoverage', 'PM25Grading',
    'PM10', 'PM10PercCoverage', 'PM10Grading',
    'LocationInfo', 'Source', 'MonitorTypeOrig', 'PM25Conv',
    'UnspecifiedType', 'WebLink', 'Version',
    'WHOStatus', 'WHORegion', 'WHOIncomeRegion',
    'SDG1Region', 'SDG2Region', 'SDG3Region',
    'GBDRegion', 'GBDSuperRegion',
]
world = who_clean.drop(columns=unused_columns, axis=1)

In [None]:
# Show the column labels that remain after the drop.
world.columns

In [None]:
# Restrict to rows whose Year is strictly greater than 2015, printing the
# table shape before and after so the number of removed rows is visible.
print(world.shape)
reported_after_2015 = world['Year'] > 2015
world = world.loc[reported_after_2015]
print(world.shape)

In [None]:
# Discard rows that have no PM25 value, then show the remaining shape.
has_pm25 = world['PM25'].notna()
world = world[has_pm25]
world.shape

In [None]:
# Keep one row per station: the measurement from its most recent year.
# groupby(...)['Year'].idxmax() yields, for each StationID, the index label of
# the row with the largest Year (the first such row on ties). Selecting those
# labels with .loc is vectorized, keeps each column's dtype intact, and avoids
# GroupBy.apply's `include_groups` argument, which pandas releases before 2.2
# do not accept.
latest_row_labels = world.groupby('StationID')['Year'].idxmax()
# StationID is only needed for the grouping and is not referenced by the later
# cells shown in this notebook, so it is dropped from the result.
world = (
    world.loc[latest_row_labels]
    .drop(columns=['StationID'])
    .reset_index(drop=True)
)
world.shape


In [None]:
# is_urban: 1 when MonitorType is one of the listed urban/industrial labels
# (both capitalisations are matched explicitly), otherwise 0.
urban_monitor_labels = ['Urban', 'urban', 'Industrial', 'industrial']
world['is_urban'] = world['MonitorType'].isin(urban_monitor_labels).astype(int)

# country: 1-based integer code derived from the categorical encoding of ISO3.
iso3_categorical = world['ISO3'].astype('category')
world['country'] = iso3_categorical.cat.codes + 1


In [None]:
# Display the processed table (one row per station, with is_urban and country).
world

In [None]:
import csv

# Write the selected columns of the world-wide table to world_pm25.csv.
# QUOTE_NONNUMERIC quotes every non-numeric field; the row index is omitted.
world_output_columns = [
    'Year', 'CityReverseGeocoded', 'Longitude', 'Latitude',
    'PM25', 'is_urban', 'country',
]
world.to_csv(
    'world_pm25.csv',
    columns=world_output_columns,
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)

In [None]:
# Select the stations located in Canada, the USA and Mexico.
in_north_america = world['ISO3'].isin(['CAN', 'USA', 'MEX'])
na_slice = world.loc[in_north_america]

# Work on an independent, re-indexed copy so the new country codes below do
# not touch `world`.
north_america = na_slice.copy(deep=True)
north_america = north_america.reset_index(drop=True)

# Recompute the 1-based country code over just the ISO3 values in this subset.
na_iso3 = north_america['ISO3'].astype('category')
north_america['country'] = na_iso3.cat.codes + 1

In [None]:
# Preview the first three rows of the North America subset.
north_america.head(3)

In [None]:
# Preview the last three rows of the North America subset.
north_america.tail(3)

In [None]:
# Show the (rows, columns) size of the North America subset.
north_america.shape

In [None]:
# Write the selected columns of the North America subset to
# north_america_pm25.csv, using the same column selection and quoting as the
# world-wide export. `csv` is imported in an earlier cell.
na_output_columns = [
    'Year', 'CityReverseGeocoded', 'Longitude', 'Latitude',
    'PM25', 'is_urban', 'country',
]
north_america.to_csv(
    'north_america_pm25.csv',
    columns=na_output_columns,
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)