# Main script to clean UW final satellite pm2.5 data

## Model 1: zip_ssn

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@wisc.edu <br>
Date created: Oct 20, 2020 <br>

**Citations (data sources)**

``PM2.5 at the monthly level:``
1. https://sites.wustl.edu/acag/datasets/historical-pm2-5-across-north-america/


``Shapefiles for California ZIP codes (2010 census):``

2. https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2010&layergroup=ZIP+Code+Tabulation+Areas
    
**Citations (persons)**
1. N/A

**Preferred environment**
1. Code written in Jupyter Notebooks

#### Step 1: Import packages

In [1]:
# standard
import pandas as pd
import numpy as np
import os
import h5py

# geography
import geopandas as gpd
import osmnx as ox
import shapely

#### Step 2: Define working directories

In [1]:
# Input: directory holding the monthly UW PM2.5 ``.h5`` files
in_dir = "/Users/cilin/Desktop/CA_hospitals/pm25/UW/data/pm25_monthly/"
# Input: directory holding the California ZIP code shapefiles
in_dir_zip_shapes = "/Users/cilin/Desktop/CA_hospitals/pm25/UW/data/shapefiles/"
# Output: directory where the cleaned CSV is written
# NOTE(review): the path contains a double slash after "Desktop"; harmless on POSIX, kept as-is
out_dir = "/Users/cilin/Desktop//CA_hospitals/pm25/UW/output/"

#### Step 3: Define functions

``read data``

In [3]:
def read_census_geom():
    """ Read Census (lat, lon) coordinates for California zip-codes

    Loads a ``.shp`` file found in ``in_dir_zip_shapes`` and flattens the
    exterior ring of every (multi)polygon into one row per boundary vertex,
    with the vertex snapped to the 0.01-degree grid used by the UW pm25 data.

    parameters:
    -----------
    None (the shapefile directory is the module-level ``in_dir_zip_shapes``)

    return:
    -------
    Df with columns 'latitude', 'longitude', 'ZIP10', 'GEOID10'; one row per
    unique (ZIP10, latitude, longitude), sorted by those three columns

    raises:
    -------
    FileNotFoundError if ``in_dir_zip_shapes`` contains no ``.shp`` file
    """
    out_cols = ['latitude', 'longitude', 'ZIP10', 'GEOID10']
    key_cols = ['ZIP10', 'latitude', 'longitude']

    ### Step 1 ###
    ##############
    # Read the shapefile for California's ZIP codes
    shp_files = sorted(
        name for name in os.listdir(in_dir_zip_shapes) if name.endswith('.shp')
    )
    if not shp_files:
        raise FileNotFoundError(
            'No .shp file found in ' + in_dir_zip_shapes
        )
    # NOTE(review): only one shapefile is expected; if several are present the
    # last one (alphabetically) is used, since only a single gdf is kept.
    gdf = gpd.read_file(os.path.join(in_dir_zip_shapes, shp_files[-1]))

    # keep only cols of interest
    # ('ZCTA5CE10' = 2010 Census ZIP codes; 'GEOID10' was labelled "2010 Census
    # Tract codes" by the original author -- NOTE(review): confirm against the
    # shapefile metadata)
    gdf = gdf[['ZCTA5CE10', 'GEOID10', 'geometry']]

    ### Step 2 ###
    ###############
    # For each zip code extract the polygon vertices with (lat, lon) info

    frames = []
    # iterate the columns in lockstep so the lookup does not depend on the
    # GeoDataFrame having a default 0..n-1 index
    for geom, zip_code, geoid in zip(gdf['geometry'], gdf['ZCTA5CE10'], gdf['GEOID10']):
        if geom is None:
            continue

        if geom.geom_type == 'Polygon':
            polygons = [geom]
        elif geom.geom_type == 'MultiPolygon':
            # multi-part geometries are not iterable in Shapely 2.x;
            # `.geoms` works in both Shapely 1.x and 2.x
            polygons = list(geom.geoms)
        else:
            # other geometry types carry no polygon boundary to extract
            continue

        for poly in polygons:
            # coords.xy is (x, y) = (longitude, latitude)
            lon, lat = poly.exterior.coords.xy
            frames.append(pd.DataFrame({'latitude': lat,
                                        'longitude': lon,
                                        'ZIP10': zip_code,
                                        'GEOID10': geoid}))

    if not frames:
        # no polygon geometry in the shapefile: return an empty, well-formed table
        return pd.DataFrame(columns=out_cols)

    # a single concat instead of growing the frame inside the loop
    zip_poly = pd.concat(frames, axis=0, ignore_index=True)

    # round (lat, lon) to 2 decimal points and add 0.005 to match the UW (lat, lon) values;
    # the final round(3) removes floating-point noise from the addition so that
    # the values compare equal to coordinates that were rounded to 3 decimals
    zip_poly['latitude'] = (zip_poly['latitude'].round(2) + 0.005).round(3)
    zip_poly['longitude'] = (zip_poly['longitude'].round(2) + 0.005).round(3)

    zip_poly = (zip_poly
                .sort_values(by=key_cols)
                .drop_duplicates(subset=key_cols)
                .reset_index(drop=True))

    return zip_poly

**TODO:** For the function below, try reading the data directly from the remote server with Python's `ftplib` module (use `ftplib.FTP_TLS` if a secure transfer is needed).

In [4]:
def read_uw_pm25(zip_poly):
    """Read UW pm25 data and average it by year and zip code

    For every ``.h5`` file in ``in_dir`` the pm25 grid is reshaped to one row
    per (latitude, longitude) cell, matched to the zip code boundary points in
    ``zip_poly`` and averaged per (year, ZIP10, GEOID10).

    parameters:
    -----------
    zip_poly: df, with columns 'latitude', 'longitude', 'ZIP10' and 'GEOID10'
        (the output of read_census_geom())

    return:
    -------
    df with columns 'year', 'ZIP10', 'GEOID10', 'pm25' (mean over the matched
    grid cells) and 'year_zip', sorted by year

    raises:
    -------
    FileNotFoundError if ``in_dir`` contains no ``.h5`` file
    """
    # sorted() makes the processing (and printing) order deterministic
    h5_files = sorted(name for name in os.listdir(in_dir) if name.endswith('.h5'))
    if not h5_files:
        raise FileNotFoundError('No .h5 file found in ' + in_dir)

    merge_keys = ['latitude', 'longitude']

    # The merge below is an exact float comparison: put both sides on the same
    # footing (float64, 3 decimals). Work on a copy so the caller's df is untouched.
    zip_keys = zip_poly.copy()
    zip_keys[merge_keys] = zip_keys[merge_keys].astype(float).round(3)

    yearly = []

    for file in h5_files:
        # the first 4 characters of the file name are used as the year
        year = file[:4]
        print(year)

        # read data; the context manager closes the file even if a read fails
        with h5py.File(os.path.join(in_dir, file), 'r') as f:
            lat = np.asarray(f['latitude'], dtype=float).ravel()
            lon = np.asarray(f['longitude'], dtype=float).ravel()
            # read pm25 (divide by 100 as indicated here: https://zenodo.org/record/2616769#.X4999NBKg4c)
            values = np.asarray(f['CorrectedPM2.5']) / 100

        # rows of the grid are latitudes, columns are longitudes
        grid = pd.DataFrame(values, columns=lon)
        grid.insert(0, 'latitude', lat)

        # melt to long format: one row per (latitude, longitude) cell
        pm25 = pd.melt(grid, id_vars='latitude', value_vars=lon,
                       var_name='longitude', value_name='pm25')

        # set lat and lon to 3 decimals
        pm25['latitude'] = pm25['latitude'].astype(float).round(3)
        pm25['longitude'] = pm25['longitude'].astype(float).round(3)

        # add year column
        pm25['year'] = year

        # merge with zip_poly (keeps only grid cells that hit a boundary point)
        pm25 = zip_keys.merge(pm25, on=merge_keys, how='inner')

        # group by zip code and GEOID10 (get mean for each zip code)
        pm25 = pm25.groupby(['year', 'ZIP10', 'GEOID10'], as_index=False).agg({'pm25': 'mean'})

        # add year and zip column
        # NOTE(review): the '.0' suffixes presumably mirror a key built from
        # float-typed year/zip columns in another dataset -- confirm with the consumer
        pm25['year_zip'] = pm25.year.astype(str) + '.0_' + pm25.ZIP10.astype(str) + '.0'

        yearly.append(pm25)

    # a single concat instead of growing the frame inside the loop
    df = pd.concat(yearly, axis=0)

    # sort (stable, so rows keep their order within each year) and reset index
    df = df.sort_values(by=['year'], kind='mergesort').reset_index(drop=True)

    return df

#### Step 4: Read data

In [5]:
# Build the table of ZIP code boundary points from the Census shapefile
zip_poly = read_census_geom()

In [6]:
# Read every pm25 file and average the values per year and ZIP code
# (prints the 4-character year prefix of each file as it is processed)
df = read_uw_pm25(zip_poly)

1992
2013
2003
1996
2007
1997
2016
2006
1993
2012
2002
1998
2009
1999
2008
1994
2015
2005
2011
2001
1991
2010
2000
1995
2014
2004


In [7]:
# Order the final table by ZIP code, then GEOID10, then year
df = df.sort_values(by=['ZIP10', 'GEOID10', 'year'])

In [8]:
# Write the cleaned table to the output directory (the index is written as the first column)
out_file = out_dir + 'UW_pm25_zip_monthly.csv'
df.to_csv(out_file)