# Libraries

In [1]:
import numpy as np
import pandas as pd
import requests
import requests_cache
import lxml.html as lx
import censusgeocode as cg
from save_load import save_load
# Rebind the imported name `save_load` to the object returned by calling it;
# from here on `save_load` refers to that object, not the imported callable
# (e.g. the commented-out cells below use `save_load.load_object(...)`).
save_load = save_load()

# Install a requests_cache cache named "internal/mycache".
# NOTE(review): presumably this makes repeated requests.get() calls for the same
# URL come from the local cache instead of the network — confirm against the
# requests_cache documentation for the installed version.
requests_cache.install_cache("internal/mycache")

# Retrieve Locations

In [2]:
def get_info(url, names):
    """Retrieve the location-info table for the site shown on one AQMIS page.

    The page at ``url`` is fetched, the first ``title4`` anchor is read to get
    the location name and the link to its description page, and the table at
    index 1 of that description page is returned, indexed by location name.

    Args:
        url: URL of an AQMIS page with one site.
        names: Set of location names collected so far. A newly retrieved
            name is added to this set in place.

    Returns:
        A ``(df, names)`` tuple. ``df`` is the location table with a "Name"
        index, or None when the location is already in ``names`` or when
        any step (request, parsing, table lookup) fails.
    """
    import warnings

    try:
        response = requests.get(url)
        response.raise_for_status()
        html = lx.fromstring(response.text)

        # get location name and location description link
        location_name = html.xpath("//td/span[@class = 'title4']/a/text()")[0]
        link = html.xpath("//td/span[@class = 'title4']/a")[0].attrib["href"]

        # Skip sites already collected before issuing the second request.
        if location_name in names:
            return None, names

        detail_response = requests.get(link)
        # Check the status of the *detail* request (the original re-checked
        # the first response here, so errors on this page went unnoticed).
        detail_response.raise_for_status()

        # get tables from location description link
        tables = pd.read_html(detail_response.content)

        # table with index 1 is what we want
        df = tables[1]
        df["Name"] = location_name
        df.set_index("Name", inplace=True)
        names.add(location_name)
        return df, names
    except Exception as exc:
        # Best-effort scraping: a failing site is skipped rather than aborting
        # the whole run, but the failure is reported. `except Exception` (not a
        # bare `except`) lets KeyboardInterrupt/SystemExit propagate.
        warnings.warn(f"Skipping {url}: {exc!r}", stacklevel=2)
        return None, names


def get_all_info(links):
    """Build a data frame of unique locations and their location info.

    Args:
        links: Iterable of relative links to AQMIS tables, one per site;
            each is appended to "https://www.arb.ca.gov/aqmis2/".

    Returns:
        The concatenation of the one-location data frames returned by
        ``get_info``. Sites that were skipped (duplicates or failures) are
        left out; if no site yields a row, an empty DataFrame is returned.
    """
    base_url = "https://www.arb.ca.gov/aqmis2/"
    # will hold one-location data frames
    rows = []
    names = set()
    for link in links:
        row, names = get_info(base_url + link, names)
        # get_info returns None for duplicates and failures; keep real rows only.
        if row is not None:
            rows.append(row)
    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not rows:
        return pd.DataFrame()
    return pd.concat(rows)

# OZONE Locations

In [3]:
#ozone_links = save_load.load_object("internal/ozone_links.pkl")
#ozone_locations = get_all_info(ozone_links)
#ozone_locations.to_excel("datasets/ozone_locations.xlsx")

# PM2.5 Locations

In [4]:
#pm25_links = save_load.load_object("internal/pm25_links.pkl")
#pm25_locations = get_all_info(pm25_links)
#pm25_locations.to_excel("datasets/pm25_locations.xlsx")

# Get Census Tracts

In [17]:
def find_geoid(df):
    """Add census block identifiers to a data frame of coordinates.

    For every row, the "Longitude (W)" and "Latitude (N)" values are passed to
    ``cg.coordinates`` as ``x`` and ``y``, and the first entry of the result's
    "2010 Census Blocks" list supplies the new values.

    Args:
        df: Data frame with "Longitude (W)" and "Latitude (N)" columns.
            It is modified in place.

    Returns:
        The same ``df`` with four added columns: "GEOID10", "BLOCK10",
        "TRACT10" and "COUNTY".
    """
    geoids = []
    blocks = []
    tracts = []
    counties = []
    # Iterate the columns positionally: `df[col][i]` is a label lookup, which
    # on a non-integer index (e.g. "Name") only works via pandas' deprecated
    # integer-to-positional fallback.
    for x_coord, y_coord in zip(df["Longitude (W)"], df["Latitude (N)"]):
        info = cg.coordinates(x=x_coord, y=y_coord)
        census_block = info["2010 Census Blocks"][0]
        geoids.append(census_block["GEOID"])
        blocks.append(census_block["BLOCK"])
        tracts.append(census_block["TRACT"])
        counties.append(census_block["COUNTY"])
    df["GEOID10"] = geoids
    df["BLOCK10"] = blocks
    df["TRACT10"] = tracts
    df["COUNTY"] = counties
    return df

In [16]:
# Sample lookup: display the first "2010 Census Blocks" record for one coordinate pair.
cg.coordinates(x=-116.7, y=32.8)["2010 Census Blocks"][0]

{'SUFFIX': '',
 'GEOID': '060730212022060',
 'CENTLAT': '+32.7984118',
 'BLOCK': '2060',
 'AREAWATER': 0,
 'STATE': '06',
 'BASENAME': '2060',
 'OID': 210404038360686,
 'LSADC': 'BK',
 'FUNCSTAT': 'S',
 'INTPTLAT': '+32.7984118',
 'NAME': 'Block 2060',
 'OBJECTID': 5278411,
 'TRACT': '021202',
 'CENTLON': '-116.7052746',
 'BLKGRP': '2',
 'AREALAND': 8992794,
 'INTPTLON': '-116.7052746',
 'MTFCC': 'G5040',
 'LWBLKTYP': 'L',
 'COUNTY': '073',
 'CENT': (-116.7052746, 32.7984118),
 'INTPT': (-116.7052746, 32.7984118)}

In [19]:
#ozone_locations = pd.read_excel("datasets/ozone_locations.xlsx")
#ozone_locations = ozone_locations.set_index("Name")
#ozone_locations = find_geoid(ozone_locations)
#ozone_locations.to_excel("datasets/ozone_locations.xlsx")

#pm25_locations = pd.read_excel("datasets/pm25_locations.xlsx")
#pm25_locations = pm25_locations.set_index("Name")
#pm25_locations = find_geoid(pm25_locations)
#pm25_locations.to_excel("datasets/pm25_locations.xlsx")

# Merging

In [4]:
def merge_locations(file_path, location_data):
    """Join location info onto the toxin data in an Excel file and save it back.

    The workbook at ``file_path`` is read, its "Unnamed: 0" column is renamed
    to "Name" and used as the index, ``location_data`` is joined on that
    index, and the result overwrites the same file.
    """
    merged = (
        pd.read_excel(file_path)
        .rename(columns={"Unnamed: 0": "Name"})
        .set_index("Name")
        .join(location_data)
    )
    merged.to_excel(file_path)

In [9]:
# Load the saved location tables, indexed by location name.
ozone_locations = pd.read_excel("datasets/ozone_locations.xlsx").set_index("Name")

pm25_locations = pd.read_excel("datasets/pm25_locations.xlsx").set_index("Name")

In [10]:
# Attach location info to each monthly/yearly dataset; every file is rewritten in place.
merge_locations(file_path="datasets/ozone_month.xlsx", location_data=ozone_locations)
merge_locations(file_path="datasets/ozone_year.xlsx", location_data=ozone_locations)
merge_locations(file_path="datasets/pm25_month.xlsx", location_data=pm25_locations)
merge_locations(file_path="datasets/pm25_year.xlsx", location_data=pm25_locations)