In [8]:
import os
import pandas as pd
import numpy as np

import logging
import time
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# NEEDS TO BE ABSOLUTE PATH: the value is handed unchanged to Chrome's
# 'download.default_directory' preference below.
PATH_TO_DOWNLOADS = "/Users/mdong/dataScience/projects-ml/waste-management/calrecycle-data"

# Chrome options shared by every webdriver.Chrome(options=options) call in this notebook.
options = Options()
prefs = {'download.default_directory' : PATH_TO_DOWNLOADS}
options.add_experimental_option('prefs', prefs)
# Pass the flag explicitly instead of assigning `options.headless = True`:
# newer Selenium releases dropped the `headless` property, and assigning to a
# missing attribute would silently leave the browser in headed mode.
options.add_argument('--headless')

## Notes before running script

> The downloads folder needs to be EMPTY before running, or at least must not contain any file with the default download name (`CountywideSummary.xlsx`).

In [9]:
# California's 58 counties, in the same (alphabetical) order as the <option>
# entries of the #CountyID dropdown on the CalRecycle page.
CA_counties = [
    'Alameda',
    'Alpine',
    'Amador',
    'Butte',
    'Calaveras',
    'Colusa',
    'Contra Costa',
    'Del Norte',
    'El Dorado',
    'Fresno',
    'Glenn',
    'Humboldt',
    'Imperial',
    'Inyo',
    'Kern',
    'Kings',
    'Lake',
    'Lassen',
    'Los Angeles',
    'Madera',
    'Marin',
    'Mariposa',
    'Mendocino',
    'Merced',
    'Modoc',
    'Mono',
    'Monterey',
    'Napa',
    'Nevada',
    'Orange',
    'Placer',
    'Plumas',
    'Riverside',
    'Sacramento',
    'San Benito',
    'San Bernardino',
    'San Diego',
    'San Francisco',
    'San Joaquin',
    'San Luis Obispo',
    'San Mateo',
    'Santa Barbara',
    'Santa Clara',
    'Santa Cruz',
    'Shasta',
    'Sierra',
    'Siskiyou',
    'Solano',
    'Sonoma',
    'Stanislaus',
    'Sutter',
    'Tehama',
    'Trinity',
    'Tulare',
    'Tuolumne',
    'Ventura',
    'Yolo',
    'Yuba',
]

# The first county is `#CountyID > option:nth-child(2)`, so CSS indices run 2..59.
FIRST_COUNTY_OPTION_INDEX = 2

# County name -> nth-child index. Derived from the list itself so the mapping
# cannot silently fall out of step with a hard-coded upper bound, and stored as
# plain Python ints rather than numpy integers.
county_selector_map = {
    county: css_index
    for css_index, county in enumerate(CA_counties, start=FIRST_COUNTY_OPTION_INDEX)
}
county_selector_map

{'Alameda': 2,
 'Alpine': 3,
 'Amador': 4,
 'Butte': 5,
 'Calaveras': 6,
 'Colusa': 7,
 'Contra Costa': 8,
 'Del Norte': 9,
 'El Dorado': 10,
 'Fresno': 11,
 'Glenn': 12,
 'Humboldt': 13,
 'Imperial': 14,
 'Inyo': 15,
 'Kern': 16,
 'Kings': 17,
 'Lake': 18,
 'Lassen': 19,
 'Los Angeles': 20,
 'Madera': 21,
 'Marin': 22,
 'Mariposa': 23,
 'Mendocino': 24,
 'Merced': 25,
 'Modoc': 26,
 'Mono': 27,
 'Monterey': 28,
 'Napa': 29,
 'Nevada': 30,
 'Orange': 31,
 'Placer': 32,
 'Plumas': 33,
 'Riverside': 34,
 'Sacramento': 35,
 'San Benito': 36,
 'San Bernardino': 37,
 'San Diego': 38,
 'San Francisco': 39,
 'San Joaquin': 40,
 'San Luis Obispo': 41,
 'San Mateo': 42,
 'Santa Barbara': 43,
 'Santa Clara': 44,
 'Santa Cruz': 45,
 'Shasta': 46,
 'Sierra': 47,
 'Siskiyou': 48,
 'Solano': 49,
 'Sonoma': 50,
 'Stanislaus': 51,
 'Sutter': 52,
 'Tehama': 53,
 'Trinity': 54,
 'Tulare': 55,
 'Tuolumne': 56,
 'Ventura': 57,
 'Yolo': 58,
 'Yuba': 59}


### If we want city-level data, use the Single-year Countywide Origin Detail page instead of the Multi-year Countywide Origin Summary, because the multi-year summary does not sum each city's values across the quarters

how to iterate through each county? 

- counties don't map to a URL the way they do on the waste characterization page
- use the `nth-child` selector instead, one index per county: the indices start at 2 and there are 58 counties, so they run from 2 to 59
```
document.querySelector("#CountyID > option:nth-child(59)")
<option value="58">Yuba</option>
```

how to iterate through each year?

- same idea: 
```
document.querySelector("#Year > option:nth-child(27)")
<option>1995</option>
```

- Create a map from year to CSS idx and county to CSS idx so you can update the function definition to be get_waste_origin_data(county, year)

In [2]:
# for county_idx in range(2, 60):
#     print(county_idx)

In [2]:
def convert_to_csv(path):
    """Write a csv copy of the Excel workbook at `path`.

    The csv is placed in the same directory as the workbook and keeps its
    base name, with the extension swapped for ".csv". The DataFrame index is
    not written out.
    """
    folder, workbook_name = os.path.split(path)
    stem, _ = os.path.splitext(workbook_name)

    pd.read_excel(path).to_csv(os.path.join(folder, stem + ".csv"), index=False)

In [3]:
def get_waste_origin_data(idx, download_path, timeout=300):
    """Download the origin waste data for one county and save it as xlsx and csv.

    Opens the CalRecycle Countywide Summary page in Chrome (using the
    module-level `options`), selects the county at `#CountyID > option:nth-child(idx)`,
    clicks the search button and waits for Chrome to drop
    "CountywideSummary.xlsx" into PATH_TO_DOWNLOADS. The file is then moved to
    "<download_path>/<county>_waste_production.xlsx" and a csv copy is written
    next to it via `convert_to_csv`.

    Parameters
    ----------
    idx : int
        nth-child index of the county's <option> inside #CountyID.
    download_path : str
        Directory that receives the renamed xlsx file and its csv copy.
    timeout : int, optional
        Maximum number of seconds to wait for the download to appear.

    Raises
    ------
    FileExistsError
        If a file with the default download name is already present; it would
        otherwise be mistaken for this county's download.
    TimeoutError
        If the download has not appeared after `timeout` seconds.
    """
    DATA_URL = "https://www2.calrecycle.ca.gov/LGCentral/DisposalReporting/Origin/CountywideSummary"
    default_name_download = os.path.join(PATH_TO_DOWNLOADS, "CountywideSummary.xlsx")

    # A leftover file would end the wait loop immediately and be saved under
    # the wrong county name, so refuse to start instead of mislabelling data.
    if os.path.exists(default_name_download):
        raise FileExistsError(
            "{} already exists; remove it before downloading".format(default_name_download)
        )

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(DATA_URL)

        county_selector = '#CountyID > option:nth-child({})'.format(idx)
        county_element = driver.find_element(By.CSS_SELECTOR, county_selector)
        # Read the name before clicking anything else on the page.
        county = county_element.text
        county_element.click()

        driver.find_element(By.CSS_SELECTOR, '#SearchButton').click()
        print("Downloading data for county: {}".format(county))

        # Poll once per second until the file shows up, but never forever.
        seconds_waited = 0
        while not os.path.exists(default_name_download):
            if seconds_waited >= timeout:
                raise TimeoutError(
                    "Download for county {} did not finish within {} seconds".format(county, timeout)
                )
            time.sleep(1)
            seconds_waited += 1
        print("Download took {} seconds...".format(seconds_waited))

        updated_file_name = os.path.join(download_path, county + "_waste_production.xlsx")
        # os.replace moves the file away from the default name, so no separate
        # clean-up of the default name is needed afterwards.
        os.replace(default_name_download, updated_file_name)

        convert_to_csv(updated_file_name)
    finally:
        # Always shut the browser down, even when a step above fails.
        driver.quit()

In [4]:
# Smoke test with the first county option (nth-child index 2).
get_waste_origin_data(idx=2, download_path=PATH_TO_DOWNLOADS)

Downloading data for county: Alameda
Download took 1 seconds...


### get it working for one county

In [11]:
# Prototype of the download flow for a single, hard-coded county option.
DATA_URL = "https://www2.calrecycle.ca.gov/LGCentral/DisposalReporting/Origin/CountywideSummary"
DOWNLOAD_TIMEOUT_SECONDS = 300  # upper bound on the wait loop below

# Default name Chrome gives the exported workbook inside the download folder.
most_recent_waste_data = PATH_TO_DOWNLOADS + "/CountywideSummary.xlsx"

driver = webdriver.Chrome(options=options)
try:
    driver.get(DATA_URL)

    county_element = driver.find_element(By.CSS_SELECTOR, '#CountyID > option:nth-child(5)')
    # Capture the name now; the element may no longer be valid after the search click.
    county = county_element.text
    county_element.click()

    driver.find_element(By.CSS_SELECTOR, '#SearchButton').click() # .send_keys("\n")

    print("Downloading data for county: {}".format(county))

    # Poll once per second until the file appears, giving up after the timeout.
    seconds_waited = 0
    while not os.path.exists(most_recent_waste_data):
        if seconds_waited >= DOWNLOAD_TIMEOUT_SECONDS:
            raise TimeoutError(
                "Download for county {} did not finish within {} seconds".format(
                    county, DOWNLOAD_TIMEOUT_SECONDS
                )
            )
        time.sleep(1)
        seconds_waited += 1

    print("Download took {} seconds...".format(seconds_waited))
finally:
    # Close the browser even if a lookup, click or the wait fails.
    driver.quit()

Downloading data for county: Butte
Download took 1 seconds...


In [10]:
# Build the path from PATH_TO_DOWNLOADS rather than repeating the absolute
# directory, so the notebook keeps working when the download folder changes.
convert_to_csv(os.path.join(PATH_TO_DOWNLOADS, "Calaveras_waste_production.xlsx"))