In [20]:
import os
import pandas as pd

import logging
import time
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Directory Chrome saves exported workbooks into. The original author's path is
# kept as the default; set CALRECYCLE_DOWNLOAD_DIR to run on another machine
# without editing the notebook.
PATH_TO_DOWNLOADS = os.environ.get(
    "CALRECYCLE_DOWNLOAD_DIR",
    "/Users/mdong/dataScience/projects-ml/waste-management/calrecycle-data",
)
options = Options()
# Point Chrome's download location at PATH_TO_DOWNLOADS.
prefs = {'download.default_directory' : PATH_TO_DOWNLOADS}
options.add_experimental_option('prefs', prefs)
# The `options.headless = True` property was deprecated in Selenium 4.8 and
# later removed; passing the flag as an argument works on old and new releases.
options.add_argument("--headless")

In [2]:
def generate_URLs(county_codes=range(1, 59), city_codes=range(59, 507)):
    """Build the report URL for every (county code, city code) pair.

    County codes fill the ``cy`` query parameter and city codes fill ``lg``.
    URLs are returned as a list ordered county-major: every city code for the
    first county, then every city code for the next county, and so on.
    """
    url_template = "https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams?cy={}&lg={}&mt=0&bg=0&mtf=0"
    return [
        url_template.format(county_code, city_code)
        for county_code in county_codes
        for city_code in city_codes
    ]

In [3]:
def get_data(URL, download_path, download_timeout=60):
    """Export the Excel report for one county/city URL and convert it to CSV.

    Loads ``URL`` in the module-level Selenium ``driver``, triggers the page's
    ``#ExportToExcel`` control, waits for ``ResidentialStreamsExport.xlsx`` to
    appear in ``download_path``, renames it to ``<county>_<city>.xlsx`` and
    then calls ``convert_to_csv`` on the renamed file.

    Parameters
    ----------
    URL : str
        Report page to load.
    download_path : str
        Directory the browser downloads into; the renamed workbook and the
        CSV are written here as well.
    download_timeout : int or float, optional
        Maximum number of seconds to wait for the exported workbook to show
        up in ``download_path`` (default 60).

    Raises
    ------
    TimeoutException
        If the city tag is not present within 1.5 seconds of loading the
        page, or if the exported workbook does not appear within
        ``download_timeout`` seconds.
    """
    driver.get(URL)
    wait = WebDriverWait(driver, 1.5)
    city_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#LocalGovernmentIDList_taglist > li > span:nth-child(1)')))
    # find_element(By..., ...) replaces find_element_by_css_selector, which
    # was removed from Selenium 4.
    county_element = driver.find_element(By.CSS_SELECTOR, '#CountyID > option:nth-child(2)')

    # Capture the labels before triggering the export so later page changes
    # cannot invalidate the element handles.
    county_name = county_element.text
    city_name = city_element.text

    most_recent_waste_data = os.path.join(download_path, "ResidentialStreamsExport.xlsx")
    # A workbook left over from an interrupted run would otherwise be picked
    # up immediately and renamed to this county/city by mistake.
    if os.path.isfile(most_recent_waste_data):
        os.remove(most_recent_waste_data)

    driver.find_element(By.CSS_SELECTOR, '#ExportToExcel').send_keys("\n")
    print("Downloading data for county: {}, city: {}".format(county_name, city_name))

    seconds_waited = 0
    while not os.path.exists(most_recent_waste_data):
        # Bound the wait: without this a failed export blocks the whole scrape.
        if seconds_waited >= download_timeout:
            raise TimeoutException(
                "Export for county: {}, city: {} did not finish within {} seconds".format(
                    county_name, city_name, download_timeout
                )
            )
        print("Download taking {} seconds...".format(seconds_waited))
        time.sleep(1)
        seconds_waited += 1

    new_file_name = os.path.join(download_path, county_name + "_" + city_name + ".xlsx")
    os.rename(most_recent_waste_data, new_file_name)

    convert_to_csv(new_file_name)

In [4]:
def convert_to_csv(path):
    """Save one sheet of the Excel workbook at ``path`` as a CSV file.

    The 'Data-Residential Composition' sheet is read and written, without the
    index column, to the workbook's own directory under the same base name
    with a ``.csv`` extension.
    """
    parent_dir, workbook_name = os.path.split(path)
    stem, _extension = os.path.splitext(workbook_name)
    csv_target = os.path.join(parent_dir, stem + ".csv")

    composition = pd.read_excel(path, sheet_name='Data-Residential Composition')
    composition.to_csv(csv_target, index=False)

Need to instantiate a new driver for every URL; otherwise you get the following error:

> `HTTPConnectionPool(host='127.0.0.1', port=55581): Max retries exceeded with url: /session/773aebc403b4ea773048eac8bc0eff19/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10885b3d0>: Failed to establish a new connection: [Errno 61] Connection refused'))`

Tried creating the driver inside the `get_data` function, but if an exception is raised there the driver is never closed and you get the same error.

In [11]:
if __name__ == "__main__":
#     logging.basicConfig(filename="scraping_progress.log",
#                             filemode='a',
#                             format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
#                             level=logging.INFO)

    # Small smoke-test grid: 2 county codes x 2 city codes = 4 URLs.
    test_county_codes = range(1, 3)
    test_city_codes = range(59, 61)
    possible_URLs = generate_URLs(test_county_codes, test_city_codes)
    for url in tqdm(possible_URLs):
        # A fresh browser is started for every URL. get_data() reads this
        # module-level `driver`, so the name must be assigned here.
        driver = webdriver.Chrome(options=options)
        try:
            get_data(url, PATH_TO_DOWNLOADS)
        except TimeoutException:
            print("No data for this URL: ", url)
        finally:
            # Quit exactly once per browser, whether the export succeeded,
            # timed out, or raised something else, so no Chrome process leaks.
            driver.quit()

 25%|██▌       | 1/4 [00:05<00:15,  5.05s/it]

No data for this URL:  https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams?cy=1&lg=59&mt=0&bg=0&mtf=0


 50%|█████     | 2/4 [00:09<00:09,  4.96s/it]

No data for this URL:  https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams?cy=1&lg=60&mt=0&bg=0&mtf=0


 75%|███████▌  | 3/4 [00:14<00:04,  4.91s/it]

No data for this URL:  https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams?cy=2&lg=59&mt=0&bg=0&mtf=0


100%|██████████| 4/4 [00:19<00:00,  4.83s/it]

No data for this URL:  https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams?cy=2&lg=60&mt=0&bg=0&mtf=0





---

## Complexity analysis

> `caffeinate python3 scrape-calrecycle.py`

### Runtime

- counties 1-58 and cities from 59 - 506
- 25984 possibilities
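- 25984 * 1.5 s = 38,976 s ≈ 10.8 hrs. This is only a lower bound, because 1.5 s is just the element-wait timeout in `get_data`; tqdm measures ~5.3 s per URL, which works out to ~37 hrs.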

```
Downloading data for county: Alameda, city: Newark
  1%|▌                                                      | 284/25984 [27:44<38:12:23,  5.35s/it]Downloading data for county: Alameda, city: Piedmont
  1%|▌                                                      | 291/25984 [28:20<37:36:16,  5.27s/it]Downloading data for county: Alameda, city: Pleasanton
  1%|▋                                                      | 343/25984 [33:20<37:58:30,  5.33s/it]Downloading data for county: Alameda, city: San Leandro
  1%|▋                                                      | 353/25984 [34:12<37:54:20,  5.32s/it]
```

### Space


- Each Excel file ≈ 17 kB and each CSV file ≈ 8 kB, so ≈ 25 kB per city
- Total cities in CA = 482 according to 2010 census
- 500 cities * 25 kB = 12.5 MB

In [13]:
# Number of candidate URLs: 58 county codes (1-58) times the inclusive count of city codes 59-506.
58 * (506 - 59 + 1)

25984

In [14]:
# Runtime estimate in hours: 25984 URLs at 1.5 s each (the WebDriverWait timeout used in get_data), divided by 3600 s/hr.
25984 * 1.5 / 3600

10.826666666666666

In [19]:
# Disk estimate in MB: ~500 cities at ~25 kB each (Excel + CSV), divided by 1e3 to convert kB to MB.
500 * 25 / 1e3

12.5

In [14]:
# get_data("https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams?cy=1&lg=96&mt=0&bg=0&mtf=0", PATH_TO_DOWNLOADS)

In [7]:
# driver = webdriver.Chrome()

# driver.get('https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams?cy=1&lg=96&mt=0&bg=0&mtf=0')
# driver.implicitly_wait(5)

# grand_totals = driver.find_element_by_xpath('//*[@id="ResidentialStreamsGrid"]/table/tfoot/tr/td[9]')
# grand_totals.text

---

### Attempt to use the `requests` library

It looks like `requests` can't see elements that are generated dynamically by JavaScript (the grand-totals cells below come back empty), so use a headless browser instead.

- https://stackoverflow.com/questions/45448994/wait-page-to-load-before-getting-data-with-requests-get-in-python-3


In [None]:
import requests
from bs4 import BeautifulSoup

In [10]:
# Report endpoint plus the query string for a single county/city page.
# `cy` carries the county code and `lg` the city code, matching generate_URLs.
url_endpoint = "https://www2.calrecycle.ca.gov/WasteCharacterization/ResidentialStreams"
query_params = dict(cy=1, lg=96, mt=0, bg=0, mtf=0)

In [13]:
# Fetch the raw (pre-JavaScript) HTML of the report page.
# requests has no default timeout, so without one a stalled server would hang
# this cell forever; 30 seconds is an arbitrary but generous upper bound.
resp = requests.get(url_endpoint, params=query_params, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page below.
resp.raise_for_status()

In [16]:
# Parse the response body with the standard-library HTML parser and print the
# indented markup for inspection.
soup = BeautifulSoup(markup=resp.content, features='html.parser')
print(soup.prettify())

<!DOCTYPE doctype html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en"> <![endif]-->
<!--[if IE 9]>    <html class="no-js ie9 oldie" lang="en"> <![endif]-->
<!--[if (gt IE 9)]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="#648D32" name="theme-color">
   <title>
    Residential Waste Stream by Material Type
   </title>
   <!-- Google Fonts -->
   <link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,700" rel="stylesheet" type="text/css"/>
   <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
   <meta content="1.04.0314.03" name="CR.R3.version"/>
   <meta content="RCWEB04" name="CR.server.name"/>
   <meta content="WasteCharacteriz

In [19]:
# Select the footer row(s) of the results grid; `select` returns a list of
# matching tags, which is displayed as the cell output.
grand_totals_row = soup.select(
    "#ResidentialStreamsGrid > table > tfoot > tr"
)
grand_totals_row

[<tr class="k-footer-template"><td class="k-group-cell"> </td><td style="display:none"> </td><td style="text-align:right;font-style: italic;"> </td><td style="text-align:right;font-style: italic;"> </td><td style="text-align:right;font-style: italic;"> </td><td style="text-align:right;font-style: italic;"> </td><td style="text-align:right;font-style: italic;"> </td><td style="text-align:right;font-style: italic;"> </td><td style="text-align:right;font-style: italic;"> </td></tr>]