# NYC sources download

By Ben Welsh

Download all the CSV and PDF files posted by the New York City Board of Elections on its [download page](https://www.vote.nyc/page/election-results-summary)

## Import

Pull in the Python tools we'll use

In [1]:
import urllib.parse
from pathlib import Path

import requests
import pandas as pd
from bs4 import BeautifulSoup

## Download manifest

Download the election results page

In [2]:
# Results landing page; its tables link to every CSV and PDF we want
url = "https://www.vote.nyc/page/election-results-summary"

In [3]:
# Bound the wait so a stalled connection can't hang the notebook forever
r = requests.get(url, timeout=180)

In [4]:
# Stop on any 4xx/5xx answer. Unlike an assert, this check still runs
# when Python is started with -O, and the error names the status code.
r.raise_for_status()

Parse out the tables with data

In [5]:
# NOTE(review): no parser is named, so BeautifulSoup falls back to whichever
# one is installed; consider pinning one so the parse is reproducible.
soup = BeautifulSoup(r.text)

In [6]:
# Walk down from the article to the container that holds the election sections
content = (
    soup.find("article")
    .find("div", class_="content")
    .find("div", class_="field--type-field-collection")
)

In [7]:
# recursive=False keeps only the container's direct children, so nested
# "field__item" divs deeper in the tree are not picked up
election_list = content.find_all("div", class_="field__item", recursive=False)

Pull the data out of each table

In [8]:
def parse_election(s):
    """Parse one election section of the results page into contest records.

    Args:
        s: A BeautifulSoup element for one election section. It must hold an
            ``h2`` heading and, unless the heading is "Archive", a ``table``
            with class ``simple-table`` whose first row is a header.

    Returns:
        A list of dicts, one per data row. Each has ``election`` and
        ``contest`` keys plus whichever of ``recap_csv_url``, ``ed_csv_url``,
        ``manual_pdf_url``, ``recap_pdf_url`` and ``ed_pdf_url`` were linked
        in that row. The "Archive" section yields an empty list.
    """
    base_url = "https://www.vote.nyc"

    # Grab the title
    title = s.find("h2").text

    # If this is the archive, skip it
    if title == "Archive":
        print("Skipping archive")
        return []

    # Parse the election title
    print(f"Parsing {title}")

    # Grab the results table
    table = s.find("table", class_="simple-table")

    # Get all the rows
    row_list = table.find_all("tr")
    print(f"{len(row_list)} rows found")

    # Loop through them, skipping the header
    data_list = []
    for row in row_list[1:]:
        # Grab all the cells
        cell_list = row.find_all("td")

        # A row without data cells has no contest to record
        if not cell_list:
            continue

        # The first cell is the contest title, the rest may hold links
        contest_cell, *link_cells = cell_list

        # Start up a clean record
        d = dict(
            election=title,
            contest=contest_cell.text,
        )

        # Collect the link targets, tolerating cells that have no anchor
        # and anchors that have no href
        href_list = []
        for cell in link_cells:
            anchor = cell.a
            href = anchor.get("href") if anchor is not None else None
            if href:
                href_list.append(href)

        # Pull out the recap and ed level CSVs, if they exist. CSVs are
        # handled before PDFs so the record's key order stays the same.
        for href in href_list:
            lowered = href.lower()
            if not lowered.endswith(".csv"):
                continue
            full_url = base_url + urllib.parse.quote(href)
            if 'recap' in lowered:
                d['recap_csv_url'] = full_url
            if 'edlevel' in lowered:
                d['ed_csv_url'] = full_url

        # Do the same for PDFs
        for href in href_list:
            lowered = href.lower()
            if not lowered.endswith(".pdf"):
                continue
            full_url = base_url + urllib.parse.quote(href)
            if 'manual' in lowered:
                d['manual_pdf_url'] = full_url
            elif 'recap' in lowered:
                d['recap_pdf_url'] = full_url
            elif 'edlevel' in lowered:
                d['ed_pdf_url'] = full_url

        # Add record to list
        data_list.append(d)

    # Return the list
    return data_list

In [9]:
# Gather the contest records from every election section into one flat list
contest_list = []
for election in election_list:
    contest_list.extend(parse_election(election))

Parsing GENERAL - NOVEMBER 8, 2022
209 rows found
Parsing PRIMARY - AUGUST 23, 2022
49 rows found
Parsing PRIMARY - JUNE 28, 2022
391 rows found
Parsing SPECIAL- MAY 24, 2022
2 rows found
Parsing SPECIAL- MARCH 22, 2022
2 rows found
Parsing SPECIAL- FEBRUARY 15, 2022
3 rows found
Parsing SPECIAL- JANUARY 18, 2022
2 rows found
Skipping archive


Write out the result

In [10]:
# One row per contest; a link column a record lacks is left null in that row
df = pd.DataFrame(contest_list)

In [11]:
# Preview the first rows to eyeball the parsed columns
df.head()

Unnamed: 0,election,contest,recap_csv_url,ed_csv_url,recap_pdf_url,ed_pdf_url,manual_pdf_url
0,"GENERAL - NOVEMBER 8, 2022",Citywide Governor/Lieutenant Governor Citywide,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,
1,"GENERAL - NOVEMBER 8, 2022",Citywide State Comptroller Citywide,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,
2,"GENERAL - NOVEMBER 8, 2022",Citywide Attorney General Citywide,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,
3,"GENERAL - NOVEMBER 8, 2022",Citywide United States Senator Citywide,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,
4,"GENERAL - NOVEMBER 8, 2022",Crossover Representative in Congress 7th Congr...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,https://www.vote.nyc/sites/default/files/pdf/e...,


In [12]:
# Check how many rows actually carry each kind of link
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651 entries, 0 to 650
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   election        651 non-null    object
 1   contest         651 non-null    object
 2   recap_csv_url   464 non-null    object
 3   ed_csv_url      464 non-null    object
 4   recap_pdf_url   465 non-null    object
 5   ed_pdf_url      464 non-null    object
 6   manual_pdf_url  164 non-null    object
dtypes: object(7)
memory usage: 35.7+ KB


In [27]:
# Save the manifest without the numeric index column.
# NOTE(review): the filename says 2020 but the elections parsed above are
# all dated 2022 — confirm the name before anything downstream relies on it.
df.to_csv("nyc-2020-sources-manifest.csv", index=False)

## Download sources

Download all files

In [14]:
# Move the two text columns into the index so only URL columns remain as
# cells for the element-wise download pass further down
indexed_df = df.set_index(["election", "contest"])

In [15]:
# Files land in a "data" folder under the current working directory
download_dir = Path.cwd() / "data"

In [16]:
# Create the folder, and any missing parents, unless it is already there
download_dir.mkdir(exist_ok=True, parents=True)

In [33]:
def download_url(url: str, timeout: int = 180):
    """Download the provided URL into the download directory.

    The file is saved under ``download_dir``, named after the last segment of
    the URL's path.

    Args:
        url: The address to fetch. Empty and null values are ignored, so the
            function can be applied to cells that hold no link.
        timeout: Value handed to ``requests.get`` as its ``timeout``.

    Returns:
        None. Nothing is fetched when the target file already exists, and
        nothing is written when the server answers 404.

    Raises:
        requests.HTTPError: For any error status other than 404.
    """
    # Quit if there's no URL
    if not url or pd.isnull(url):
        return

    # Get the output path
    output_path = download_dir / urllib.parse.urlparse(url).path.split("/")[-1]

    # Quit if we've already got it
    if output_path.exists():
        return

    # Stream into a sibling temp file first. Writing straight to output_path
    # would let an interrupted transfer leave a truncated file behind, which
    # the exists() check above would then treat as finished on every rerun.
    partial_path = output_path.with_name(output_path.name + ".part")

    # Download it
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Quit if there's a 404. We expect a couple.
        if r.status_code == 404:
            return
        r.raise_for_status()

        # Write out the file
        print(f"Downloading {url}")
        try:
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        except BaseException:
            # Also covers a manual interrupt; don't leave the fragment around
            if partial_path.exists():
                partial_path.unlink()
            raise

    # Only a complete download is given the final name
    partial_path.replace(output_path)

In [35]:
# Call download_url on every URL cell; the resulting frame is thrown away.
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favor of
# DataFrame.map — switch once the minimum pandas version allows it.
dev_null = indexed_df.applymap(download_url)