In [134]:
import os
import pandas as pd
import pdfplumber
import re

In [135]:
def extract_votes_from_row(data_row,header):
    votes_ls = re.search(r'[0-9]+\s[0-9]+\s[0-9]+\s[0-9]+',data_row).group(0).split(' ')
    votes = {}
    for i in range(0,len(header)):
        votes[header[i]] = [votes_ls[i]]
    df = pd.DataFrame(votes)
    df['candidate'] = re.search(r'[^0-9]*',data_row).group(0).strip()
    return df

def extract_votes(data_rows,header):
    return pd.concat([extract_votes_from_row(data_row,header) for data_row in data_rows])

def extract_data_rows(table_rows):
    data_rows = []
    for row in table_rows:
        if not re.search(r'[0-9]+\s[0-9]+\s[0-9]+\s[0-9]+',row) is None:
            data_rows.append(row)

    return data_rows

def extract_box_data(page,bbox):
    data_section = page.crop(bbox)
    data_text = data_section.extract_text()
    table_rows = data_text.split('\n')
    race_title = table_rows[0]

    table_header = [x.replace('\n',' ') for x in list(filter(lambda x: len(x) > 0,data_section.extract_table()[0]))]
    
    data_rows = extract_data_rows(table_rows)

    df = extract_votes(data_rows,table_header)

    if not re.search(r'^[A-Z]{,3}',race_title) is None:
        df['party'] = re.search(r'^[A-Z]{,3}',race_title).group(0)
    if not re.search(r'[0-9]+.*',race_title) is None:
        df['district'] = re.search(r'[0-9]+.*',race_title).group(0)
    if not re.search(r'\s\D+',race_title) is None:
        df['office'] = re.search(r'\s\D+',race_title).group(0).strip()

    return df

def extract_precinct_name(page,strip_start=80,strip_height=25):
    return page.crop((0,strip_start,page.width,strip_start+strip_height)).extract_text()

def extract_page_data(page):
    vote_headers = page.search('Vote For')
    all_data = []
    i = 0
    while i < len(vote_headers):
        if i < len(vote_headers) - 1:
            pair = vote_headers[i:i+2]
            bbox = (0,pair[0]['bottom']-30,page.width,pair[1]['top']-20)
        else:
            bbox = (0,vote_headers[i]['bottom']-30,page.width,page.height)
        temp = extract_box_data(page,bbox)
        all_data.append(temp)
        i += 1

    df = pd.concat(all_data)
    df['precinct'] = extract_precinct_name(page)
    return df

def extract_statistics(page):
    if len(page.search('STATISTICS')) > 0:
        bbox = (0,page.search('STATISTICS')[0]['bottom'],page.width,page.search('STATISTICS')[0]['bottom'] + 150)
        stats_text = page.crop(bbox).extract_text()

        stats = pd.DataFrame({
            'Registered Voters': [re.search(r'Registered Voters - Total ([0-9]*)',stats_text).group(1)],
            'Ballots Cast': [re.search(r'Ballots Cast - Total ([0-9]*)',stats_text).group(1)]
        })

        stats = stats.melt().rename(columns={
            'variable':'office',
            'value':'votes'
        })

        stats['precinct'] = extract_precinct_name(page)

        return stats
    else:
        return None

In [136]:
files = ['data_2024/primary/' + file for file in os.listdir('data_2024/primary')]

In [137]:
file = 'data_2024/primary/Adams PA Primary 2024.pdf'
pdf = pdfplumber.open(file)
county_name = 'Adams'

In [138]:
df = pd.concat([extract_page_data(page) for page in pdf.pages])

In [139]:
df['county'] = county_name

In [140]:
df = df.rename(columns={
    'TOTAL':'votes',
    'Election Day':'election_day',
    'Provisional Votes':'provisional',
    'Mail Votes':'absentee'
})
df = df[df['candidate'].apply(lambda x: x not in ['Total Votes Cast','Overvotes','Undervotes','Contest Totals'])]

In [141]:
stats_df = pd.concat([extract_statistics(page) for page in pdf.pages if len(page.search('STATISTICS')) > 0])
stats_df['county'] = county_name

In [142]:
df = pd.concat([df,stats_df])
df['district'] = df['district'].str.extract('(\d+)')
df = df.fillna('')
df = df.reset_index()
df = df[['county','precinct','office','district','party','candidate','votes','election_day','provisional','absentee']]

In [143]:
# data cleaning
df['candidate'] = df['candidate'].str.title()
df['office'] = df['office'].replace('President of the United States','President')
df['office'] = df['office'].replace('United States Senator','U.S. Senate')
df['office'] = df['office'].replace('Representative in Congress','U.S. House')
df['office'] = df['office'].replace('Senator in the General Assembly','State Senate')
df['office'] = df['office'].replace('Representative in the General Assembly','General Assembly')

In [145]:
df = df.drop_duplicates()

Unnamed: 0,county,precinct,office,district,party,candidate,votes,election_day,provisional,absentee
0,Adams,Abbottstown,President,,DEM,Joseph R Biden Jr,25,10,0,15
1,Adams,Abbottstown,President,,DEM,Dean Phillips,3,3,0,0
2,Adams,Abbottstown,President,,DEM,Write-In Totals,1,0,0,1
3,Adams,Abbottstown,U.S. Senate,,DEM,Robert P Casey Jr,28,12,0,16
4,Adams,Abbottstown,U.S. Senate,,DEM,Write-In Totals,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1372,Adams,Tyrone,Ballots Cast,,,,297,,,
1373,Adams,Union,Registered Voters,,,,1,,,
1374,Adams,Union,Ballots Cast,,,,534,,,
1375,Adams,York Springs,Registered Voters,,,,230,,,


In [146]:
df.to_csv(f'data_cleaned/20240423__pa__primary__{county_name.lower()}__precinct.csv',index=False)