In [36]:
import os
import pdfplumber
import pandas as pd
import re
import numpy as np

In [37]:
def get_tables_from_page(page):
    """Return every table on ``page`` stacked into a single DataFrame.

    Each table returned by ``page.extract_tables()`` becomes a DataFrame, and
    rows containing any missing cell are dropped before stacking.

    Parameters
    ----------
    page : object exposing ``extract_tables()`` that returns a list of tables,
        each a list of rows.

    Returns
    -------
    pandas.DataFrame
        The concatenated tables, or an empty DataFrame when the page has none
        (``pd.concat`` raises ``ValueError`` on an empty list).
    """
    frames = [pd.DataFrame(table).dropna() for table in page.extract_tables()]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

def precinct_filler(val):
    """Forward-fill helper: return ``val``, or the last non-blank value seen.

    The most recent non-blank value is remembered in the module-level
    ``curr_name``, so the memory persists between calls (and between tables).
    A blank ``val`` before any non-blank one raises ``NameError`` because
    ``curr_name`` has not been assigned yet.
    """
    global curr_name
    if val == '':
        # Blank label: reuse whatever name was remembered last.
        return curr_name
    curr_name = val
    return curr_name

def clean_table(table):
    """Reshape one extracted table into one row per (candidate, precinct).

    ``table`` is a list of rows whose first row holds the column headers and
    whose ``'Precinct'`` column mixes precinct names with vote-type labels
    ('Election Day', 'Mail-In', 'Provisional', 'Total'). Each vote-type row is
    attributed to the closest preceding non-vote-type label via
    ``precinct_filler``, which keeps its state in the module-level
    ``curr_name``.

    Returns a DataFrame with ``candidate`` and ``precinct`` columns plus one
    column per vote type present (``election_day``, ``absentee``,
    ``provisional``, ``votes``). If the table contains a 'Registered Voters'
    column, only ``candidate``, ``precinct`` and ``votes`` are returned.
    """
    vote_types = ['Election Day','Mail-In','Provisional','Total']
    output_names = {'variable':'candidate',
                    'Precinct':'precinct',
                    'Election Day':'election_day',
                    'Mail-In':'absentee',
                    'Provisional':'provisional',
                    'Total':'votes'}

    # Missing cells become '', then the first row is promoted to column labels.
    grid = pd.DataFrame(table).fillna('')
    grid = grid.rename(columns=grid.iloc[0])

    # Long form: one row per (first-column label, header, cell value).
    long = grid.melt('Precinct')
    # Header text is reversed character by character and line breaks become
    # spaces. NOTE(review): presumably the PDF's rotated headers are extracted
    # backwards — confirm against the source document.
    long['variable'] = long['variable'].apply(lambda header: header[::-1]).str.replace('\n',' ')

    # Discard the 'Total Votes' column, the promoted header row, and any row
    # whose label mentions 'County'.
    long = long[long['variable'] != 'Total Votes']
    long = long[long['Precinct'] != 'Precinct']
    long = long[long['Precinct'].apply(lambda label: not ('County' in label))]

    # The first column really holds vote-type labels interleaved with precinct
    # names; split them apart and forward-fill the precinct name.
    long = long.rename(columns={'Precinct':'Vote_Type'})
    long['Precinct'] = long['Vote_Type'].apply(lambda label: '' if label in vote_types else label)
    long['Precinct'] = long['Precinct'].apply(precinct_filler)
    long = long[long['Vote_Type'].apply(lambda label: label in vote_types)]

    # One column per vote type.
    # NOTE(review): if the cell values are strings (as fillna('') suggests),
    # 'sum' concatenates them whenever a (header, precinct, vote type) key
    # repeats — confirm that keys are unique within a table.
    wide = long.pivot_table(values='value',index=['variable','Precinct'],columns='Vote_Type',aggfunc='sum')
    wide.columns.name = None
    wide = wide.reset_index()

    tidy = wide.rename(columns=output_names)
    tidy = tidy[tidy['candidate'] != '']

    tidy['candidate'] = tidy['candidate'].replace('Voters Registered','Registered Voters')
    if 'Registered Voters' in tidy['candidate'].tolist():
        tidy = tidy[['candidate','precinct','votes']]

    return tidy

def extract_race_title(page):
    """Return the first line of the page text that contains 'Vote', or None.

    Parameters
    ----------
    page : object exposing ``extract_text()``.

    Returns
    -------
    str or None
        The matched line including its trailing newline, or ``None`` when no
        line containing 'Vote' and ending in a newline is found.
    """
    # Guard against extract_text() yielding None (e.g. a page with no text
    # layer), which would make re.search raise TypeError.
    page_text = page.extract_text() or ''
    # Search once and reuse the match instead of running the regex twice.
    match = re.search(r'.*Vote.*\n', page_text)
    if match is None:
        return None
    return match.group(0)

In [38]:
# County label written to the `county` column and used in the output file name.
county_name = 'Carbon'

# Source PDF, relative to the notebook's working directory. The handle stays
# open because the parsing cell below iterates over `pdf.pages`.
file = '../data_2024/primary/Carbon PA StatementOfVotesCastRPT.pdf'
pdf = pdfplumber.open(file)

In [39]:
# Zero-based index of the first page to parse; earlier pages are skipped.
# NOTE(review): presumably those pages hold no precinct tables — confirm
# against the PDF.
FIRST_RESULTS_PAGE = 9

all_data = []
race_title = None
for page in pdf.pages[FIRST_RESULTS_PAGE:]:
    # A page without its own 'Vote' header line inherits the most recent title.
    curr_title = extract_race_title(page)
    if curr_title is not None:
        race_title = curr_title

    page_frames = [clean_table(table) for table in page.extract_tables()]
    if not page_frames:
        # pd.concat raises ValueError on an empty list; a page without tables
        # contributes no rows but may still have updated the race title above.
        continue

    temp = pd.concat(page_frames)
    temp['Race'] = race_title
    all_data.append(temp)
df = pd.concat(all_data)

In [40]:
def _first_match(pattern, text, group=0, default=''):
    """Return `group` of the first match of `pattern` in `text`, else `default`."""
    found = re.search(pattern, text)
    if found is None:
        return default
    return found.group(group)


def _office_without_district(office):
    """Return the stripped text captured by '(.*) -', or `office` unchanged if no match."""
    head = _first_match(r'(.*) -', office, group=1, default=None)
    return office if head is None else head.strip()


# Party: first run of three consecutive capital letters in the race title.
df['party'] = df['Race'].apply(lambda title: _first_match(r'[A-Z]{3}', title))
# Office (still carrying any district suffix): everything before the first '('.
df['office'] = df['Race'].apply(lambda title: _first_match(r'^[^\(]+', title).strip())

# District: the first run of digits in whatever follows '- ' in the office text.
df['district'] = df['office'].apply(lambda office: _first_match(r'- (.*)', office, group=1).strip())
df['district'] = df['district'].apply(lambda tail: _first_match(r'[0-9]+', tail).strip())

# With the district extracted, keep only the part of the office before ' -'.
df['office'] = df['office'].apply(_office_without_district)

df['precinct'] = df['precinct'].str.replace('\n','')
df['county'] = county_name

In [41]:
# Exact-match translation from the office titles found in the PDF to the
# shorter labels written to the output file.
OFFICE_LABELS = {
    'President of the United States': 'President',
    'United States Senator': 'U.S. Senate',
    'Representative in Congress': 'U.S. House',
    'Senator in General Assembly': 'State Senate',
    "Representative in the General Assembly": 'General Assembly',
    "Voters Registered": 'Registered Voters',
}
df['office'] = df['office'].replace(OFFICE_LABELS)

In [42]:
# Tabulation rows that are not candidates.
NON_CANDIDATE_ROWS = ['Overvotes','Times Cast','Undervotes']

# isin() always yields a boolean mask (an .apply-built mask on an empty frame is
# object-dtyped and would be treated as a column selection), and .copy() makes
# the filtered frame independent so the assignment below does not trigger
# SettingWithCopyWarning.
df = df[~df['candidate'].isin(NON_CANDIDATE_ROWS)].copy()
# Strip parenthesised suffixes, trim, and collapse double spaces.
df['candidate'] = df['candidate'].apply(lambda x: re.sub(r'\(.*\)','',x).strip().replace('  ',' '))
# drop=True: the old per-page row labels are not wanted as a stray 'index' column.
df = df.reset_index(drop=True)

In [43]:
# Columns written to the output file, in this order; everything else is dropped.
OUTPUT_COLUMNS = [
    'county', 'precinct', 'office', 'district', 'party',
    'candidate', 'votes', 'election_day', 'provisional', 'absentee',
]
df = df[OUTPUT_COLUMNS]

In [44]:
# Missing cells become empty strings (rows from tables that returned only
# candidate/precinct/votes have no per-vote-type values).
df = df.fillna(value='')

In [45]:
# Output file name embeds the lower-cased county name; the row index is not written.
output_path = f'../data_cleaned/20240423__pa__primary__{county_name.lower()}__precinct.csv'
df.to_csv(output_path, index=False)