In [2]:
from PyPDF2 import PdfReader as pdf
import numpy as np
import pandas as pd

In [3]:
metadata = {
 'year': 'Year of Election',
 'fips': 'FIPS Code',
 'elect_type': 'Election Type',
 'elect_name': 'Election Name',
 'reg_voters': 'Total Registered Voters, #',
 'ballots_cast': 'Total Ballots Cast, #',
 's_house_d': 'State House Democratic Votes, #',
 's_house_r': 'State House GOP Votes, #',
 's_house_o': 'State House Other Votes, #',
 's_senate_d': 'State Senate Democratic Votes, #',
 's_senate_r': 'State Senate GOP Votes, #',
 's_senate_o': 'State Senate Other Votes, #',
 'house_d': 'House Democratic Votes, #',
 'house_r': 'House GOP Votes, #',
 'house_o': 'House Other Votes, #',
 'senate_d': 'Senate Democratic Votes, #',
 'senate_r': 'Senate GOP Votes, #',
 'senate_o': 'Senate Other Votes, #',
 'gov_d': 'Governor Democratic Votes, #',
 'gov_r': 'Governor GOP Votes, #',
 'gov_o': 'Governor Other Votes, #',
 'sos_d': 'Secretary of State Democratic Votes, #',
 'sos_r': 'Secretary of State GOP Votes, #',
 'sos_o': 'Secretary of State Other Votes, #',
 'pres_d': 'President Democratic Votes, #',
 'pres_r': 'President GOP Votes, #',
 'pres_o': 'President Other Votes, #'
}

data = pd.DataFrame(columns=metadata.keys())
data.head()

Unnamed: 0,year,fips,elect_type,elect_name,county_name,reg_voters,ballots_cast,s_house_d,s_house_r,s_house_o,...,senate_o,gov_d,gov_r,gov_o,sos_d,sos_r,sos_o,pres_d,pres_r,pres_o


In [5]:
reader = pdf("2022_State.pdf")
page = reader.pages[1]
print(page.extract_text())

Beaverhead
Deer Lodge
Flathead
Gallatin
Glacier
Granite
Lake
Lincoln
Madison
Mineral
Missoula
Pondera
Powell
Ravalli
Sanders
Silver Bow
Total
UNITED STATES REPRESENTATIVE - 1ST DISTRICT
MONICA TRANEL
Democrat
JOHN LAMB
Libertarian
RYAN K ZINKE
Republican
1330
193
3004
2256
212
1510
16571
1864
29290
28747
1810
22288
1748
111
1213
533
100
1149
5388
541
7389
2200
446
6687
1483
258
3290
580
146
1458
35991
1703
18846
227
26
461
639
172
1762
7523
886
15158
1615
460
4549
8434
665
5048
115265
9593
123102
2022 STATEWIDE GENERAL ELECTION CANVASS
MONTANA SECRETARY OF STATE CHRISTI JACOBSEN
2 of 10



In [6]:
text_data = page.extract_text()

def process_election_data(text_data, number_of_candidates):
    try:
        # Splitting the text data at 'Total'
        sections = text_data.split("Total", 1)

        # Extract counties
        counties_list = sections[0].strip().split('\n')
        counties_list.append('Total')  # Add 'Total' back to the list

        # Process candidates and votes
        remaining_parts = sections[1].split('\n')
        candidates, votes = [], []
        for part in remaining_parts:
            if part.isdigit():
                votes.extend(part.split())
            elif part.strip() and not part.startswith("2022 STATEWIDE GENERAL ELECTION"):
                candidates.append(part)

        # Reshape the votes
        reshaped_votes = [votes[i:i + number_of_candidates] for i in range(0, len(votes), number_of_candidates)]

        # Combine candidate names and parties
        candidate_columns = []
        for i in range(0, len(candidates), 2):
            name = ' '.join(candidates[i:i+2])
            if len(candidate_columns) < number_of_candidates:
                candidate_columns.append(name)

        # Debugging: Print lengths of lists
        print("Length of counties_list:", len(counties_list))
        print("Length of reshaped_votes:", len(reshaped_votes))

        # Create the DataFrame
        df = pd.DataFrame(reshaped_votes, columns=candidate_columns)
        df.insert(0, 'County', counties_list)

        return df

    except Exception as e:
        print("An error occurred: ", str(e))
        # Optionally, print any relevant debugging information here
        print("Length of counties_list (inside except):", len(counties_list))
        # Other debugging information can be added here as needed


In [7]:
df = process_election_data(text_data,3)

df.head()

Length of counties_list: 17
Length of reshaped_votes: 17


Unnamed: 0,County,UNITED STATES REPRESENTATIVE - 1ST DISTRICT MONICA TRANEL,Democrat JOHN LAMB,Libertarian RYAN K ZINKE
0,Beaverhead,1330,193,3004
1,Deer Lodge,2256,212,1510
2,Flathead,16571,1864,29290
3,Gallatin,28747,1810,22288
4,Glacier,1748,111,1213


In [8]:
df.columns = ['county_name', 'house_d', 'house_o', 'house_r']
# Initialize data with all NaN values
data = pd.DataFrame(index=range(len(df)), columns=data.columns)

# Fill 'county_name', 'house_d', 'house_o', 'house_r' columns from df
data[['county_name', 'house_d', 'house_o', 'house_r']] = df[['county_name', 'house_d', 'house_o', 'house_r']]

print(data)

   year fips elect_type elect_name county_name reg_voters ballots_cast  \
0   NaN  NaN        NaN        NaN  Beaverhead        NaN          NaN   
1   NaN  NaN        NaN        NaN  Deer Lodge        NaN          NaN   
2   NaN  NaN        NaN        NaN    Flathead        NaN          NaN   
3   NaN  NaN        NaN        NaN    Gallatin        NaN          NaN   
4   NaN  NaN        NaN        NaN     Glacier        NaN          NaN   
5   NaN  NaN        NaN        NaN     Granite        NaN          NaN   
6   NaN  NaN        NaN        NaN        Lake        NaN          NaN   
7   NaN  NaN        NaN        NaN     Lincoln        NaN          NaN   
8   NaN  NaN        NaN        NaN     Madison        NaN          NaN   
9   NaN  NaN        NaN        NaN     Mineral        NaN          NaN   
10  NaN  NaN        NaN        NaN    Missoula        NaN          NaN   
11  NaN  NaN        NaN        NaN     Pondera        NaN          NaN   
12  NaN  NaN        NaN        NaN    

In [9]:
data.to_csv('exported_data.csv', index=False)