# Analyze past Allegheny County primaries

In [1]:
import pandas as pd
from PyPDF2 import PdfReader
import re

In [2]:
### Sum counts per precinct, derive turnout/share columns, and suffix every column with the year
def calculate_stats(df, year):
    """Aggregate Democratic counts per precinct and derive turnout and share columns.

    Rows of ``df`` that share a ``precinct`` value are summed. The result is
    indexed by ``precinct`` and every column name carries a ``_<year>`` suffix.

    Columns produced (before the suffix): ``d_reg``, ``d_cast``, ``turnout``
    (cast / registered), ``ballots_pct_of_total`` (precinct's share of all
    ballots cast) and ``reg_pct_of_total`` (precinct's share of all registrations).
    """
    per_precinct = df.groupby('precinct', as_index=False).agg(
        d_reg=('d_reg', 'sum'),
        d_cast=('d_cast', 'sum'),
    )

    registered = per_precinct['d_reg']
    cast = per_precinct['d_cast']

    per_precinct['turnout'] = cast / registered
    per_precinct['ballots_pct_of_total'] = cast / cast.sum()
    per_precinct['reg_pct_of_total'] = registered / registered.sum()

    # Index first so that only the value columns receive the year suffix.
    return per_precinct.set_index('precinct').add_suffix(f'_{year}')

### Strip district and ward markings from a precinct name, but keep the ward for Pittsburgh
def format_precinct_name(precinct):
    """Reduce a precinct label to its municipality (or Pittsburgh ward) name.

    ``WD``/``WRD`` abbreviations are spelled out as ``WARD`` and everything from
    `` DIST`` onward is dropped. Names starting with ``PITTSBURGH`` keep their
    ward; for every other name the text from `` WARD`` onward is dropped too.
    """
    spelled_out = precinct.replace(' WD', ' WARD').replace(' WRD', ' WARD')
    without_district = re.sub(r' DIST.*', '', spelled_out)

    if not without_district.startswith('PITTSBURGH'):
        return re.sub(r' WARD.*', '', without_district)
    return without_district

In [3]:
def _parse_pdf_precinct_stats(index, precincts, precinct, text, d_reg_regex, d_cast_regex):
    precinct_text_index = text.index(precinct)

    if index == len(precincts) - 1:
        next_precinct_text_index = len(text)
    else:
        next_precinct_text_index = text.index(precincts[index + 1])

    precinct_text = text[precinct_text_index:next_precinct_text_index]
    d_reg = re.findall(d_reg_regex, precinct_text)[0][0]
    d_cast = re.findall(d_cast_regex, precinct_text)[0][0]

    return { 'precinct': precinct, 'd_reg': d_reg, 'd_cast': d_cast }

# Patterns for the text extracted from the results PDFs. They are raw strings so that
# regex escapes such as \s, \d and \. reach the `re` module verbatim instead of being
# treated as (invalid) Python string escapes.

# Captures the precinct heading line that sits between "STATISTICS" and the column header.
REGEX_PRECINCT = r"STATISTICS\n(.*)\n                                                      VOTES  PERCENT"
# 2013 layout: group 1 is the count; group 2 is an optional decimal figure that follows it.
REGEX_D_CAST_13 = r"BALLOTS CAST - DEMOCRATIC .  .  .  .  .  .\s+(\d+)\s+(\d+\.\d+)?\n?\s+(?:VOTER TURNOUT - TOTAL|BALLOTS CAST - NONPARTISAN|\*+ \(Republican\) \*+)"
REGEX_D_REG_13 = r"REGISTERED VOTERS - DEMOCRATIC  .  .  .  .\s+(\d+)\s+(\d+\.\d+)?\n?\s+(?:BALLOTS CAST - TOTAL|REGISTERED VOTERS - NONPARTISAN)"
# 2015 layout: group 1 is the count; the Democratic line is followed by the Republican line.
REGEX_D_CAST_15 = r"BALLOTS CAST - DEMOCRATIC .  .  .  .  .  .\s+(\d+)\s+(.*)(\n)?\s+BALLOTS CAST - REPUBLICAN"
REGEX_D_REG_15 = r"REGISTERED VOTERS - DEMOCRATIC  .  .  .  .\s+(\d+)   (.*)(\n)?           REGISTERED VOTERS - REPUBLICAN"

def parse_pdf_results(filename):
    """Parse a primary-results PDF into per-precinct Democratic figures.

    The text of every page is concatenated, "Page N" lines are removed, the
    precinct headings are located with ``REGEX_PRECINCT`` and the registration
    and ballots-cast figures are pulled out of each precinct's section.

    Args:
        filename: Path of the PDF. When it contains ``'2015'`` the 2015
            patterns are used; otherwise the 2013 patterns are used.

    Returns:
        A DataFrame with columns ``precinct`` (leading numeric code stripped),
        ``d_reg`` and ``d_cast`` (both cast to int). The frame is empty, with
        those columns, when no precinct heading is found.
    """
    reader = PdfReader(filename)

    # Build the text in one join instead of repeated `+=`; every page is still
    # followed by a newline. `or ""` guards against a page yielding no text.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    text = re.sub(r"Page \d+\n", "", text)
    precincts = re.findall(REGEX_PRECINCT, text)

    is_2015 = '2015' in filename
    d_cast_regex = REGEX_D_CAST_15 if is_2015 else REGEX_D_CAST_13
    d_reg_regex = REGEX_D_REG_15 if is_2015 else REGEX_D_REG_13

    precincts_with_figures = [
        _parse_pdf_precinct_stats(index, precincts, precinct, text, d_reg_regex, d_cast_regex)
        for index, precinct in enumerate(precincts)
    ]

    # Naming the columns keeps the cleanup below working when no precinct was found.
    df = pd.DataFrame(precincts_with_figures, columns=['precinct', 'd_reg', 'd_cast'])
    # Headings start with a numeric code; keep only the name.
    df['precinct'] = df.precinct.str.replace(r'^\d+ ', '', regex=True)
    df['d_cast'] = df.d_cast.astype(int)
    df['d_reg'] = df.d_reg.astype(int)

    return df

### Import voting data

##### Import P13

In [4]:
# 2013 primary: parse the PDF, normalise the precinct names, then aggregate per precinct.
p13 = calculate_stats(
    parse_pdf_results('input/primary_2013.pdf').assign(
        precinct=lambda results: results['precinct'].apply(format_precinct_name)
    ),
    '13',
)
p13.head(1)

Unnamed: 0_level_0,d_reg_13,d_cast_13,turnout_13,ballots_pct_of_total_13,reg_pct_of_total_13
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALEPPO,506,60,0.118577,0.000488,0.000954


##### Import P15

In [5]:
# 2015 primary: parse the PDF, normalise the precinct names, then aggregate per precinct.
p15 = calculate_stats(
    parse_pdf_results('input/primary_2015.pdf').assign(
        precinct=lambda results: results['precinct'].apply(format_precinct_name)
    ),
    '15',
)
p15.head(1)

Unnamed: 0_level_0,d_reg_15,d_cast_15,turnout_15,ballots_pct_of_total_15,reg_pct_of_total_15
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALEPPO,472,74,0.15678,0.00065,0.00093


##### Import P17

In [6]:
# 2017 primary: keep the three needed CSV columns under the shared names,
# normalise the precinct names, then aggregate per precinct.
p17 = (
    pd.read_csv('input/primary_2017.csv')
    [['Precinct Name', 'Democratic Reg Voters', 'Democratic Ballots Cast']]
    .rename(columns={
        'Precinct Name': 'precinct',
        'Democratic Reg Voters': 'd_reg',
        'Democratic Ballots Cast': 'd_cast',
    })
    .assign(precinct=lambda results: results['precinct'].apply(format_precinct_name))
)
p17 = calculate_stats(p17, '17')
p17.head(1)

Unnamed: 0_level_0,d_reg_17,d_cast_17,turnout_17,ballots_pct_of_total_17,reg_pct_of_total_17
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALEPPO,520,123,0.236538,0.001016,0.000967


##### Import P19

In [7]:
# 2019 primary: keep the three needed CSV columns under the shared names, clean the
# ballots-cast text into integers, normalise the precinct names, then aggregate.
# NOTE(review): the pattern `.\s` drops ANY character that is followed by whitespace
# (together with that whitespace) - confirm this matches the stray text in the CSV.
p19 = (
    pd.read_csv('input/primary_2019.csv')
    [['Precinct Name', 'Democratic Reg Voters', 'Democratic Ballots Cast']]
    .rename(columns={
        'Precinct Name': 'precinct',
        'Democratic Reg Voters': 'd_reg',
        'Democratic Ballots Cast': 'd_cast',
    })
    .assign(
        d_cast=lambda results: results['d_cast'].str.replace(r'.\s', '', regex=True).astype(int),
        precinct=lambda results: results['precinct'].apply(format_precinct_name),
    )
)
p19 = calculate_stats(p19, '19')
p19.head(1)

Unnamed: 0_level_0,d_reg_19,d_cast_19,turnout_19,ballots_pct_of_total_19,reg_pct_of_total_19
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALEPPO,527,98,0.185958,0.00083,0.000955


### Merge primaries; adjust precinct names

In [8]:
# Place the four primaries side by side on the shared precinct index (outer join keeps
# precincts missing from some years), then turn the index back into a column.
primaries = pd.concat([p13, p15, p17, p19], axis=1, join='outer').reset_index()

In [9]:
# Whole-name corrections (exact matches only).
primaries['precinct'] = primaries.precinct.replace({
    'CASL SHANNON': 'CASTLE SHANNON',
    "OHARA": "O'HARA",
    'ROSSLYN FARM': 'ROSSLYN FARMS',
    'SPRINGDAL BR': 'SPRINGDALE BR',
    'SPRINGDALE TWP': 'SPRINGDALE TP',
    'UP ST CLAIR': 'UPPER ST. CLAIR',
})

# Abbreviation expansions, applied in order. Each abbreviation must be a whole word
# (`\b`), so an already expanded name is left alone: without the boundary ' VILL'
# also matches inside ' VILLAGE' and re-running this cell yields 'VILLAGEAGE'.
# `regex=True` is explicit because the default of `str.replace` differs between
# pandas versions.
abbreviation_patterns = {
    r'^E ': 'EAST ',
    r'^MT ': 'MOUNT ',
    r'^N ': 'NORTH ',
    r'^S ': 'SOUTH ',
    r'^W ': 'WEST ',
    r' HL\b': ' HILLS',
    r' HTS?\b': ' HEIGHTS',  # covers both ' HTS' and ' HT'
    r' PK\b': ' PARK',
    r' VILL\b': ' VILLAGE',
}

for abbreviation_pattern, expansion in abbreviation_patterns.items():
    primaries['precinct'] = primaries.precinct.str.replace(abbreviation_pattern, expansion, regex=True)

In [10]:
# Preview the first five merged rows.
primaries.iloc[:5]

Unnamed: 0,precinct,d_reg_13,d_cast_13,turnout_13,ballots_pct_of_total_13,reg_pct_of_total_13,d_reg_15,d_cast_15,turnout_15,ballots_pct_of_total_15,...,d_reg_17,d_cast_17,turnout_17,ballots_pct_of_total_17,reg_pct_of_total_17,d_reg_19,d_cast_19,turnout_19,ballots_pct_of_total_19,reg_pct_of_total_19
0,ALEPPO,506,60,0.118577,0.000488,0.000954,472,74,0.15678,0.00065,...,520,123,0.236538,0.001016,0.000967,527,98,0.185958,0.00083,0.000955
1,ASPINWALL,1051,214,0.203616,0.00174,0.001982,1047,303,0.289398,0.002662,...,1174,527,0.448893,0.004355,0.002182,1225,363,0.296327,0.003073,0.002219
2,AVALON,2025,284,0.140247,0.002309,0.003819,1951,293,0.150179,0.002574,...,2076,402,0.193642,0.003322,0.003859,2126,358,0.168391,0.003031,0.003852
3,BALDWIN BR,8634,2039,0.236159,0.016578,0.016285,8332,2224,0.266923,0.019537,...,8729,1638,0.18765,0.013535,0.016226,8748,1844,0.210791,0.015611,0.015849
4,BALDWIN TP,832,192,0.230769,0.001561,0.001569,806,188,0.233251,0.001651,...,827,167,0.201935,0.00138,0.001537,836,197,0.235646,0.001668,0.001515


### Analyze primaries

##### Average % of total ballots cast

In [11]:
# Average each precinct's share of all ballots cast across the per-year columns only.
# Matching on the bare 'ballots_pct_of_total' prefix would also select derived columns
# (this very '..._avg' column and '..._avg_dw') whenever the cell is re-run, skewing the
# mean; the regex keeps just the columns that end in a numeric year suffix.
primaries['ballots_pct_of_total_avg'] = primaries.filter(regex=r'^ballots_pct_of_total_\d+$').mean(axis=1)

In [12]:
# Five precincts with the largest average share of ballots cast.
primaries.sort_values('ballots_pct_of_total_avg', ascending=False).iloc[:5]

Unnamed: 0,precinct,d_reg_13,d_cast_13,turnout_13,ballots_pct_of_total_13,reg_pct_of_total_13,d_reg_15,d_cast_15,turnout_15,ballots_pct_of_total_15,...,d_cast_17,turnout_17,ballots_pct_of_total_17,reg_pct_of_total_17,d_reg_19,d_cast_19,turnout_19,ballots_pct_of_total_19,reg_pct_of_total_19,ballots_pct_of_total_avg
93,PITTSBURGH WARD 14,20255,7501,0.370328,0.060987,0.038203,19158,5457,0.284842,0.047937,...,7603,0.341003,0.062825,0.041444,23833,6433,0.26992,0.054462,0.043179,0.056553
84,PENN HILLS,21350,2958,0.138548,0.02405,0.040268,20725,4123,0.198938,0.036219,...,3642,0.169387,0.030095,0.039967,21987,4954,0.225315,0.041941,0.039835,0.033076
98,PITTSBURGH WARD 19,12926,4810,0.372118,0.039108,0.02438,12324,3071,0.249189,0.026977,...,3757,0.288091,0.031045,0.024241,13222,2626,0.198608,0.022232,0.023955,0.029841
73,MOUNT LEBANON,12336,3156,0.255837,0.02566,0.023267,12123,2700,0.222717,0.023718,...,3583,0.272617,0.029607,0.024431,14028,3179,0.226618,0.026914,0.025415,0.026475
153,WEST MIFFLIN,10033,2829,0.28197,0.023001,0.018923,9497,2817,0.29662,0.024746,...,2853,0.30149,0.023575,0.01759,9372,2555,0.272621,0.021631,0.01698,0.023238


In [13]:
# Scale the average share from a fraction to a percentage and export it per precinct.
primaries['ballots_pct_of_total_avg_dw'] = primaries['ballots_pct_of_total_avg'] * 100
primaries.loc[:, ['precinct', 'ballots_pct_of_total_avg_dw']].to_csv(
    'output/ballots_pct_of_total_avg_dw.csv',
    index=False,
)

In [16]:
# Number of top precincts (by average share of ballots cast) to add up.
count = 26
print('It takes **%i** munis/PGH wards to reach this average pct of the off-year primary vote:\n' % count)

# Combined average share held by the `count` largest precincts.
pct = (
    primaries['ballots_pct_of_total_avg']
    .sort_values(ascending=False)
    .iloc[:count]
    .sum()
)
print(pct)

It takes **26** munis/PGH wards to reach this average pct of the off-year primary vote:

0.5051268663835214


##### East End PGH vs. non-East-End-PGH vs. non-PGH

In [14]:
# Split each year's ballots cast / registrations into three groups - the listed
# Pittsburgh wards (PGH_EE), the remaining Pittsburgh wards (PGH_EE_NON) and everything
# outside Pittsburgh (PGH_NON) - as a share of the county total (`<col>_<year>`) and,
# for the two Pittsburgh groups, as a share of the city total (`<col>_city_<year>`).
cols = ['d_cast', 'd_reg']
precincts_pgh_ee = [
    'PITTSBURGH WARD 7',
    'PITTSBURGH WARD 8',
    'PITTSBURGH WARD 10',
    'PITTSBURGH WARD 11',
    'PITTSBURGH WARD 12',
    'PITTSBURGH WARD 13',
    'PITTSBURGH WARD 14',
    'PITTSBURGH WARD 15'
]
primaries_by_precinct_types = pd.DataFrame({'precinct_type': ['PGH_EE', 'PGH_EE_NON', 'PGH_NON']})
years = ['13', '15', '17', '19']

# Row masks over `primaries`; they do not depend on the year or the column.
in_city = primaries.precinct.str.startswith('PITTSBURGH')
in_east_end = primaries.precinct.isin(precincts_pgh_ee)
in_rest_of_city = in_city & ~in_east_end

for year in years:
    for col in cols:
        col_year = col + '_' + year
        values = primaries[col_year]

        county_total = values.sum()
        city_total = values[in_city].sum()
        east_end_total = values[in_east_end].sum()
        rest_of_city_total = values[in_rest_of_city].sum()
        outside_city_total = values[~in_city].sum()

        # Values are listed in row order: PGH_EE, PGH_EE_NON, PGH_NON.
        primaries_by_precinct_types[col_year] = [
            east_end_total / county_total,
            rest_of_city_total / county_total,
            outside_city_total / county_total,
        ]
        # The city share is undefined for the non-Pittsburgh row.
        primaries_by_precinct_types[col + '_city_' + year] = [
            east_end_total / city_total,
            rest_of_city_total / city_total,
            float('nan'),
        ]

# Column totals as a sanity check.
primaries_by_precinct_types.loc['Total'] = primaries_by_precinct_types.sum(numeric_only=True)

In [15]:
# Display the share table, including the appended 'Total' row.
primaries_by_precinct_types

Unnamed: 0,precinct_type,d_cast_13,d_cast_city_13,d_reg_13,d_reg_city_13,d_cast_15,d_cast_city_15,d_reg_15,d_reg_city_15,d_cast_17,d_cast_city_17,d_reg_17,d_reg_city_17,d_cast_19,d_cast_city_19,d_reg_19,d_reg_city_19
0,PGH_EE,0.16479,0.432273,0.126744,0.407845,0.133306,0.446652,0.124734,0.407591,0.159547,0.476317,0.131162,0.412327,0.152042,0.479687,0.134201,0.417821
1,PGH_EE_NON,0.216427,0.567727,0.184021,0.592155,0.16515,0.553348,0.181294,0.592409,0.175412,0.523683,0.18694,0.587673,0.164918,0.520313,0.186992,0.582179
2,PGH_NON,0.618783,,0.689236,,0.701544,,0.693972,,0.665042,,0.681898,,0.68304,,0.678807,
Total,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
