# Calculate biggest donors for County Executive candidates currently in office

In [1]:
import glob
import pandas as pd
from PyPDF2 import PdfReader
import re

In [2]:
# Patterns applied to the newline-joined text that process_report() extracts
# from a report PDF. The PDF text arrives one field per line, so every pattern
# is a fixed sequence of lines.
#
# CONTRIB_REGEX and RECEIPT_REGEX each capture nine groups, in the same order
# as the DataFrame columns built in get_contributions():
#   name, address1, city, state, zipcode, amount, month, day, year
# The first six are whole lines; the last three must be digit-only lines.

# Anchored on a line that starts with "Full Name of Contrib".
CONTRIB_REGEX = r'Full Name of Contrib.*\n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(\d+)\n(\d+)\n(\d+)\n'

# Anchored on a "Full Name " line; after the nine captured lines it also
# requires a "Receipt Description" line and consumes the line that follows it.
RECEIPT_REGEX = r'Full Name \n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(\d+)\n(\d+)\n(\d+)\nReceipt Description\n.*\n'

# Captures the single line following the "$" line under the unitemized
# heading. The dot in "50.00" is escaped so it matches only a literal decimal
# point rather than any character.
UNITEMIZED_REGEX = r'Unitemized  Contributions Received - \$ 50\.00 or Less Per Contributor\nTOTAL for the Reporting Period           \(1\)\n\$\n(.*)\n'

def flatten(l):
    """Return a single list holding the items of every sub-iterable in *l*, in order.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def get_contributions(name):
    """Collect every contribution row from the report PDFs for *name*.

    Reports are looked up with the glob pattern ``input/<Name>_*.pdf``, where
    ``<Name>`` is ``name.capitalize()``. Each file is parsed by
    ``process_report`` and all rows from all files are pooled into one frame.

    Returns a DataFrame with columns name, address1, city, state, zipcode,
    amount, month, day, year. ``name`` is upper-cased and ``amount`` is
    converted to a number after commas are stripped; the other columns are
    left as extracted.
    """
    pattern = 'input/%s_*.pdf' % name.capitalize()

    rows = []
    for path in glob.glob(pattern):
        # process_report returns several lists of row tuples; pool them all.
        for section in process_report(path):
            rows.extend(section)

    column_names = ['name', 'address1', 'city', 'state', 'zipcode', 'amount', 'month', 'day', 'year']
    df = pd.DataFrame(rows, columns=column_names)

    # Upper-case names so the same donor groups together regardless of case.
    df['name'] = df['name'].str.upper()

    # Amounts arrive as text with thousands separators.
    amount_text = df['amount'].str.replace(',', '')
    df['amount'] = pd.to_numeric(amount_text)

    return df

def process_report(filename):
    """Extract contribution rows from one report PDF.

    The text of every page is concatenated (one trailing newline per page) and
    scanned with CONTRIB_REGEX, RECEIPT_REGEX and UNITEMIZED_REGEX.

    Returns a list of three lists of 9-tuples, in this order: CONTRIB_REGEX
    matches, RECEIPT_REGEX matches, and either an empty list or a single
    synthetic 'UNITEMIZED' row carrying the first unitemized total found.
    """
    reader = PdfReader(filename)
    text = "".join(page.extract_text() + "\n" for page in reader.pages)

    contribs = re.findall(CONTRIB_REGEX, text)
    receipts = re.findall(RECEIPT_REGEX, text)

    unitemized = []
    totals = re.findall(UNITEMIZED_REGEX, text)
    if totals:
        # Shape the lump-sum total like a contribution row so it can share the
        # same DataFrame columns; only the first match is used.
        unitemized.append(
            ('UNITEMIZED', '', 'PITTSBURGH', 'PA', '', totals[0], '', '', '')
        )

    return [contribs, receipts, unitemized]

### John Weinstein (state committee)

In [3]:
# Load every contribution row parsed from the Weinstein report PDFs.
df_weinstein = get_contributions('Weinstein')

# Collapse alternate spellings of the same donor onto one label so that the
# groupby on 'name' sums them together. Keys are upper-case because
# get_contributions() upper-cases the name column; Series.replace with a dict
# only rewrites values that equal a key exactly.
df_weinstein['name'] = df_weinstein.name.replace({
    "AMALGAMATED TRANSIT UNION COPE": "ATU COPE VOLUNTARY ACCOUNT",
    "LABORERS DISTRICT COUNCIL OF WESTERN PENNSYLVANIA": "WESTERN PENNSYLVANIA LABORERS",
    "LABORERS' DISTRICT COUNCIL OF WESTERN PENNSYLVANIA": "WESTERN PENNSYLVANIA LABORERS",
    "MIDATLANTIC POLITICAL LEAGUE - MALPA": "MID-ATLANTIC LABORERS' POLITICAL LEAGUE",
    # NOTE(review): "UNON" looks like a typo for "UNION"; presumably it mirrors
    # a spelling already present in the filings -- confirm before changing.
    "STEAMFITTERS LOCAL UNION 449 PAC FUND": "STEAMFITTERS LOCAL UNON #449",
    "WESTERN PA LABORERS UNION PAC": "WESTERN PENNSYLVANIA LABORERS",
    "WESTERN PENNSYLVANIA LABORERS' PAC": "WESTERN PENNSYLVANIA LABORERS"
})

In [4]:
# Grand total of all rows; the denominator for each donor's share.
weinstein_sum = df_weinstein['amount'].sum()

# One row per donor name holding the sum of that donor's amounts.
df_weinstein_topcontribs = df_weinstein.groupby('name')['amount'].sum().to_frame()

# Fraction of the grand total attributable to each donor.
df_weinstein_topcontribs['pct'] = df_weinstein_topcontribs['amount'] / weinstein_sum

# Display the ten largest donors by summed amount.
df_weinstein_topcontribs.sort_values(by='amount', ascending=False).head(10)

Unnamed: 0_level_0,amount,pct
name,Unnamed: 1_level_1,Unnamed: 2_level_1
WESTERN PENNSYLVANIA LABORERS,90000.0,0.180471
ATU COPE VOLUNTARY ACCOUNT,37614.0,0.075425
FNB CORPORATION PAC,22500.0,0.045118
MID-ATLANTIC LABORERS' POLITICAL LEAGUE,22000.0,0.044115
FRIENDS OF RANDY MARTINI,20000.0,0.040105
STEAMFITTERS LOCAL UNON #449,15000.0,0.030078
IBEW LOCAL UNION NO. 5 PAC,10000.0,0.020052
AMALGAMATED TRANSIT UNION - LOCAL 85,10000.0,0.020052
JAMES P. GRANT,10000.0,0.020052
CHARLES HAMMEL III,7500.0,0.015039


### Sara Innamorato (state committee)

In [5]:
# Load every contribution row parsed from the Innamorato report PDFs.
df_innamorato = get_contributions('Innamorato')

# Collapse alternate spellings of the same donor onto one label so that the
# groupby on 'name' sums them together. Keys are upper-case because
# get_contributions() upper-cases the name column; Series.replace with a dict
# only rewrites values that equal a key exactly.
df_innamorato['name'] = df_innamorato.name.replace({
    "1776 PAC": "1776 PAC (UFCW)",
    "AFSCME COUNCIL 13 POL & LEG ACCT": "AFSCME COUNCIL 13",
    "EVAN SEGAL": "EVAN J. SEGAL",
    "PENNSYLVANIA SEIU COPE": "SEIU HEALTHCARE PA COPE",
    "PFT POL ACTION FUND (PGH FED TEACHERS)": "PFT POLITICAL ACTION FUND",
    # "TEAMSERS" is a lookup key, so it has to stay spelled this way to match
    # anything -- presumably that is how the name appears in a filing.
    "TEAMSERS LOCAL UNION 249": "TEAMSTERS LOCAL UNION 249 - DRIVE FUND",
    "TEAMSTERS LOCAL UNION 249 DRIVE FUND": "TEAMSTERS LOCAL UNION 249 - DRIVE FUND",
    "WESTERN PENNSYLVANIA LABORERS POLITICAL ACTION FUND": "WESTERN PENNSYLVANIA LABORERS",
})

In [6]:
# Grand total of all rows (including the UNITEMIZED row, if any); the
# denominator for each donor's share.
innamorato_sum = df_innamorato['amount'].sum()

# One row per donor name holding the sum of that donor's amounts.
df_innamorato_topcontribs = df_innamorato.groupby('name')['amount'].sum().to_frame()

# Fraction of the grand total attributable to each donor.
df_innamorato_topcontribs['pct'] = df_innamorato_topcontribs['amount'] / innamorato_sum

# Display the ten largest donors by summed amount.
df_innamorato_topcontribs.sort_values(by='amount', ascending=False).head(10)

Unnamed: 0_level_0,amount,pct
name,Unnamed: 1_level_1,Unnamed: 2_level_1
UNITEMIZED,35293.34,0.109579
EVAN J. SEGAL,16020.63,0.049741
SEIU HEALTHCARE PA COPE,11250.0,0.034929
PENNSYLVANIA SIERRA CLUB PAC,10000.0,0.031048
DAVID TURNER,7650.0,0.023752
PENNSYLVANIA DEMOCRATIC PARTY,7613.9,0.02364
NANCY BERNSTEIN,6500.0,0.020181
WOMEN FOR THE FUTURE,6500.0,0.020181
ALLEGHENY COUNTY DEMOCRATIC DELEGATION,5750.0,0.017853
REPRESENT PAC,5500.0,0.017076


### Michael Lamb (state and local committees)

### Liv Bennett (local committee)