# Analyze Cycle 7 reports for County Executive candidates currently in office

In [1]:
from functools import reduce
import glob
import pandas as pd
from PyPDF2 import PdfReader
import re

In [2]:
# Patterns applied to the newline-joined text extracted from a report PDF.
#
# CONTRIB_REGEX and RECEIPT_REGEX each capture nine groups: six free-text
# lines followed by three all-digit lines. get_contributions() labels those
# groups, in order, as
#   name, address1, city, state, zipcode, amount, month, day, year.
CONTRIB_REGEX = r'Full Name of Contrib.*\n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(\d+)\n(\d+)\n(\d+)\n'
RECEIPT_REGEX = r'Full Name \n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(.*)\n(\d+)\n(\d+)\n(\d+)\nReceipt Description\n.*\n'
# Captures the single line that follows the "$" line under the unitemized
# contributions heading. The "." in "50.00" is escaped so that it only matches
# a literal decimal point rather than any character.
UNITEMIZED_REGEX = r'Unitemized  Contributions Received - \$ 50\.00 or Less Per Contributor\nTOTAL for the Reporting Period           \(1\)\n\$\n(.*)\n'

def flatten(l):
    """Return one flat list holding the items of every sub-iterable in ``l``.

    Only a single level of nesting is removed; the order of items is kept.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def get_contributions(filename):
    """Build a DataFrame of the contribution rows found in one report PDF.

    The row tuples returned by ``process_report(filename)`` are merged into a
    single list and loaded into a frame with the columns
    ``name, address1, city, state, zipcode, amount, month, day, year``.
    ``name`` is upper-cased and ``amount`` is converted to a number after
    its thousands separators (commas) are removed.
    """
    column_names = [
        'name', 'address1', 'city', 'state', 'zipcode',
        'amount', 'month', 'day', 'year',
    ]

    # process_report() returns several lists of row tuples; merge them.
    rows = flatten(process_report(filename))
    df = pd.DataFrame(rows, columns=column_names)

    df['name'] = df['name'].str.upper()
    amount_text = df['amount'].str.replace(',', '')
    df['amount'] = pd.to_numeric(amount_text)

    return df

def process_report(filename):
    """Pull contribution records out of the text of one report PDF.

    The extracted text of every page is concatenated (each page followed by
    a newline) and searched with the three module-level patterns.

    Returns a list of three lists of 9-tuples:
      1. all CONTRIB_REGEX matches,
      2. all RECEIPT_REGEX matches,
      3. either an empty list or one synthetic 'UNITEMIZED' row whose amount
         is the first UNITEMIZED_REGEX match.
    """
    pages = PdfReader(filename).pages
    text = "".join(page.extract_text() + "\n" for page in pages)

    totals = re.findall(UNITEMIZED_REGEX, text)
    unitemized = []
    if totals:
        # Shape the total like a regular contribution row so it fits the
        # same nine-column layout; only the first match is used.
        unitemized.append(
            ('UNITEMIZED', '', 'PITTSBURGH', 'PA', '', totals[0], '', '', '')
        )

    contribs = re.findall(CONTRIB_REGEX, text)
    receipts = re.findall(RECEIPT_REGEX, text)
    return [contribs, receipts, unitemized]

### Load analysis from previous reports

In [3]:
# Results saved from the analysis of earlier reports (see the heading above).
pastleaders = pd.read_csv(filepath_or_buffer='output/wein-inna-lamb-top.csv')

### Load Cycle 7 reports for 2022

##### John Weinstein (state committee)

In [4]:
# Exact-match renames applied to the upper-cased contributor names so that
# different spellings of the same contributor collapse onto one label.
# NOTE(review): the target labels presumably match those used in the earlier
# analysis — confirm against the previous-report data.
weinstein_name_aliases = {
    "AMALGAMATED TRANSIT UNION COPE": "ATU COPE VOLUNTARY ACCOUNT",
    "IBEW LOCAL UNION NO. 5 PAC": "LOCAL 0005 IBEW PAC",
    "MIDATLANTIC POLITICAL LEAGUE - MALPA": "MID-ATLANTIC LABORERS' POLITICAL LEAGUE",
    "STEAMFITTERS LOCAL UNON #449": "LOCAL 0449 STEAMFITTERS UNION PAC",
    "STEAMFITTERS LOCAL UNION 449 PAC FUND": "LOCAL 0449 STEAMFITTERS UNION PAC",
    "PLUMBERS LOCAL UNION NO. 27 PAC": "LOCAL 0027 PLUMBERS UNION PAC",
    "BRICKLAYERS &AMP; ALLIED CRAFTWORKERS LOCAL 9 PAC": "LOCAL 0009 BRICKLAYERS & ALLIED CRAFTWORKERS PA PAC",
    "PITTSBURGH FIRE FIGHTERS LOCAL NO 1 FIRE PAC ACCOUNT": "PGH FIRE FIGHTERS LOCAL #1 FIRE PAC",
    "TEAMSTERS LOCAL UNION 249 - DRIVE FUND": "LOCAL 0249 TEAMSTERS DRIVE",
    "U.W.U.A. LOCAL 433 PAC": "LOCAL 0433 UWUA (UTILITY WORKERS)",
    "TEAMSTER JOINT COUNCIL 40 PAC": "TEAMSTERS JT COUNCIL 40 PAC",
    "AFSCME COUNCIL 13 POLITICAL &AMP; LEGISLATIVE": "AFSCME COUNCIL 13 POL & LEG ACCT",
    # Laborers
    "LABORERS DISTRICT COUNCIL OF WESTERN PENNSYLVANIA": "WESTERN PENNSYLVANIA LABORERS 2019 PAC",
    "LABORERS' DISTRICT COUNCIL OF WESTERN PENNSYLVANIA": "WESTERN PENNSYLVANIA LABORERS 2019 PAC",
    "WESTERN PA LABORERS UNION PAC": "WESTERN PENNSYLVANIA LABORERS 2019 PAC",
    "WESTERN PENNSYLVANIA LABORERS": "WESTERN PENNSYLVANIA LABORERS 2019 PAC",
    "WESTERN PENNSYLVANIA LABORERS' PAC": "WESTERN PENNSYLVANIA LABORERS 2019 PAC",
}

# Load the report and give the amount column a candidate-specific name.
df_weinstein = get_contributions('input/blank.pdf').rename(
    columns={'amount': 'amount_wein'}
)
df_weinstein['name'] = df_weinstein['name'].replace(weinstein_name_aliases)

In [5]:
# Total amount per contributor name, as a two-column frame.
weinstein_by_name = df_weinstein.groupby('name')['amount_wein'].sum()
df_weinstein_topcontribs = weinstein_by_name.reset_index()

# Each contributor's share of the sum of all amounts in this report.
weinstein_sum = df_weinstein['amount_wein'].sum()
df_weinstein_topcontribs['pct_wein'] = (
    df_weinstein_topcontribs['amount_wein'] / weinstein_sum
)

# Ten largest contributors; left as the final expression so the cell shows it.
df_weinstein_topcontribs.sort_values(by='amount_wein', ascending=False).head(10)

Unnamed: 0,name,amount_wein,pct_wein


##### Sara Innamorato (state committee)

In [6]:
# Exact-match renames applied to the upper-cased contributor names so that
# different spellings of the same contributor collapse onto one label.
# NOTE(review): the target labels presumably match those used in the earlier
# analysis — confirm against the previous-report data.
innamorato_name_aliases = {
    "1776 PAC": "1776 PAC (UFCW)",
    "AFSCME COUNCIL 13": "AFSCME COUNCIL 13 POL & LEG ACCT",
    "EVAN SEGAL": "EVAN J. SEGAL",
    "PENNSYLVANIA SEIU COPE": "SEIU HEALTHCARE PA COPE",
    "PFT POLITICAL ACTION FUND": "PFT POL ACTION FUND (PGH FED TEACHERS)",
    "TEAMSTERS LOCAL UNION 249": "LOCAL 0249 TEAMSTERS DRIVE",
    "TEAMSERS LOCAL UNION 249": "LOCAL 0249 TEAMSTERS DRIVE",
    "TEAMSTERS LOCAL UNION 249 DRIVE FUND": "LOCAL 0249 TEAMSTERS DRIVE",
    "WESTERN PENNSYLVANIA LABORERS POLITICAL ACTION FUND": "WESTERN PENNSYLVANIA LABORERS 2019 PAC",
}

# Load the report and give the amount column a candidate-specific name.
df_innamorato = get_contributions('input/blank.pdf').rename(
    columns={'amount': 'amount_inna'}
)
df_innamorato['name'] = df_innamorato['name'].replace(innamorato_name_aliases)

In [7]:
# Total amount per contributor name, as a two-column frame.
innamorato_by_name = df_innamorato.groupby('name')['amount_inna'].sum()
df_innamorato_topcontribs = innamorato_by_name.reset_index()

# Each contributor's share of the sum of all amounts in this report.
innamorato_sum = df_innamorato['amount_inna'].sum()
df_innamorato_topcontribs['pct_inna'] = (
    df_innamorato_topcontribs['amount_inna'] / innamorato_sum
)

# Ten largest contributors; left as the final expression so the cell shows it.
df_innamorato_topcontribs.sort_values(by='amount_inna', ascending=False).head(10)

Unnamed: 0,name,amount_inna,pct_inna


In [8]:
# TODO: load and summarize the Cycle 7 report for Lamb (not yet implemented)

### Compare Cycle 7 to previous reports

In [9]:
# TODO: compare the Cycle 7 totals with the previous-report data (not yet implemented)