In [1]:
pip install unidecode

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install dj_database_url

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install psycopg2-binary

Note: you may need to restart the kernel to use updated packages.


In [4]:
import csv
import os
import zipfile

import dj_database_url
import psycopg2
import psycopg2.extras
import unidecode
import requests


In [5]:
# Base name shared by the downloaded archive, the extracted tab-delimited
# text file and the cleaned CSV derived from it.
_file = 'receipts-20200101-20200701'
contributions_zip_file = _file + '.txt.zip'
contributions_txt_file = _file + '.txt'
contributions_csv_file = _file + '.csv'

# Number of tab-separated fields a row must have to be kept.
# NOTE(review): the COPY into raw_table later in this notebook lists 23
# columns, so 29 may be left over from a different dataset -- confirm
# against the real file before relying on this filter.
EXPECTED_FIELD_COUNT = 29

# Step 1: fetch the archive unless a local copy already exists.
if not os.path.exists(contributions_zip_file):
    print('downloading', contributions_zip_file, '(~60mb) ...')
    # NOTE(review): the URL points at an Illinois archive while the local
    # names refer to 'receipts-20200101-20200701' -- verify the source.
    u = requests.get(
        'https://s3.amazonaws.com/dedupe-data/Illinois-campaign-contributions.txt.zip')
    # Fail loudly on an HTTP error instead of saving an error page as the zip.
    u.raise_for_status()
    with open(contributions_zip_file, 'wb') as localFile:
        localFile.write(u.content)

# Step 2: extract every archive member whose name contains '.txt'.
if not os.path.exists(contributions_txt_file):
    with zipfile.ZipFile(contributions_zip_file, 'r') as zip_file:
        print('extracting %s' % contributions_zip_file)
        for f in zip_file.namelist():
            if '.txt' in f:
                zip_file.extract(f)

# Step 3: create a cleaned up CSV version of file with consistent row lengths.
# Postgres COPY doesn't handle "ragged" files very well
if not os.path.exists(contributions_csv_file):
    print('converting tab-delimited raw file to csv...')
    # Plain 'r' already gives universal-newline handling in Python 3; the old
    # 'rU' mode raises ValueError on Python 3.11+.  newline='' on the output
    # lets the csv module control line endings itself.
    with open(contributions_txt_file, 'r') as txt_file, \
            open(contributions_csv_file, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        for line in txt_file:
            # Transliterate only lines that actually contain non-ASCII text.
            if not all(ord(c) < 128 for c in line):
                line = unidecode.unidecode(line)
            row = line.rstrip('\t\r\n').split('\t')
            if len(row) != EXPECTED_FIELD_COUNT:
                print('skipping bad row (length %s, expected %s):'
                      % (len(row), EXPECTED_FIELD_COUNT))
                print(row)
                continue
            csv_writer.writerow(row)

In [6]:
# Open the Postgres connection used by every cell below.
# The defaults are the values this notebook was written against; each one can
# be overridden through the matching PG* environment variable so credentials
# and hosts do not have to be edited into the notebook.
conn = psycopg2.connect(database=os.environ.get('PGDATABASE', 'campaign-finance'),
                        user=os.environ.get('PGUSER', 'postgres'),
                        password=os.environ.get('PGPASSWORD', ''),
                        host=os.environ.get('PGHOST', '172.16.238.13'),
                        port=os.environ.get('PGPORT', '5432'))
# Single shared cursor; the last cell closes it together with the connection.
c = conn.cursor()

In [7]:
print('importing raw data from csv...')
# Drop every table this notebook creates so it can be re-run from the top.
# raw_table is included because it is later created with a plain
# CREATE TABLE, which fails if a previous run left the table behind.
c.execute("DROP TABLE IF EXISTS raw_table")
c.execute("DROP TABLE IF EXISTS donors")
c.execute("DROP TABLE IF EXISTS recipients")
c.execute("DROP TABLE IF EXISTS contributions")
c.execute("DROP TABLE IF EXISTS processed_donors")

importing raw data from csv...


In [8]:
# Commit whatever the previous cell left pending (the DROP statements).
conn.commit()

# Staging table: every CSV field is kept as text; receipt_id is generated by
# the database because the COPY below does not supply it.
c.execute("CREATE TABLE raw_table "
          "(receipt_id INT GENERATED BY DEFAULT AS IDENTITY, name VARCHAR(200), "
          " street_line_1 VARCHAR(200), street_line_2 VARCHAR(200), city VARCHAR(200), "
          " state VARCHAR(15), zip_code VARCHAR(11), "
          " occupation VARCHAR(200), employer VARCHAR(200), "
          " committee_name VARCHAR(200), committee_sboe_id VARCHAR(200), "
          " committee_street_1 VARCHAR(200), committee_street_2 VARCHAR(200), "
          " committee_city VARCHAR(200), committee_state VARCHAR(10), "
          " committee_zip VARCHAR(20), report_name VARCHAR(90), "
          " report_date VARCHAR(20), account_code VARCHAR(200), "
          " amount VARCHAR(200), form_of_payment VARCHAR(200), "
          " purpose VARCHAR(500), candidate_ref_name VARCHAR(200), "
          " declaration VARCHAR(200) )")

conn.commit()

# Stream the CSV into raw_table; HEADER makes COPY skip the first line.
# Plain 'r' is used because the legacy 'rU' mode raises ValueError on
# Python 3.11+ (universal newlines are already the text-mode default).
# NOTE(review): this COPY lists 23 columns while the conversion cell only
# keeps rows with 29 fields -- confirm the two agree with the real file.
with open(contributions_csv_file, 'r') as csv_file:
    c.copy_expert("COPY raw_table "
                  "(name,  "
                  " street_line_1, street_line_2, city, state, "
                  " zip_code, occupation, employer, "
                  " committee_name, committee_sboe_id, committee_street_1, "
                  " committee_street_2, committee_city, committee_state, "
                  " committee_zip, report_name,report_date, "
                  " account_code, "
                  " amount, form_of_payment, "
                  " purpose, candidate_ref_name,"
                  " declaration) "
                  "FROM STDIN CSV HEADER", csv_file)

conn.commit()

In [9]:
print('creating donors table...')
# One row per distinct donor; donor_id is assigned by the database.
c.execute("CREATE TABLE donors "
          "(donor_id SERIAL PRIMARY KEY, "
          " name VARCHAR(200), "
          " address_1 VARCHAR(200), address_2 VARCHAR(200), "
          " city VARCHAR(200), state VARCHAR(15), "
          " zip VARCHAR(11), occupation VARCHAR(200), "
          " employer VARCHAR(200) "
          " )")

# Populate from the staging table.  raw_table names these fields
# `occupation` and `employer`; the previous query selected
# profession_job_title / employer_name_specific_field, which raw_table does
# not define, so the statement failed with an undefined-column error.
c.execute("INSERT INTO donors "
          "(name, address_1, "
          " address_2, city, state, zip, occupation, employer) "
          "SELECT DISTINCT "
          "name, "
          "street_line_1, street_line_2, "
          "city, state, zip_code, "
          "occupation, employer "
          "FROM raw_table")

# Report how many distinct donors were found (the count used to be queried
# but never read).
c.execute("SELECT COUNT(*) FROM donors")
print('%s donors' % c.fetchone()[0])

conn.commit()

creating donors table...


In [10]:
print('creating indexes on donors table...')
# Composite index over the donor identity columns.
donor_index_sql = ("CREATE INDEX donors_donor_info "
                   "ON donors "
                   "(name, address_1, address_2, city,  state, zip)")
c.execute(donor_index_sql)
conn.commit()

creating indexes on donors table...


In [11]:
# Commit the current transaction.  When the cells are run in order the
# previous cell has already committed, so this is a harmless repeat.
conn.commit()

In [12]:
print('creating recipients table...')
# One row per distinct receiving committee found in the staging table.
c.execute("CREATE TABLE recipients "
          "(recipient_id VARCHAR(200), name VARCHAR(200), "
          " address_1 VARCHAR(200), address_2 VARCHAR(200), "
          " city VARCHAR(200), state VARCHAR(200), zip VARCHAR(200) "
         ")")

# The target columns are listed explicitly so the insert does not depend on
# column order.  raw_table names the zip column `committee_zip`; the previous
# query selected committee_zip_code, which raw_table does not define.
c.execute("INSERT INTO recipients "
          "(recipient_id, name, address_1, address_2, city, state, zip) "
          "SELECT DISTINCT committee_sboe_id, "
          "committee_name, committee_street_1, committee_street_2, "
          "committee_city, committee_state, committee_zip "
          "FROM raw_table")

# Report how many distinct recipients were found (the count used to be
# queried but never read).
c.execute("SELECT COUNT(*) FROM recipients")
print('%s recipients' % c.fetchone()[0])

conn.commit()

creating recipients table...


In [13]:
print('creating contributions table...')
# Table definition written one column per literal; the concatenated
# statement is the same text as before.
create_contributions_sql = (
    "CREATE TABLE contributions "
    "(contribution_id INT, "
    "committee_sboe_id VARCHAR(200), "
    "donor_id INT, "
    " transaction_type VARCHAR(200), "
    "date_occured DATE, "
    "amount FLOAT, "
    " report_name VARCHAR(200), "
    "account_code VARCHAR(200), "
    "form_of_payment VARCHAR(200), "
    " purpose VARCHAR(500), "
    "candidate_referendum_name VARCHAR(200), "
    "declaration VARCHAR(200)"
    ")"
)
c.execute(create_contributions_sql)
conn.commit()


creating contributions table...


In [16]:
# Commit the current transaction.  When the cells are run in order the
# previous cell has already committed, so this is a harmless repeat.
conn.commit()

In [17]:
# Copy every staged receipt into contributions, attaching the donor_id of the
# donors row whose identity columns all match the receipt.
#
# Column names follow raw_table as it is created in this notebook: receipt_id,
# report_date, candidate_ref_name, occupation and employer.  The previous
# query used trans_id, date_occured, candidate_referendum_name,
# profession_job_title and employer_name_specific_field, none of which
# raw_table defines.
# NOTE(review): raw_table has no transaction-type column, so
# contributions.transaction_type is left NULL here -- confirm whether the
# source file carries that field and extend raw_table if it does.
# NOTE(review): report_date is assumed to be the MM/DD/YYYY date that belongs
# in date_occured -- verify against the source file.
#
# The join uses plain '=' comparisons, so it has to run before the later cell
# that turns empty strings in donors into NULL; NULL never compares equal and
# those receipts would silently drop out.
c.execute("INSERT INTO contributions "
          "(contribution_id, committee_sboe_id, donor_id, "
          " date_occured, amount, report_name, account_code, "
          " form_of_payment, purpose, candidate_referendum_name, "
          " declaration) "
          "SELECT raw_table.receipt_id, raw_table.committee_sboe_id, "
          " donors.donor_id, "
          " TO_DATE(TRIM(raw_table.report_date), 'MM/DD/YYYY'), "
          " CAST(raw_table.amount as DOUBLE PRECISION), "
          " raw_table.report_name, "
          " raw_table.account_code, "
          " raw_table.form_of_payment, "
          " raw_table.purpose, "
          " raw_table.candidate_ref_name, "
          " raw_table.declaration "
          "FROM raw_table JOIN donors ON "
          "donors.name = raw_table.name AND "
          "donors.address_1 = raw_table.street_line_1 AND "
          "donors.address_2 = raw_table.street_line_2 AND "
          "donors.city = raw_table.city AND "
          "donors.state = raw_table.state AND "
          "donors.employer = raw_table.employer AND "
          "donors.occupation = raw_table.occupation AND "
          "donors.zip = raw_table.zip_code")
conn.commit()

In [18]:
print('creating indexes on contributions...')
# Primary key first, then the two lookup indexes, executed in this order.
contribution_index_statements = (
    "ALTER TABLE contributions ADD PRIMARY KEY(contribution_id)",
    "CREATE INDEX donor_idx ON contributions (donor_id)",
    "CREATE INDEX recipient_idx ON contributions (committee_sboe_id)",
)
for statement in contribution_index_statements:
    c.execute(statement)

conn.commit()

creating indexes on contributions...


In [19]:
print('nullifying empty strings in donors...')
# Columns whose empty-string values are replaced by NULL, in the order the
# SET clause lists them.
blankable_columns = ('name', 'address_1', 'address_2', 'city',
                     'state', 'employer', 'occupation', 'zip')
# Build one "col = CASE col WHEN '' THEN NULL ELSE col END" assignment per
# column and join them into a single UPDATE statement.
set_clause = ", ".join(
    "%s = CASE %s WHEN '' THEN NULL ELSE %s END" % (col, col, col)
    for col in blankable_columns
)
c.execute("UPDATE donors SET " + set_clause)


conn.commit()

nullifying empty strings in donors...


In [20]:
print('creating processed_donors...')
# Columns that are simply lower-cased and kept under their own name.
lowercased_columns = ('city', 'state', 'zip', 'occupation', 'employer')
lowercased_sql = "".join(
    " LOWER(%s) AS %s, " % (col, col) for col in lowercased_columns
)

# processed_donors is a normalised copy of donors: lower-cased text, the two
# address lines merged into one `address` column, plus a `person` flag.
# NOTE(review): `person` is the integer cast of (name IS NULL), i.e. it is
# set for rows WITHOUT a name -- confirm that polarity is intended.
processed_donors_sql = (
    "CREATE TABLE processed_donors AS "
    "(SELECT donor_id, "
    " CASE WHEN (name IS NULL) "
    "      THEN NULL "
    "      ELSE LOWER(CONCAT_WS(' ', name)) "
    " END AS name, "
    " CASE WHEN (address_1 IS NULL AND address_2 IS NULL) "
    "      THEN NULL "
    "      ELSE LOWER(CONCAT_WS(' ', address_1, address_2)) "
    " END AS address, "
    + lowercased_sql +
    " CAST((name IS NULL) AS INTEGER) AS person "
    " FROM donors)"
)
c.execute(processed_donors_sql)

c.execute("CREATE INDEX processed_donor_idx ON processed_donors (donor_id)")

conn.commit()

# Last cell of the notebook: release the cursor and the connection.
c.close()
conn.close()
print('done')

creating processed_donors...
done
