The purpose of this notebook is to load individual contribution data from the FEC website for all individuals in NY state, along with the FEC candidate and committee data that the contributions refer to.

The following steps will need to be taken:

1. Download the data and unzip if necessary
2. Load the data into a dataframe
3. Filter for all individuals that reside in NY state
4. Push the data into our MySQL database

Data obtained from:

https://www.fec.gov/data/browse-data/?tab=bulk-data

In [18]:
import csv
import io
import itertools
import json
import os
from io import StringIO, BytesIO
from urllib.parse import quote_plus
from urllib.request import urlopen
from zipfile import ZipFile

import mysql.connector
import numpy as np
import pandas as pd
import requests
import sqlalchemy
import xlrd

from credentials import MYSQL_PASSWORD
pd.set_option('display.max_rows', 5000)

## Candidate Data

In [15]:
# Downloading the column names
# The candidate bulk file ships without a header row; its column names are
# taken from the header row of the separate FEC header CSV.
header_link = 'https://www.fec.gov/files/bulk-downloads/data_dictionaries/cn_header_file.csv'
response = requests.get(header_link, timeout=60)
# Fail fast on an HTTP error: otherwise the body of an error page would be
# parsed below and silently used as column names.
response.raise_for_status()
# nrows=0 parses only the header row, which is all that is needed here.
header_csv = pd.read_csv(io.StringIO(response.content.decode('utf-8')), nrows=0)
header_list = list(header_csv.columns)

['CAND_ID',
 'CAND_NAME',
 'CAND_PTY_AFFILIATION',
 'CAND_ELECTION_YR',
 'CAND_OFFICE_ST',
 'CAND_OFFICE',
 'CAND_OFFICE_DISTRICT',
 'CAND_ICI',
 'CAND_STATUS',
 'CAND_PCC',
 'CAND_ST1',
 'CAND_ST2',
 'CAND_CITY',
 'CAND_ST',
 'CAND_ZIP']

In [20]:
# Loading the data into a dataframe
# One frame is read per election cycle and all of them are concatenated once
# at the end; growing a DataFrame with concat inside the loop re-copies every
# previously loaded row on each iteration.
can_frames = []
for year in [20, 18, 16]:
    data_link = f'https://www.fec.gov/files/bulk-downloads/20{year}/cn{year}.zip'
    # Context managers release the HTTP response and the archive handles even
    # if the download or the parse fails part-way through.
    with urlopen(data_link) as response:
        r = response.read()
    with ZipFile(BytesIO(r)) as file, file.open("cn.txt") as can_txt:
        # Malformed lines are skipped rather than aborting the load.
        # NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in
        # 2.0; replace it with `on_bad_lines='skip'` when upgrading pandas.
        df = pd.read_csv(can_txt, sep="|", header=None, error_bad_lines=False)
    # Raises ValueError if the file's column count does not match the header.
    df.columns = header_list
    can_frames.append(df)

can_df = pd.concat(can_frames, axis=0).reset_index(drop=True)

6838
14219
21616


In [27]:
# Drop rows that are exact duplicates of an earlier row, then renumber the
# index so it is contiguous again.
can_df = (
    can_df
    .drop_duplicates()
    .reset_index(drop=True)
)

In [28]:
# Load data to the MySQL database
host = "192.168.4.38"
user = "monty"
password = MYSQL_PASSWORD
db = "nyc"
port = 3306

# The password is URL-encoded so characters such as '@', ':' or '/' cannot
# corrupt the connection URL, and the configured port is passed explicitly
# instead of being defined but ignored.
engine = sqlalchemy.create_engine(
    f'mysql+mysqlconnector://{user}:{quote_plus(password)}@{host}:{port}/{db}'
)

try:
    # engine.begin() commits on success, rolls back on error, and always
    # returns the connection to the pool.
    with engine.begin() as conn:
        can_df.to_sql(con=conn, name='candidate_info', if_exists='append', index=False)
finally:
    # Release pooled connections once this cell is done with the database.
    engine.dispose()

## Committee Data

In [30]:
# Downloading the column names
# The committee bulk file ships without a header row; its column names are
# taken from the header row of the separate FEC header CSV.
header_link = 'https://www.fec.gov/files/bulk-downloads/data_dictionaries/cm_header_file.csv'
response = requests.get(header_link, timeout=60)
# Fail fast on an HTTP error: otherwise the body of an error page would be
# parsed below and silently used as column names.
response.raise_for_status()
# nrows=0 parses only the header row, which is all that is needed here.
header_csv = pd.read_csv(io.StringIO(response.content.decode('utf-8')), nrows=0)
header_list = list(header_csv.columns)

['CMTE_ID',
 'CMTE_NM',
 'TRES_NM',
 'CMTE_ST1',
 'CMTE_ST2',
 'CMTE_CITY',
 'CMTE_ST',
 'CMTE_ZIP',
 'CMTE_DSGN',
 'CMTE_TP',
 'CMTE_PTY_AFFILIATION',
 'CMTE_FILING_FREQ',
 'ORG_TP',
 'CONNECTED_ORG_NM',
 'CAND_ID']

In [34]:
# Loading the data into a dataframe
# One frame is read per election cycle and all of them are concatenated once
# at the end; growing a DataFrame with concat inside the loop re-copies every
# previously loaded row on each iteration.
com_frames = []
for year in [20, 18, 16]:
    data_link = f'https://www.fec.gov/files/bulk-downloads/20{year}/cm{year}.zip'
    # Context managers release the HTTP response and the archive handles even
    # if the download or the parse fails part-way through.
    with urlopen(data_link) as response:
        r = response.read()
    with ZipFile(BytesIO(r)) as file, file.open("cm.txt") as com_txt:
        # Malformed lines are skipped rather than aborting the load.
        # NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in
        # 2.0; replace it with `on_bad_lines='skip'` when upgrading pandas.
        df = pd.read_csv(com_txt, sep="|", header=None, error_bad_lines=False)
    # Raises ValueError if the file's column count does not match the header.
    df.columns = header_list
    com_frames.append(df)

com_df = pd.concat(com_frames, axis=0).reset_index(drop=True)

16299
35327
52981


In [35]:
# Drop rows that are exact duplicates of an earlier row, then renumber the
# index so it is contiguous again.
com_df = (
    com_df
    .drop_duplicates()
    .reset_index(drop=True)
)

In [37]:
# Load data to the MySQL database
host = "192.168.4.38"
user = "monty"
password = MYSQL_PASSWORD
db = "nyc"
port = 3306

# The password is URL-encoded so characters such as '@', ':' or '/' cannot
# corrupt the connection URL, and the configured port is passed explicitly
# instead of being defined but ignored.
engine = sqlalchemy.create_engine(
    f'mysql+mysqlconnector://{user}:{quote_plus(password)}@{host}:{port}/{db}'
)

try:
    # engine.begin() commits on success, rolls back on error, and always
    # returns the connection to the pool.
    with engine.begin() as conn:
        com_df.to_sql(con=conn, name='committee_info', if_exists='append', index=False)
finally:
    # Release pooled connections once this cell is done with the database.
    engine.dispose()

## Individual Contributions Data

In [62]:
# Downloading the column names
# The individual-contributions bulk file ships without a header row; its
# column names are taken from the header row of the separate FEC header CSV.
header_link = 'https://www.fec.gov/files/bulk-downloads/data_dictionaries/indiv_header_file.csv'
response = requests.get(header_link, timeout=60)
# Fail fast on an HTTP error: otherwise the body of an error page would be
# parsed below and silently used as column names.
response.raise_for_status()
# nrows=0 parses only the header row, which is all that is needed here.
header_csv = pd.read_csv(io.StringIO(response.content.decode('utf-8')), nrows=0)
header_list = list(header_csv.columns)

In [63]:
# Checking the number of lines in the file
# The file is streamed line by line so it is never held in memory at once.
# Counting with sum() reports 0 for an empty file, whereas a loop index read
# after the loop would be unbound in that case.
with open('fec_data/itcont_2020.txt') as f:
    line_count = sum(1 for _ in f)
print(line_count)

23393810


In [66]:
# The data has been manually downloaded and placed in a new folder called 'fec_data' 

# Create a connection engine to the MySQL database
host = "192.168.4.38"
user = "monty"
password = MYSQL_PASSWORD
db = "nyc"
port = 3306

# Number of parsed rows handed to each iteration of the load loop.
CHUNK_ROWS = 1000000

# The password is URL-encoded so characters such as '@', ':' or '/' cannot
# corrupt the connection URL, and the configured port is passed explicitly
# instead of being defined but ignored.
engine = sqlalchemy.create_engine(
    f'mysql+mysqlconnector://{user}:{quote_plus(password)}@{host}:{port}/{db}'
)

# Read the data and load it into MySQL in chunks
try:
    # A single chunked reader walks the file exactly once and stops at its
    # real end. Re-opening the file with skiprows/nrows per chunk would rescan
    # it from the top every time, would need a hard-coded row bound, and would
    # load boundary rows twice whenever a malformed line is skipped (skiprows
    # counts raw lines, nrows counts successfully parsed rows).
    # NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
    # replace it with `on_bad_lines='skip'` when upgrading pandas.
    reader = pd.read_csv('fec_data/itcont_2020.txt', sep="|", header=None,
                         error_bad_lines=False, chunksize=CHUNK_ROWS)
    try:
        for chunk_number, data in enumerate(reader):
            data.columns = header_list
            # Keep only contributions whose STATE column is New York.
            df = data[data['STATE'] == 'NY'].reset_index(drop=True)

            # One transaction per chunk: committed on success, rolled back on
            # error, so chunks already loaded survive a later failure.
            with engine.begin() as conn:
                df.to_sql(con=conn, name='individual_donations', if_exists='append', index=False)
            # Progress marker: parsed-row offset at which this chunk started.
            print(chunk_number * CHUNK_ROWS)
    finally:
        reader.close()
finally:
    # Release pooled connections once this cell is done with the database.
    engine.dispose()

  interactivity=interactivity, compiler=compiler, result=result)


0
1000000
2000000
3000000


  interactivity=interactivity, compiler=compiler, result=result)


4000000


  interactivity=interactivity, compiler=compiler, result=result)


5000000


b'Skipping line 6328020: expected 21 fields, saw 22\n'


6000000
7000000
8000000
9000000


  interactivity=interactivity, compiler=compiler, result=result)


10000000


  interactivity=interactivity, compiler=compiler, result=result)


11000000
12000000


  interactivity=interactivity, compiler=compiler, result=result)


13000000
14000000


  interactivity=interactivity, compiler=compiler, result=result)


15000000
16000000
17000000
18000000
19000000


  interactivity=interactivity, compiler=compiler, result=result)


20000000


  interactivity=interactivity, compiler=compiler, result=result)


21000000


  interactivity=interactivity, compiler=compiler, result=result)


22000000
23000000
