In [1]:
import numpy as np
import pandas as pd
import zipfile
from IPython.display import display, HTML

# Processing votes

In [2]:
def interpret_position(positions):
    """
    Convert a Series of textual vote positions into numeric scores.

    "Yes" becomes 1, "No" becomes -1 and every other value (including
    missing values) becomes 0. The input Series is left untouched; the
    returned Series keeps the same index and name.
    """
    def _to_score(answer):
        # Exact, case-sensitive comparison against the two decisive answers.
        if answer == "Yes":
            return 1
        if answer == "No":
            return -1
        return 0

    return positions.apply(_to_score)

# Columns retained by process_vote_file once the raw CSV has been cleaned.
USEFUL_COLUMNS = [
    'id',
    'bill.number',
    'congress',
    'member_id',
    'question',
    'position',
    'timestamp',
]

def process_vote_file(file_path, since_congress=115, archive=None):
    """
    Load one vote CSV from a zip archive and return it as a cleaned DataFrame.

    Parameters
    ----------
    file_path : str
        Name of the CSV member inside the archive.
    since_congress : int, default 115
        Rows whose `congress` value is lower than this are discarded.
    archive : zipfile.ZipFile-like, optional
        Object exposing `open(name)`. When omitted, the module-level
        `ZIPPED_VOTES` archive is used (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        One row per unique vote ID with the columns listed in USEFUL_COLUMNS;
        `position` is numeric (see interpret_position) and `timestamp` is a
        datetime built from the `date` and `time` columns.

    The CSV must provide the columns `bill.number`, `bill.bill_id`,
    `roll_call`, `date`, `time`, `congress`, `member_id`, `question` and
    `position`.
    """
    if archive is None:
        archive = ZIPPED_VOTES

    print("Processing vote data for {file}".format(file=file_path), end='\r')

    # Close the archive member as soon as the CSV has been parsed.
    with archive.open(file_path) as ifile:
        df = pd.read_csv(ifile)

    # Entries without a bill number lead to invalid bill IDs, and entries
    # without a roll call value cannot be told apart: drop both.
    has_keys = df['bill.number'].notna() & df['roll_call'].notna()

    # `assign` works on a fresh copy at each step, so no column is ever written
    # onto a filtered slice (which would raise SettingWithCopyWarning).
    # NOTE(review): if `roll_call` contains missing values pandas parses the
    # column as float, so IDs would end in ".0" - confirm the inputs are clean.
    votes = (
        df[has_keys]
        # Several votes can exist per bill, so the ID combines bill and roll call.
        .assign(id=lambda d: d['bill.bill_id'].map(str) + "-" + d['roll_call'].map(str))
        # Some bill IDs and roll call numbers are poorly filled, leading to duplicates.
        .drop_duplicates('id')
        .assign(timestamp=lambda d: pd.to_datetime(d['date'].map(str) + " " + d['time'].map(str)))
        .loc[:, USEFUL_COLUMNS]
    )

    # Keep only relevant congresses
    votes = votes[votes['congress'] >= since_congress]

    # Convert position to numeric
    return votes.assign(position=interpret_position(votes['position']))

# Labels matched against the `question` column when selecting votes
# (see build_vote_matrix).

# Votes on the substance of a bill, amendment or nomination.
BILL_PASSAGE = "On Passage of the Bill"
AMENDMENT_VOTE = "On the Amendment"
NOMINATION_VOTE = "On the Nomination"

# Votes on motions.
MOTION_VOTE = "On the Motion"
MOTION_TO_TABLE_VOTE = "On the Motion to Table"
CLOTURE_MOTION_VOTE = "On the Cloture Motion"

def build_vote_matrix(questions, data):
    """
    Build a senators x votes matrix of positions.

    Parameters
    ----------
    questions : iterable of str, str or None
        Question labels to keep (matched against the `question` column).
        A single string is treated as a one-element list. An empty iterable
        or None keeps every vote.
    data : pandas.DataFrame
        Must contain the following columns:
            - senators ID (member_id)
            - vote ID (id)
            - position (position)
            - question (question), only read when `questions` is non-empty

    Returns
    -------
    pandas.DataFrame
        Index are senators IDs, columns are vote IDs and values are the
        entries of the `position` column. A cell is NaN when `data` holds no
        row for that (senator, vote) pair.

    Raises
    ------
    ValueError
        Raised by `DataFrame.pivot` when the same (member_id, id) pair occurs
        more than once in the selected rows.
    """
    if questions is None:
        wanted = []
    elif isinstance(questions, str):
        # `isin` rejects a bare string, so wrap it.
        wanted = [questions]
    else:
        wanted = list(questions)

    selected = data[data['question'].isin(wanted)] if wanted else data

    return selected.pivot(columns='id', index='member_id', values='position')

## Read and compile votes per senator into a matrix

In [3]:
# Archive read by process_vote_file; only members named "votes_*" are vote files.
ZIPPED_VOTES = zipfile.ZipFile('data/votes/votes.zip')
vote_data_files = [name for name in ZIPPED_VOTES.namelist() if name.startswith('votes_')]

# One cleaned DataFrame per vote file, stacked into a single long table.
df_list = [process_vote_file(filepath) for filepath in vote_data_files]

raw_votes = pd.concat(df_list)
display(raw_votes.head(1))

Processing vote data for votes_A000355.csv

Unnamed: 0,id,bill.number,congress,member_id,question,position,timestamp
0,hr6-115-221,H.R.6,115,A000360,On the Motion,1,2018-10-03 15:18:00
1,hr302-115-220,H.R.302,115,A000360,On the Motion,1,2018-10-03 12:01:00
2,hr302-115-219,H.R.302,115,A000360,On the Cloture Motion,1,2018-10-01 17:31:00
3,pn1677-115-218,PN1677,115,A000360,On the Nomination,1,2018-09-27 12:41:00
4,pn2073-115-217,PN2073,115,A000360,On the Nomination,1,2018-09-26 13:46:00


In [11]:
# Keep only the final-passage votes, then drop every senator (row) that has a
# NaN for at least one of them. The axis is passed by keyword because
# pandas >= 2.0 no longer accepts it positionally.
vote_matrix = build_vote_matrix([BILL_PASSAGE], raw_votes).dropna(axis=0)
display(vote_matrix)

# Persist the matrix and, separately, the senator IDs in matrix row order.
vote_matrix.to_pickle("data/processed/processed_votes.pickle")
np.save("data/processed/processed_senators_ids.npy", vote_matrix.index.values)

id,hjres38-115-43,hr1-115-303,hr1865-115-60,hr2-115-143,hr2430-115-187,hr2810-115-199,hr3364-115-175,hr5515-115-128,hr5895-115-139,hr6-115-210,hr6147-115-180,hr6157-115-193,hr72-115-28,s2155-115-54,s2554-115-209,s722-115-147,s84-115-27,s89-115-102
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A000360,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1
B000575,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
B000944,-1,-1,1,1,1,1,1,1,1,1,1,1,1,-1,1,1,1,1
B001135,1,1,1,-1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
B001230,-1,-1,1,1,1,1,1,1,1,1,1,1,1,-1,1,1,-1,1
B001236,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
B001261,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
B001267,-1,-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
B001277,-1,-1,1,1,1,1,1,1,1,1,1,1,1,-1,1,1,-1,-1
B001288,-1,-1,1,1,1,1,1,1,0,1,1,1,1,-1,1,1,-1,1


# Processing bills

# Processing senators

In [21]:
# Party of each senator kept in the vote matrix, in vote-matrix row order.
# `.loc` raises a KeyError if a senator from the matrix is absent from the CSV.
raw_senators = pd.read_csv("data/senate_members/senate_members_115.csv")[['id', 'party']].set_index('id')
senators = raw_senators.loc[vote_matrix.index]
np.save("data/processed/processed_labels.npy", senators['party'].values)

# Numeric encoding of the party codes.
dict_mapping = {'R':-1,'D':1,'I':0}
numeric_party = senators['party'].map(dict_mapping)

# Fail loudly on party codes the mapping does not know, instead of letting a
# None/NaN label slip into the saved array.
unmapped = senators.loc[numeric_party.isna(), 'party']
if not unmapped.empty:
    raise ValueError(
        "Unknown party code(s) {codes} for senator(s) {ids}".format(
            codes=unmapped.unique().tolist(), ids=unmapped.index.tolist()
        )
    )

labels = numeric_party.astype(int).values
np.save("data/processed/processed_labels_numerical.npy", labels)