# Graph construction and feature selection

In this notebook, we detail all the steps followed to build the adjacency matrices that will be used later in our project.

In [1]:
import numpy as np
import pandas as pd
import zipfile
from IPython.display import display, HTML

## Processing votes

In [2]:
def interpret_position(positions):
    """
    Encode the vote positions passed in parameter as numbers.

    "Yes" becomes 1, "No" becomes -1 and every other value (including
    missing ones) becomes 0. The input Series is left untouched; a new
    Series with the same index is returned.
    """
    codes = {"Yes": 1, "No": -1}

    # Anything that is not an explicit Yes/No falls back to the neutral 0
    return positions.apply(lambda vote: codes.get(vote, 0))

# Columns kept by process_vote_file once a vote file has been sanitized
USEFUL_COLUMNS = [
    'id',            # unique vote ID built in process_vote_file
    'bill.number',
    'congress',
    'member_id',
    'question',
    'position',
    'timestamp',     # built in process_vote_file from the date and time columns
    'bill.bill_id',
]

def process_vote_file(votes_zip_object, file_path, since_congress=115):
    """
    Sanitizes the votes data collected through the ProPublica API

    The CSV member `file_path` of the archive is read, rows lacking a bill
    number or a roll call are dropped, a unique vote ID and a timestamp are
    built, duplicated votes are removed, only `USEFUL_COLUMNS` are kept and
    the positions are converted to numbers with `interpret_position`.

    Parameters
    ----------
    votes_zip_object :
        zip file containing the votes
    file_path : String
        CSV file path in the zip file that we would like to collect
    since_congress : int (optional)
        votes belonging to an older congress meeting (< since_congress)
        are filtered out

    Returns
    -------
    Dataframe
        sanitized and filtered votes
    """
    print("Processing vote data for {file}".format(file=file_path), end='\r')

    # Read data; the context manager releases the archive member handle
    # even if the CSV parsing fails
    with votes_zip_object.open(file_path) as ifile:
        df = pd.read_csv(ifile)

    # We drop entries without bill number as they lead to invalid bill IDs,
    # and entries without roll call value
    df = df[df['bill.number'].notna() & df['roll_call'].notna()]

    # As there can be several votes per bill we need to build a unique ID.
    # `assign` returns a new frame, so no column is ever written into a
    # filtered view (which would raise pandas' SettingWithCopyWarning).
    df = df.assign(
        id=df['bill.bill_id'].map(str) + "-" + df['roll_call'].map(str) + "-" + df['session'].map(str)
    )

    # Some bill IDs and roll call numbers are poorly filled, leading to duplicates
    df = df.drop_duplicates('id')

    # Build timestamps
    df = df.assign(timestamp=pd.to_datetime(df['date'].map(str) + " " + df['time'].map(str)))

    # Keep only useful columns
    df = df[USEFUL_COLUMNS]

    # Keep only relevant congresses
    df = df[df['congress'] >= since_congress]

    # Convert position to numeric
    return df.assign(position=interpret_position(df['position']))

# Labels of the `question` column, used to select votes in build_vote_matrix

# Nominations
NOMINATION_VOTE = "On the Nomination"

# Motions
MOTION_VOTE = "On the Motion"
MOTION_TO_TABLE_VOTE = "On the Motion to Table"
CLOTURE_MOTION_VOTE = "On the Cloture Motion"

# Amendments
AMENDMENT_VOTE = "On the Amendment"

# Passage of bills and resolutions
BILL_PASSAGE = "On Passage of the Bill"
ON_RESOLUTION = "On the Resolution"
ON_CONC_RESOLUTION = "On the Concurrent Resolution"
ON_RAT_RESOLUTION = "On the Resolution of Ratification"

def build_vote_matrix(questions, data):
    """
    Pivot vote records into a senators x votes matrix.

    Rows are indexed by senator ID, columns by vote ID, and each cell holds
    the position recorded for that senator on that vote. Cells for
    (senator, vote) pairs that do not appear in `data` are NaN.

    When `questions` is non-empty, only the records whose `question` value
    is in that list are kept; an empty list keeps every record.

    Data should contain the following columns:
        - senators ID (member_id)
        - vote ID (id)
        - position (position)
        - question (question), read only when `questions` is non-empty
    """
    # An empty selection means "no filtering", not "no votes"
    selected = data[data['question'].isin(questions)] if len(questions) > 0 else data

    return selected.pivot(index='member_id', columns='id', values='position')

## Read and compile votes per senators in a matrix

In [3]:
# Open the votes zip file, list its content and sanitize every votes file.
# The context manager closes the archive once all its members have been read
# (ZIPPED_VOTES stays bound afterwards, but refers to a closed archive).
with zipfile.ZipFile('data/votes/votes.zip') as ZIPPED_VOTES:
    vote_data_files = [x.filename for x in ZIPPED_VOTES.infolist() if x.filename.startswith('votes_')]

    # parse all votes, one sanitized dataframe per file
    df_list = [process_vote_file(ZIPPED_VOTES, filepath) for filepath in vote_data_files]

# Compile the votes of all the files in a single dataframe and save it
raw_votes = pd.concat(df_list)
raw_votes.to_pickle("data/processed/processed_vote_data.pickle")

# Preview only the first rows rather than dumping the whole dataframe
raw_votes.head(10)

Processing vote data for votes_Y000064.csv

Unnamed: 0,id,bill.number,congress,member_id,question,position,timestamp,bill.bill_id
0,hr695-115-274-2,H.R.695,115,A000360,On the Motion to Proceed,1,2018-12-21 12:31:00,hr695-115
1,hr695-115-273-2,H.R.695,115,A000360,On the Motion,-1,2018-12-19 21:30:00,hr695-115
2,pn2209-115-272-2,PN2209,115,A000360,On the Cloture Motion,1,2018-12-19 11:22:00,pn2209-115
3,s756-115-271-2,S.756,115,A000360,On the Motion,1,2018-12-18 20:22:00,s756-115
4,s756-115-270-2,S. 756,115,A000360,On the Amendment,-1,2018-12-18 20:02:00,s756-115
5,s756-115-269-2,S. 756,115,A000360,On the Amendment,-1,2018-12-18 19:43:00,s756-115
6,s756-115-268-2,S. 756,115,A000360,On the Amendment,-1,2018-12-18 19:01:00,s756-115
7,s756-115-267-2,S.756,115,A000360,On the Cloture Motion,0,2018-12-17 17:22:00,s756-115
8,sjres54-115-266-2,S.J.Res.54,115,A000360,On the Joint Resolution,-1,2018-12-13 15:03:00,sjres54-115
9,sjres54-115-265-2,S.J.Res. 54,115,A000360,On the Amendment,1,2018-12-13 14:45:00,sjres54-115


In this part, we construct two feature matrices whose rows correspond to senators and whose columns correspond to selected votes. Each matrix takes a different set of votes into account: the first one includes only votes on bill and resolution passage, while the second one also includes motion votes. The reasons behind our choice will become clearer in the next sections.

In [4]:
# Questions related to the passage of a bill or of a resolution
passage_questions = [BILL_PASSAGE, ON_RESOLUTION, ON_CONC_RESOLUTION, ON_RAT_RESOLUTION]

# Matrix 1 uses the passage votes only, matrix 2 also includes the motion votes.
# Positions missing from the pivot are replaced by 0.
vote_matrix_1 = build_vote_matrix(passage_questions, raw_votes).fillna(0)
vote_matrix_2 = build_vote_matrix(passage_questions + [MOTION_VOTE], raw_votes).fillna(0)

# Save each matrix together with the senators IDs of its rows
vote_matrix_1.to_pickle("data/processed/processed_votes_1.pickle")
vote_matrix_2.to_pickle("data/processed/processed_votes_2.pickle")
np.save("data/processed/processed_senators_ids_1.npy", vote_matrix_1.index.values)
np.save("data/processed/processed_senators_ids_2.npy", vote_matrix_2.index.values)

# Preview the first rows of both matrices under a centered title
for title, matrix in (
    ("<center><b> Feature Matrix 1 </b></center>", vote_matrix_1),
    ("<center><b> Feature Matrix 2 </b></center>", vote_matrix_2),
):
    display(HTML(title))
    display(matrix.head())



id,hconres71-115-245-1,hjres38-115-43-1,hr1-115-303-1,hr1865-115-60-2,hr2-115-143-2,hr2430-115-187-1,hr2810-115-199-1,hr3364-115-175-1,hr5515-115-128-2,hr5895-115-139-2,...,hr72-115-28-1,s2155-115-54-2,s2554-115-209-2,s722-115-147-1,s84-115-27-1,s89-115-102-1,sconres3-115-26-1,sres176-115-138-1,sres584-115-162-2,treatydoc.114-12-115-98-1
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A000360,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
B000575,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
B000944,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0
B001135,1.0,1.0,1.0,1.0,-1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
B001230,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,-1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,1.0


id,hconres71-115-224-1,hconres71-115-227-1,hconres71-115-231-1,hconres71-115-234-1,hconres71-115-245-1,hjres38-115-43-1,hr1-115-285-1,hr1-115-286-1,hr1-115-287-1,hr1-115-288-1,...,sconres3-115-26-1,sconres3-115-4-1,sconres3-115-5-1,sconres3-115-6-1,sconres3-115-7-1,sconres3-115-8-1,sconres3-115-9-1,sres176-115-138-1,sres584-115-162-2,treatydoc.114-12-115-98-1
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A000360,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0
B000575,0.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,0.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0
B000944,1.0,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,-1.0,1.0,-1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0
B001135,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0
B001230,1.0,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,-1.0,1.0,-1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0


# Processing bills

In [5]:
# Unique IDs of the bills appearing in the votes, as a column-less frame indexed by bill ID
bill_ids = raw_votes['bill.bill_id'].drop_duplicates().to_frame().set_index('bill.bill_id')

# Summaries of the active bills
df_bills = pd.read_csv("data/bills/active_bills.csv")[['bill_id', 'summary', 'summary_short']]

# Left join: every voted bill is kept, whether or not a description is available for it
bill_descr = bill_ids.merge(df_bills, left_index=True, right_on='bill_id', how='left').set_index('bill_id')
bill_descr.to_csv("data/processed/processed_bill_descriptions.csv")

# Processing senators

The table below lists all the senators of the 115th Congress.

In [6]:
# Load the senators file, keep the identity / party columns and give them readable headers
roster_columns = ['id', 'first_name', 'last_name', 'party', 'votes_with_party_pct']
df = pd.read_csv("data/senate_members/senate_members_115.csv")[roster_columns].rename(columns={
    'first_name': "First name",
    'last_name': "Last name",
    'votes_with_party_pct': "Percentage of votes position similar to the same party members",
})
df


Unnamed: 0,id,First name,Last name,party,Percentage of votes position similar to the same party members
0,A000360,Lamar,Alexander,R,96.15
1,B001230,Tammy,Baldwin,D,94.46
2,B001261,John,Barrasso,R,96.32
3,B001267,Michael,Bennet,D,90.74
4,B001277,Richard,Blumenthal,D,90.94
5,B000575,Roy,Blunt,R,98.29
6,B001288,Cory,Booker,D,84.94
7,B001236,John,Boozman,R,98.32
8,B000944,Sherrod,Brown,D,93.80
9,B001135,Richard,Burr,R,97.74


In [7]:
# Sanitize senators names: index the senators by ID and add a "first last" full name column
raw_senators = (
    pd.read_csv("data/senate_members/senate_members_115.csv")[
        ['id', 'party', 'first_name', 'last_name', 'votes_with_party_pct']
    ]
    .set_index('id')
    .assign(name=lambda members: members['first_name'].map(str) + " " + members['last_name'].map(str))
)

# Keep the senators present in the first vote matrix, in the order of its rows
senators = raw_senators.loc[vote_matrix_1.index]
senators.to_pickle("data/processed/processed_senators.pickle")

# Saving party info
party_labels = senators['party'].values
np.save("data/processed/processed_labels.npy", party_labels)

# Saving numerical values for party
dict_mapping = {'R': 0., 'D': 0.12, 'I': 0.223}
labels = np.vectorize(dict_mapping.get)(party_labels)
np.save("data/processed/processed_labels_numerical.npy", labels)