In [616]:
import pandas as pd
import os
import yaml
import urllib.request as requests
import json
import re
from bs4 import BeautifulSoup as bs
import numpy as np
from collections import Counter

In [617]:
# set directory
os.chdir("/Users/mathiasrask/Desktop/kandidat/E2020/Advanced-Quantitative-Methods/")

# define url
url = "https://theunitedstates.io/congress-legislators/"

# define name of files to be parsed
social_media = 'legislators-social-media.yaml'
historical   = 'legislators-historical.yaml'
current      = 'legislators-current.yaml'

In [618]:
# Dump file in YAML-format 
#soup = bs(content)
#with open('social-media-accounts/legislators-historical.yaml', 'w') as file:
#    documents = yaml.dump(soup, file)

In [619]:
def read_yaml(url):
    url = requests.urlopen(url)               # connect to url
    content = url.read().decode('utf-8')      # read and decode to UTF-8 format 
    return content

def clean_yaml(string, file:str):
    content = string.split('\n')
    content = [x.lstrip("' '|>-") for x in content]
    
    if file == 'social-media':
        content = content[18:]
    if file == 'legislators':
        content[0] = re.sub('[ b\'-]', "", content[0])
    return content

def split_list(match:list, string, within_legislator=False):
    def match_function():
        matches = [re.match(match, x) for x in string]
        matches = [x is not None for x in matches]    
        return np.cumsum(matches).tolist()
    
    indices = match_function()
    
    def split_indices():
        results = []
        for idx, val in enumerate(indices):
            if idx == 0:
                results.append([val])
                continue
            if val > indices[idx - 1]:
                results.append([val])
            else:
                results[-1].append(val)
        return np.cumsum([len(x) for x in results]).tolist()
    
    #if within_legislator == False:
    #    indices = split_indices()
    #    indices = np.cumsum([len(x) for x in indices]).tolist()
    #if within_legislator == True:
    #    indices = split_indices()
    #    indices = np.cumsum([len(x) for x in indices]).tolist()
    indices = split_indices()
    def decomposition():
        decomposed_list = []
        for idx, val in enumerate(indices):
            if idx == 0:
                decomposed_list.append(string[0:val])
            else:
                current  = indices[idx]
                previous = indices[idx-1] 
                decomposed_list.append(string[previous:current])
        return decomposed_list
    
    final = decomposition()
    
    # remove potential empty elements
    final = [[x for x in l if x] for l in final]
    
    return final

def get_index_positions(list_object, element):
    """
    Returns the indexes of all occurrences of an element in
    a list.
    """  
    index_pos_list = []
    index_pos = 0
    while True:
        try:
            # Search for item in list from indexPos to the end of list
            index_pos = list_object.index(element, index_pos)
            # Add the index position in list
            index_pos_list.append(index_pos)
            index_pos += 1
        except ValueError as e:
            break
    return index_pos_list

def del_list_numpy(l, id_to_del):
    """
    Delete indicies from list using numpy's delete-function.
    """
    arr = np.array(l)
    return list(np.delete(arr, id_to_del))


def delete_party_affiliations(list_object):
    for idx, val in enumerate(list_object):
        
        party_aff = [re.match('party_affiliations:',x) is not None for x in list_object[idx]['terms:']]
        type_     = [re.match('type:',x) is not None for x in list_object[idx]['terms:']]
        
        type_indices      = get_index_positions(type_, True)
        party_aff_indices = get_index_positions(party_aff, True)
        
        distances = []
        indices = []
        
        for idx1, val1 in enumerate(party_aff_indices):
            temp = []
            for idx2, val2 in enumerate(type_indices):
                temp.append(val2 - val1)
            
            if max(temp) > 0:
                index = temp.index(min([i for i in temp if i > 0]))
                indices = indices + list(range(val1, type_indices[index]))
                distances.append(index)
            
            if max(temp) < 0:
                indices = indices + list(range(max(party_aff_indices), len(list_object[idx]['terms:'])))
        
        list_object[idx]['terms:'] = del_list_numpy(list_object[idx]['terms:'], indices)    
    
    return list_object

def clean_bio(list_object, element:str):    
    """
    Function to clean-up fec-numbers and id to biographical information
    """
    for idx, val in enumerate(list_object):
        index = [re.match(element,x) for x in val]

        if np.sum([x!=None for x in index]) > 0:
            element_index = [x!=None for x in index].index(True)
            if element == 'fec:':
                matches = [re.match('[A-Z0-9]{9}', val[i]) for i in range(element_index+0, element_index+4)]

            if element == 'bioguide_previous:':
                matches = [re.match('[A-Z0-9]{7}', val[i]) for i in range(element_index+0, element_index+4)]

            index_match = get_index_positions([x!=None for x in matches], True)

            N_matches = np.sum([x is not None for x in matches])

            if element == 'fec:':
                element_numbers = [matches[index_match[i]].string for i in range(N_matches)]
                element_numbers = '|'.join(element_numbers)
                list_object[idx][element_index] = list_object[idx][element_index] + ' ' + element_numbers

            if element == 'bioguide_previous:':
                for i in range(N_matches):
                    list_object[idx][element_index] = list_object[idx][element_index] + ' ' + matches[index_match[i]].string

            list_object[idx] = del_list_numpy(list_object[idx], [i+element_index for i in index_match])
    return list_object


def get_row_indices(list_object, match:str):
    return np.cumsum([re.match(match,x[0]) is not None for x in list_object])-1

In [620]:
# Function to move bioguide into the second element of the list
def clean_bioguide():
    # For-loop to make sure that bioguide is at top
    #for idx, val in enumerate(between_legislator):
    matches = [re.match('bioguide: (.*)', x) for x in val]
    boolean = [x is not None for x in matches]

    bioguide = [x.string for x in matches if x is not None][0]
    bio_position = get_index_positions(boolean, True)[0]

    if bio_position != 1:
        replace_element = between_legislator[idx][1]
        between_legislator[idx][1] = bioguide
        between_legislator[idx][bio_position] = replace_element
    return

# Function to assure that "name:" only figures once
def clean_family():
    matches = [re.match("name:", x) for x in val]
    indices = get_index_positions([x!=None for x in matches], True)

    # get indices for the actual name element
    name_indices = get_index_positions([x.group()==x.string for x in matches if x is not None], False)

    # keep only indices to be changed
    indices = [indices[x] for x in name_indices]

    # construct pairs of the name-title and the name of the fam. member
    match_pairs = [re.match('(.*): (.*)', between_legislator[idx][x]).groups() for x in indices]
    
    # replace family names
    for i,v in enumerate(indices):
        between_legislator[idx][v] = f"{match_pairs[i][0]}_fam: {match_pairs[i][1]}"
    return

# Historical Legislators

In this section, I prepare the data for historical legislators, that is legislators not in the current US Congress (116th). 

In [621]:
# Read yaml-file for historical data
content = read_yaml(url=f"{url}{historical}")
content = clean_yaml(content, file='legislators')

# Split list into sub-lists for each legislator
between_legislator = split_list(match="id:", string=content)

In [622]:
# Make sure bioguide is on top and clean the family section
for idx, val in enumerate(between_legislator):
    clean_bioguide()
    clean_family()

In [623]:
# delete contact information from list
delete_info = [[re.match('address:|phone:|fax:|contact_form:|office:|other_names:|rss_url:',x) for x in l] for l in between_legislator]
boolean = [[x is None for x in c] for c in delete_info]
delete_indices = [get_index_positions(x, False) for x in boolean]
between_legislator = [del_list_numpy(between_legislator[idx], delete_indices[idx]) for idx in range(len(between_legislator))]

# Clean up fec and bioguide 
between_legislator = clean_bio(between_legislator, element='fec:')
between_legislator = clean_bio(between_legislator, element='bioguide_previous:')

In [229]:
# Split list into sub-lists for each legislator
#between_legislator = split_list(match="id:", string=content)
#
# Delete contact information for each legislator
#contact = [[re.match('address:|phone:|fax:|contact_form:|office:|other_names:|rss_url:|family:|relation:',x) for x in l] for l in between_legislator]
#subset_index = [[x is None for x in c] for c in contact]
#contact = [[x.string for x in c if x is not None] for c in contact]
#delete_indices = [get_index_positions(x, False) for x in subset_index]
#between_legislator = [del_list_numpy(between_legislator[idx], delete_indices[idx]) for idx in range(len(between_legislator))]
#
# Clean up fec and bioguide
#between_legislator = clean_bio(between_legislator, element='fec:')
#between_legislator = clean_bio(between_legislator, element='bioguide_previous:')

In [230]:
# code to delete all family names
#matches = [re.match("name:", x) for x in between_legislator]
#indices = get_index_positions([x!=None for x in matches], True)
#name_matches = get_index_positions([matches[x].group()=='name:' for x in indices], True)
#name_indices = [indices[x] for x in name_matches]
#change_name_index = [x for x in name_indices if matches[x].group()!=matches[x].string]
#del_list_numpy(between_legislator, change_name_index)

In [624]:
# Construct a list for each type of information for each legislator.
# I delete party affiliations within the 'terms'-list to avoid complications.
#within_legislator = [split_list(match="id:|name:|bio:|terms:|leadership_roles:|family:", string=x) for x in between_legislator]
within_legislator = [split_list(match="id:|name:|bio:|terms:|leadership_roles:|family:", string=x) for x in between_legislator]
within_legislator = [{x[0]:x[1:] for x in l} for l in within_legislator]
within_legislator = delete_party_affiliations(within_legislator)

# Constuct column-row pairs for each legislator
for idx0, val0 in enumerate(within_legislator):
    for idx1, val1 in enumerate(val0):
        within_legislator[idx0][val1] = [re.match('(.*): (.*)', x).groups() for x in within_legislator[idx0][val1]] 
        

for idx in range(len(within_legislator)):
    boolean_end = [re.match('end', x) is not None for x,y in within_legislator[idx]['name:']]
    if sum(boolean_end) > 0:
        end_position = get_index_positions(boolean_end, True)[0]
        del within_legislator[idx]['name:'][end_position]

for idx in range(len(within_legislator)):
    boolean_middle = [re.match('~', y) is not None for x,y in within_legislator[idx]['name:']]
    if sum(boolean_middle) > 0:
        middle_position = get_index_positions(boolean_middle, True)[0]
        del within_legislator[idx]['name:'][middle_position]

for idx, val in enumerate(within_legislator):
    last_positions = get_index_positions([re.match('last', x) is not None for x,y in within_legislator[idx]['name:']],True)
    if len(last_positions) > 1:
        max_last = max(last_positions)
        del within_legislator[idx]['name:'][max_last]

In [645]:
# Loop through each legislator and create a dataframe for each of them 
df_leader_list = []
df_family_list = []
df_list   = []

for idx0, val0 in enumerate(within_legislator):
    if idx0%500 == 0:
        print(idx0)
    for idx1, val1 in enumerate(val0):
        subset = within_legislator[idx0][val1]
        
        if val1 == 'id:':
            row_index = get_row_indices(subset, match='bioguide')
        if val1 == 'name:':
            row_index = get_row_indices(subset, match='first')
        if val1 == 'bio:':
            row_index = get_row_indices(subset, match='birthday')    
        if val1 == 'terms:':
            row_index = get_row_indices(subset, match='type')
        
        if val1 == 'leadership_roles:':
            row_index = get_row_indices(subset, match='title')
            temp = pd.DataFrame()
            for idx2, val2 in enumerate(row_index):
                temp.loc[val2, within_legislator[idx0]['id:'][0][0]] = within_legislator[idx0]['id:'][0][1]
                temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
            df_leader_list.append(temp)
        
        elif val1 == 'family:':
            row_index = get_row_indices(subset, match='name_fam')
            temp = pd.DataFrame()
            for idx2, val2 in enumerate(row_index):
                temp.loc[val2, within_legislator[idx0]['id:'][0][0]] = within_legislator[idx0]['id:'][0][1]
                temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
            df_family_list.append(temp)
        
        else:
            if val1 == 'id:':
                temp = pd.DataFrame()
                for idx2, val2 in enumerate(row_index):
                    temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
                df = temp
            else:
                temp = pd.DataFrame()
                for idx2, val2 in enumerate(row_index):
                    temp.loc[val2, within_legislator[idx0]['id:'][0][0]] = within_legislator[idx0]['id:'][0][1]
                    temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
                df = df.merge(temp, on=['bioguide'])
    
    gender_match   = [re.match('gender',  x) is not None for x in list(df.columns)]
    birthday_match = [re.match('birthday',x) is not None for x in list(df.columns)]
    if sum(gender_match) == 1:
        df['gender']   = df['gender'].fillna(method='ffill')
    if sum(birthday_match) == 1:
        df['birthday'] = df['birthday'].fillna(method='bfill')
    
    if (sum(gender_match) == 1) or (sum(birthday_match)==1):
        df = df[df.duplicated()]
    
    # append df to df-list
    df_list.append(df)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500


In [646]:
df = pd.concat(df_list, ignore_index=True)
df_family = pd.concat(df_family_list, ignore_index=True)
df_leader = pd.concat(df_leader_list, ignore_index=True)

df = df.drop(columns=['how'])

In [647]:
# save to csv
df.to_csv('data/historical-legislators.csv', index=False,sep=',')
df_family.to_csv('data/historical-legislators-family.csv', index=False,sep=',')
df_leader.to_csv('data/historical-legislators-leader.csv', index=False,sep=',')

# Current Legislators

In [648]:
# Read yaml-file for historical data
content = read_yaml(url=f"{url}{current}")
content = clean_yaml(content, file='legislators')

# Split list into sub-lists for each legislator
between_legislator = split_list(match="id:", string=content)

In [649]:
# Make sure bioguide is on top and clean the family section
for idx, val in enumerate(between_legislator):
    clean_bioguide()
    clean_family()

In [650]:
# delete contact information from list
delete_info = [[re.match('address:|phone:|fax:|contact_form:|office:|other_names:|rss_url:',x) for x in l] for l in between_legislator]
boolean = [[x is None for x in c] for c in delete_info]
delete_indices = [get_index_positions(x, False) for x in boolean]
between_legislator = [del_list_numpy(between_legislator[idx], delete_indices[idx]) for idx in range(len(between_legislator))]

# Clean up fec and bioguide 
between_legislator = clean_bio(between_legislator, element='fec:')
between_legislator = clean_bio(between_legislator, element='bioguide_previous:')

In [651]:
# Construct a list for each type of information for each legislator.
# I delete party affiliations within the 'terms'-list to avoid complications.
#within_legislator = [split_list(match="id:|name:|bio:|terms:|leadership_roles:|family:", string=x) for x in between_legislator]
within_legislator = [split_list(match="id:|name:|bio:|terms:|leadership_roles:|family:", string=x) for x in between_legislator]
within_legislator = [{x[0]:x[1:] for x in l} for l in within_legislator]
within_legislator = delete_party_affiliations(within_legislator)

# Constuct column-row pairs for each legislator
for idx0, val0 in enumerate(within_legislator):
    for idx1, val1 in enumerate(val0):
        within_legislator[idx0][val1] = [re.match('(.*): (.*)', x).groups() for x in within_legislator[idx0][val1]] 
    
for idx in range(len(within_legislator)):
    boolean_end = [re.match('end', x) is not None for x,y in within_legislator[idx]['name:']]
    if sum(boolean_end) > 0:
        end_position = get_index_positions(boolean_end, True)[0]
        del within_legislator[idx]['name:'][end_position]

for idx in range(len(within_legislator)):
    boolean_middle = [re.match('~', y) is not None for x,y in within_legislator[idx]['name:']]
    if sum(boolean_middle) > 0:
        middle_position = get_index_positions(boolean_middle, True)[0]
        del within_legislator[idx]['name:'][middle_position]

for idx, val in enumerate(within_legislator):
    last_positions = get_index_positions([re.match('last', x) is not None for x,y in within_legislator[idx]['name:']],True)
    if len(last_positions) > 1:
        max_last = max(last_positions)
        del within_legislator[idx]['name:'][max_last]

In [652]:
# Loop through each legislator and create a dataframe for each of them 
df_leader_list = []
df_family_list = []
df_list   = []

for idx0, val0 in enumerate(within_legislator):
    if idx0%100 == 0:
        print(idx0)
    for idx1, val1 in enumerate(val0):
        subset = within_legislator[idx0][val1]
        
        if val1 == 'id:':
            row_index = get_row_indices(subset, match='bioguide')
        if val1 == 'name:':
            row_index = get_row_indices(subset, match='first')
        if val1 == 'bio:':
            row_index = get_row_indices(subset, match='birthday')    
        if val1 == 'terms:':
            row_index = get_row_indices(subset, match='type')
        
        if val1 == 'leadership_roles:':
            row_index = get_row_indices(subset, match='title')
            temp = pd.DataFrame()
            for idx2, val2 in enumerate(row_index):
                temp.loc[val2, within_legislator[idx0]['id:'][0][0]] = within_legislator[idx0]['id:'][0][1]
                temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
            df_leader_list.append(temp)
        
        elif val1 == 'family:':
            row_index = get_row_indices(subset, match='name_fam')
            temp = pd.DataFrame()
            for idx2, val2 in enumerate(row_index):
                temp.loc[val2, within_legislator[idx0]['id:'][0][0]] = within_legislator[idx0]['id:'][0][1]
                temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
            df_family_list.append(temp)
        
        else:
            if val1 == 'id:':
                temp = pd.DataFrame()
                for idx2, val2 in enumerate(row_index):
                    temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
                df = temp
            else:
                temp = pd.DataFrame()
                for idx2, val2 in enumerate(row_index):
                    temp.loc[val2, within_legislator[idx0]['id:'][0][0]] = within_legislator[idx0]['id:'][0][1]
                    temp.loc[val2, subset[idx2][0]] = subset[idx2][1]
                df = df.merge(temp, on=['bioguide'])
    
    df['gender']   = df['gender'].fillna(method='ffill')
    df['birthday'] = df['birthday'].fillna(method='bfill')
    df = df[df.duplicated()]
    
    # append df to df-list
    df_list.append(df)

0
100
200
300
400
500


In [653]:
df = pd.concat(df_list, ignore_index=True)
df_family = pd.concat(df_family_list, ignore_index=True)
df_leader = pd.concat(df_leader_list, ignore_index=True)

df = df.drop(columns=['how'])

In [654]:
# save to csv
df.to_csv('data/current-legislators.csv', index=False,sep=',')
df_family.to_csv('data/current-legislators-family.csv', index=False,sep=',')
df_leader.to_csv('data/current-legislators-leader.csv', index=False,sep=',')

# Social Media

In [655]:
# Read yaml-file for historical data
content = read_yaml(url=f"{url}{social_media}")
content = clean_yaml(content, file='legislators')

# Split list into sub-lists for each legislator
between_legislator = split_list(match="id:", string=content)

# delete first element of list
del between_legislator[0]

In [656]:
for idx, val in enumerate(between_legislator):
    clean_bioguide()

In [657]:
# Clean up fec and bioguide if needed to 
between_legislator = clean_bio(between_legislator, element='fec:')
between_legislator = clean_bio(between_legislator, element='bioguide_previous:')

In [658]:
within_legislator = [split_list(match="id:|social:", string=x) for x in between_legislator]
within_legislator = [{x[0]:x[1:] for x in l} for l in within_legislator]

In [659]:
for idx0, val0 in enumerate(within_legislator):
    for idx1, val1 in enumerate(val0):
        within_legislator[idx0][val1] = [re.match('(.*): (.*)', x).groups() for x in within_legislator[idx0][val1]] 

In [660]:
df_list = []
for idx0, val0 in enumerate(within_legislator):
    for idx1, val1 in enumerate(val0):
            temp = pd.DataFrame(within_legislator[idx0][val1]).T
            temp.columns = temp.iloc[0]
            temp = temp[1:]
            within_legislator[idx0][val1] = temp
    df_list.append(within_legislator[idx0]['id:'].join(within_legislator[idx0]['social:']))

In [661]:
df = pd.concat(df_list, ignore_index=True)
df.to_csv('data/social-media.csv', index=False,sep=',')

# Merge data

In [662]:
datatype = {"lis": str, "thomas":str ,"ballotpedia":str ,"cspan": str,
            "official_full": str, "opensecrets": str, "url":str,
            "fec": str, "state_rank": str, "district":str, 
            "house_history":str, "votesmart":str, "govtrack":str}

df_historical = pd.read_csv('data/historical-legislators.csv', dtype=datatype)
df_current    = pd.read_csv('data/current-legislators.csv',  dtype=datatype)

datatype = {"thomas":str, "govtrack":str, "twitter_id":str,
           'instagram_id':str, 'youtube_id':str, 'youtube':str,
           'facebook':str, 'twitter':str, 'instagram': str}
df_some       = pd.read_csv('data/social-media.csv', dtype=datatype)

In [678]:
df = pd.concat((df_historical, df_current))

In [679]:
df = pd.merge(df_some, df, how='outer')

In [680]:
#df_current[df_current['bioguide']=='G000061']
df[df['bioguide']=='R000600'].loc[:,['type','start','end']]

Unnamed: 0,type,start,end
0,rep,'2015-01-06','2017-01-03'
1,rep,'2017-01-03','2019-01-03'
2,rep,'2019-01-03','2021-01-03'


In [686]:
df.loc[3:3,['bioguide', 'end']]

Unnamed: 0,bioguide,end
3,Y000064,


In [692]:
df[df['start'].isnull()]

Unnamed: 0,bioguide,thomas,govtrack,twitter,facebook,youtube_id,twitter_id,youtube,instagram,instagram_id,...,lis,ballotpedia,official_full,opensecrets,url,fec,maplight,state_rank,caucus,end-type
3,Y000064,'02019',412428,SenToddYoung,SenatorToddYoung,UCuknj4PGn91gHDNAfboZEgQ,234128524,RepToddYoung,sentoddyoung,,...,,,,,,,,,,
6,Y000062,'01853',412211,RepJohnYarmuth,'214258646163',UCy5KW4yrEfEiyZRX45Eoxkg,384913290,RepJohnYarmuth,,,...,,,,,,,,,,
7,Y000033,'01256',400440,RepDonYoung,RepDonYoung,UCg5ZIR5-82EbJiNeI1bqT-A,37007274,RepDonYoung,,,...,,,,,,,,,,
8,W000809,'01991',412402,Rep_SteveWomack,RepSteveWomack,UCXJbUDLYX-wGIhRuN66hqZw,234469322,CongressmanWomack,,,...,,,,,,,,,,
9,W000808,'02004',412412,RepWilson,RepWilson,UCP5QBhng_lHv-vJgE_h7lpA,234014087,repfredericawilson,repwilson,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,O000173,,,Ilhan,,,1082334352711790593,,,,...,,,,,,,,,,
914,C001055,,,RepEdCase,,,1081350574589833221,,,,...,,,,,,,,,,
915,H001089,,,SenHawleyPress,,,1080960924687704064,,,,...,,,,,,,,,,
916,V000133,,,CongressmanJVD,,,1083469084648505344,,,,...,,,,,,,,,,


In [695]:
df_historical[df_historical['bioguide']=='Y000033']

Unnamed: 0,bioguide,govtrack,icpsr,wikipedia,wikidata,google_entity_id,first,last,birthday,gender,...,cspan,votesmart,lis,ballotpedia,official_full,opensecrets,url,fec,maplight,state_rank


In [696]:
df_current[df_current['bioguide']=='Y000033']

Unnamed: 0,bioguide,thomas,lis,govtrack,opensecrets,votesmart,fec,cspan,wikipedia,house_history,...,district,party,url,class,state_rank,middle,nickname,suffix,caucus,end-type


In [697]:
df_some[df_some['bioguide']=='Y000033']

Unnamed: 0,bioguide,thomas,govtrack,twitter,facebook,youtube_id,twitter_id,youtube,instagram,instagram_id
5,Y000033,'01256',400440,RepDonYoung,RepDonYoung,UCg5ZIR5-82EbJiNeI1bqT-A,37007274,RepDonYoung,,
