In [311]:
import numpy as np
import pandas as pd

### Inspect trade data
Full congressional trade data pulled down from [Quiver Quantitative](https://www.quiverquant.com/home/).

In [312]:
trades_df = pd.read_excel('../data/quiverquant_congress_trading_all.xlsx')

# inspect data
trades_df.head()

Unnamed: 0,Ticker,TickerType,Company,Traded,Transaction,Trade_Size_USD,Status,Subholding,Description,Name,BioGuideID,Filed,Party,District,Chamber,Comments,Quiver_Upload_Time,excess_return,State,last_modified
0,NVDA,ST,NVIDIA CORPORATION - COMMON STOCK,2024-11-01,Purchase,"$1,001 - $15,000",NEW,IRA ONE,,Pete Sessions,S000250,2024-11-01,R,TX17,House,,2024-11-04,0.697227,Texas,2024-11-04
1,NGL,ST,NGL ENERGY PARTNERS LP COMMON UNITS REPRESENTI...,2024-10-24,Sale,"$50,001 - $100,000",NEW,EQUITABLE ADVISORS INVESTMENT ACCOUNT,,Mark Dr Green,G000590,2024-10-29,R,TN07,House,,2024-10-30,-3.609834,Tennessee,2024-10-30
2,BRK.B,ST,BERKSHIRE HATHAWAY INC. NEW COMMON STOCK,2024-10-21,Purchase,"$1,001 - $15,000",NEW,,,Marjorie Taylor Mrs Greene,G000596,2024-10-22,R,GA14,House,,2024-10-22,-2.339083,Georgia,2024-10-22
3,HD,ST,"HOME DEPOT, INC.",2024-10-21,Purchase,"$1,001 - $15,000",NEW,,,Marjorie Taylor Mrs Greene,G000596,2024-10-22,R,GA14,House,,2024-10-22,-0.292208,Georgia,2024-10-22
4,TSLA,ST,"TESLA, INC. - COMMON STOCK",2024-10-21,Purchase,"$1,001 - $15,000",NEW,,,Marjorie Taylor Mrs Greene,G000596,2024-10-22,R,GA14,House,,2024-10-22,13.3345,Georgia,2024-10-22


In [313]:
print(f"Total number of rows: {len(trades_df)}")

Total number of rows: 48319


In [314]:
# get summary statistics
trades_df.describe(include='all')

Unnamed: 0,Ticker,TickerType,Company,Traded,Transaction,Trade_Size_USD,Status,Subholding,Description,Name,BioGuideID,Filed,Party,District,Chamber,Comments,Quiver_Upload_Time,excess_return,State,last_modified
count,48319,35381,48318,48319,48319,48319,48226,28668,4562,48319,48319,48319,48319,37884,48319,821,48319,45768.0,48319,48235
unique,4018,17,6917,2861,5,452,2,553,2405,354,287,2467,3,202,2,199,801,,50,192
top,MSFT,ST,MICROSOFT CORPORATION,2023-07-10,Purchase,"$1,001 - $15,000",NEW,THOMAS C MACARTHUR AND DEBORAH A MACARTHUR,DIVIDEND REINVESTMENT,Josh Gottheimer,G000583,2014-06-10,R,NJ05,House,R,2020-07-26,,New Jersey,2023-11-16
freq,818,25605,634,326,23875,35694,37791,1841,296,2926,2926,915,25480,2926,37884,184,29793,,6465,35583
mean,,,,,,,,,,,,,,,,,,-18.060701,,
std,,,,,,,,,,,,,,,,,,241.951968,,
min,,,,,,,,,,,,,,,,,,-347.284064,,
25%,,,,,,,,,,,,,,,,,,-96.568779,,
50%,,,,,,,,,,,,,,,,,,-33.460178,,
75%,,,,,,,,,,,,,,,,,,13.265246,,


In [315]:
print(f"The min filling data: {trades_df['Filed'].min()}") # 113th congress
print(f"The max filling data: {trades_df['Filed'].max()}") # 118th congress

The min filling data: 2014-01-03
The max filling data: 2024-11-01


In [316]:
trades_df['Filed'] = pd.to_datetime(trades_df['Filed'])
trades_df.groupby(trades_df['Filed'].dt.year).size()

Filed
2014    3205
2015    3797
2016    3786
2017    4087
2018    4394
2019    5377
2020    7699
2021    5377
2022    3652
2023    4383
2024    2562
dtype: int64

In [317]:
# perhaps we should filter out reinvestments and clean up descriptions? 
trades_df['Description'].value_counts().head(15)

DIVIDEND REINVESTMENT                                                                                        296
PUBLICLY TRADED MASTER LIMITED PARTNERSHIP INTEREST                                                          185
REINVEST SHARES                                                                                              175
PART OF MY SPOUSE'S RETIREMENT PORTFOLIO.                                                                    148
ADR STOCK                                                                                                     69
COVERED SHORT                                                                                                 54
CORPORATE BOND                                                                                                44
SELL TO CLOSE.                                                                                                40
CALL                                                                                            

In [318]:
trades_df['Transaction'].value_counts()

Purchase          23875
Sale              18742
Sale (Full)        3221
Sale (Partial)     2053
Exchange            428
Name: Transaction, dtype: int64

### Inspect member ideology data
Data pulled down from [VoteView](https://voteview.com/data).

In [319]:
mem_id_df = pd.read_csv('../data/voteview_member_ideology.csv')

# inspect data
mem_id_df.head()

Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,occupancy,last_means,bioname,...,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,conditional,nokken_poole_dim1,nokken_poole_dim2
0,1,President,99869,99,0.0,USA,5000,,,"WASHINGTON, George",...,,,,,,,,,,
1,1,House,379,44,2.0,GA,4000,0.0,1.0,"BALDWIN, Abraham",...,1807.0,-0.165,-0.373,-28.55029,0.758,103.0,12.0,,-0.429,-0.817
2,1,House,4854,44,1.0,GA,4000,0.0,1.0,"JACKSON, James",...,1806.0,-0.32,-0.181,-24.89986,0.776,98.0,9.0,,-0.559,-0.052
3,1,House,6071,44,3.0,GA,4000,0.0,1.0,"MATHEWS, George",...,1812.0,-0.428,-0.317,-12.62728,0.88,99.0,2.0,,-0.413,-0.232
4,1,House,1538,52,6.0,MD,5000,0.0,1.0,"CARROLL, Daniel",...,1796.0,0.116,-0.74,-23.47008,0.783,96.0,11.0,,0.114,-0.779


In [320]:
mem_id_df.columns

Index(['congress', 'chamber', 'icpsr', 'state_icpsr', 'district_code',
       'state_abbrev', 'party_code', 'occupancy', 'last_means', 'bioname',
       'bioguide_id', 'born', 'died', 'nominate_dim1', 'nominate_dim2',
       'nominate_log_likelihood', 'nominate_geo_mean_probability',
       'nominate_number_of_votes', 'nominate_number_of_errors', 'conditional',
       'nokken_poole_dim1', 'nokken_poole_dim2'],
      dtype='object')

In [321]:
# get summary statistics
mem_id_df.describe(include='all')

Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,occupancy,last_means,bioname,...,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,conditional,nokken_poole_dim1,nokken_poole_dim2
count,50496.0,50496,50496.0,50496.0,50496.0,50496,50496.0,48213.0,48213.0,50496,...,40858.0,50274.0,50274.0,49265.0,49265.0,49265.0,49265.0,0.0,50030.0,50030.0
unique,,3,,,,57,,,,12365,...,,,,,,,,,,
top,,House,,,,NY,,,,"DINGELL, John David, Jr.",...,,,,,,,,,,
freq,,40475,,,,4356,,,,30,...,,,,,,,,,,
mean,69.779785,,9793.941679,33.721542,9.291726,,226.146091,0.131583,1.225022,,...,1931.322238,0.006418,0.015513,-102.683098,0.753505,391.522074,46.271389,,0.005949,0.013407
std,30.6832,,10205.585402,20.552235,16.309728,,656.377477,0.52683,0.671845,,...,55.565617,0.375583,0.463358,93.492794,0.098544,331.319047,43.49973,,0.392926,0.486798
min,1.0,,1.0,1.0,0.0,,1.0,0.0,0.0,,...,1790.0,-1.0,-1.0,-1135.01523,0.16,1.0,0.0,,-1.0,-1.0
25%,47.0,,3673.75,14.0,1.0,,100.0,0.0,1.0,,...,1892.0,-0.33,-0.3,-141.30118,0.689,131.0,16.0,,-0.339,-0.327
50%,73.0,,7441.0,33.0,5.0,,100.0,0.0,1.0,,...,1935.0,-0.041,-0.009,-70.81749,0.756,248.0,32.0,,-0.0355,-0.001
75%,95.0,,12039.0,49.0,10.0,,200.0,0.0,1.0,,...,1976.0,0.349,0.309,-36.82361,0.822,618.0,63.0,,0.354,0.342


We need to determine which columns are relevant and how to map them using [this documentation](https://voteview.com/articles/data_help_members).
- Our trade data goes back to the 113th so we should filter from there on the congress column
- We should filter out 'President' in chamber

In [322]:
# filter out relevant data
mem_id_df = mem_id_df[mem_id_df['congress'] >= 113]
mem_id_df = mem_id_df[mem_id_df['chamber'] != 'President']

In [323]:
# conform names
mem_id_df['bioname'] = mem_id_df['bioname'].str.title()
mem_id_df['last_name'] = mem_id_df['bioname'].str.split(',').str[0]
mem_id_df['first_name'] = mem_id_df['bioname'].str.split(', ').str[-1].str.split(' ').str[0]
mem_id_df['name'] = mem_id_df['first_name'] + ' ' + mem_id_df['last_name']
mem_id_df['name'] = mem_id_df['name'].str.strip()

trades_df['Name'] = trades_df['Name'].str.replace('Iii', '')
trades_df['Name'] = trades_df['Name'].str.replace('Jr.', '')
trades_df['first_name'] = trades_df['Name'].str.split(' ').str[0]
trades_df['last_name'] = trades_df['Name'].str.split(' ').str[-1]
trades_df['name_c'] = trades_df['first_name'] + ' ' + trades_df['last_name']
trades_df['name_c'] = trades_df['name_c'].str.strip()

  trades_df['Name'] = trades_df['Name'].str.replace('Jr.', '')


In [324]:
# do a check on join with the trades dataframe
join_check_df = pd.merge(trades_df, mem_id_df, left_on='name_c', right_on='name', how='outer')
join_check_df = join_check_df[['Name', 'bioname','name_c', 'name']].drop_duplicates()

In [325]:
# need to do more processing, probably just loop over both sets of names
#join_check_df[~(join_check_df['name'].isnull()) & ~(join_check_df['name_c'].isnull())]
join_check_df[(join_check_df['name'].isnull()) | (join_check_df['name_c'].isnull())]

Unnamed: 0,Name,bioname,name_c,name
4113,Rudy Yakym,,Rudy,
4508,"Boozman, John",,"Boozman, John",
7043,Thomas H. Kean,,Thomas,
24367,"Capito, Shelley Moore",,"Capito, Moore",
27155,"Whitehouse, Sheldon",,"Whitehouse, Sheldon",
...,...,...,...,...
134888,,"Schmitt, Eric Stephen",,Eric Schmitt
134889,,"Ricketts, John Peter (Pete)",,John Ricketts
134890,,"Helmy, George S.",,George Helmy
134891,,"Vance, James David",,James Vance


### Inspect ICPSR data
Data pulled down from [ICPSR](https://www.icpsr.umich.edu/web/ICPSR/studies/3371/summary).

In [326]:
comm_df = pd.read_csv('../data/icpsr-congressional-historical-statistics/DS0010/03371-0010-Data.tsv', 
                      delimiter='	', low_memory=False)

# inspect data
comm_df.head()

Unnamed: 0,Congress,SESSION,MEMBER_ID,COMMITTEE,CHAMBER,PARTY,TYPE,NAME,STATUS,RANK,MONTH_APPOINT,DAY_APPOINT,YEAR_APPOINT,MONTH_TERM,DAY_TERM,YEAR_TERM
0,1,1,154,45,H,5000,1,"Ames, Fisher",1,2,4,13,1789,9,29,1789
1,1,2,154,45,H,5000,1,"Ames, Fisher",1,1,2,1,1790,8,12,1790
2,1,1,649,45,H,5000,1,"Benson, Egbert",1,3,4,13,1789,9,29,1789
3,1,2,649,45,H,5000,1,"Benson, Egbert",1,3,2,1,1790,8,12,1790
4,1,1,1538,45,H,5000,1,"Carroll, Daniel",1,4,4,13,1789,9,29,1789


In [327]:
comm_df['YEAR_TERM'] = pd.to_numeric(comm_df['YEAR_TERM'], errors='coerce')
comm_df['YEAR_TERM'] = comm_df['YEAR_TERM'].fillna(0).astype(int)

# only goes up to 1993, need to find other data
np.sort(comm_df['YEAR_TERM'].unique())  

array([   0,   99, 1006, 1111, 1789, 1790, 1791, 1792, 1793, 1794, 1795,
       1796, 1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 1806,
       1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817,
       1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828,
       1829, 1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839,
       1840, 1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850,
       1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861,
       1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872,
       1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883,
       1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894,
       1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905,
       1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916,
       1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927,
       1928, 1929, 1930, 1931, 1932, 1933, 1934, 19

In [328]:
#years = [0, 99, 1006, 1111, 9179, 9891, 9911]
#comm_df[comm_df['YEAR_TERM'].isin(years)]

### Scrape data from Wikipedia
Testing getting members and commitees for the 113th Congress from [here](https://en.m.wikipedia.org/wiki/113th_United_States_Congress).

In [329]:
import requests
from bs4 import BeautifulSoup

In [330]:
url = "https://en.m.wikipedia.org/wiki/113th_United_States_Congress"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

In [331]:
senate_section = soup.find("h3", id="Senate_3")
senate_table = senate_section.find_next("table", class_="col-begin")

senators = []

for state_div in senate_table.find_all("div", class_="mw-heading4"):
    state_name = state_div.h4.get_text(strip=True)
    
    for senator_entry in state_div.find_next("dl").find_all("dd"):
        senator_name = senator_entry.find("a").get_text(strip=True)
        party_affiliation = senator_entry.get_text().split()[-1]
        senators.append({"State": state_name, "Name": senator_name, "Party": party_affiliation})
        
# looks good, just need to pull out house + commitees and loop over the other congresses 
senators[:10]

[{'State': 'Alabama', 'Name': 'Jeff Sessions', 'Party': '(R)'},
 {'State': 'Alabama', 'Name': 'Richard Shelby', 'Party': '(R)'},
 {'State': 'Alaska', 'Name': 'Mark Begich', 'Party': '(D)'},
 {'State': 'Alaska', 'Name': 'Lisa Murkowski', 'Party': '(R)'},
 {'State': 'Arizona', 'Name': 'Jeff Flake', 'Party': '(R)'},
 {'State': 'Arizona', 'Name': 'John McCain', 'Party': '(R)'},
 {'State': 'Arkansas', 'Name': 'Mark Pryor', 'Party': '(D)'},
 {'State': 'Arkansas', 'Name': 'John Boozman', 'Party': '(R)'},
 {'State': 'California', 'Name': 'Dianne Feinstein', 'Party': '(D)'},
 {'State': 'California', 'Name': 'Barbara Boxer', 'Party': '(D)'}]

In [332]:
# house_section = soup.find("h3", id="House_of_Representatives_3")
# house_table = house_section.find_next("table", class_="col-begin")

# representatives = []

# for state_div in specific_table.find_all("div", class_="mw-heading4"):
#     state_name = state_div.h4.get_text(strip=True)
    
#     for rep_entry in state_div.find_next("dl").find_all("dd"):
#         district_link = rep_entry.find("a")
        
#         if district_link:
#             district_number = district_link.get_text(strip=True)
#         else:
#             district_number = "At-large" 

#         links = rep_entry.find_all("a")
        
#         if len(links) >= 2:
#             rep_name = links[1].get_text(strip=True)
#             party_affiliation = rep_entry.get_text().split()[-1]
#         else:
#             continue 
        
#         representatives.append({"State": state_name, "District": district_number, "Name": rep_name,
#                                 "Party": party_affiliation})

# representatives

[]

In [333]:
senate_committees = []

senate_section = soup.find("h3", id="Senate_5")
if senate_section:
    # Find the relevant table within this section
    specific_table = senate_section.find_next("table", class_="col-begin")
    
    # Check if the table is found within the section
    if specific_table:
        # Loop through each top-level committee in the table
        for committee_item in specific_table.find_all("li", recursive=False):
            # Get the main committee link and name
            committee_link = committee_item.find("a")
            if committee_link:
                committee_name = committee_link.get_text(strip=True)
                committee_url = committee_link['href']
                
                # Store committee name and link in a structured format
                senate_committees.append({
                    "Committee": committee_name,
                    "Link": f"https://en.wikipedia.org{committee_url}"
                })

In [334]:
senate_committees

[]