In [17]:
import numpy as np
import pandas as pd

### Inspect trade data
Full congressional trade data pulled down from [Quiver Quantitative](https://www.quiverquant.com/home/).

In [18]:
# Load the full congressional trade export from the Excel workbook.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# recorded run stopped before the read finished; re-run it to completion
# before relying on any later cell.
trades_df = pd.read_excel('../data/quiverquant_congress_trading_all.xlsx')

# inspect data
trades_df.head()

KeyboardInterrupt: 

In [None]:
# report how many rows the trade data holds
print(f"Total number of rows: {trades_df.shape[0]}")

In [None]:
# get summary statistics
# include='all' asks describe() to summarise every column, not only the numeric ones
trades_df.describe(include='all')

In [None]:
# Report the span of filing dates covered by the trade data.
# Parse into a local first so min/max compare chronologically even if the
# column was loaded as text; the frame itself is not modified here.
filed_dates = pd.to_datetime(trades_df['Filed'])
print(f"The min filing date: {filed_dates.min()}") # 113th congress
print(f"The max filing date: {filed_dates.max()}") # 118th congress

In [None]:
# Parse the filing dates, then count how many filings fall in each calendar year
trades_df['Filed'] = pd.to_datetime(trades_df['Filed'])
filing_year = trades_df['Filed'].dt.year
trades_df.groupby(filing_year).size()

In [None]:
# perhaps we should filter out reinvestments and clean up descriptions?
# show the 15 most frequent description values
description_counts = trades_df['Description'].value_counts()
description_counts.head(15)

In [None]:
# count how many rows carry each distinct value of the 'Transaction' column
trades_df['Transaction'].value_counts()

### Inspect member ideology data
Data pulled down from [VoteView](https://voteview.com/data).

In [None]:
# Load the VoteView member ideology export.
mem_id_df = pd.read_csv('../data/voteview_member_ideology.csv')

# inspect data
mem_id_df.head()

In [None]:
# list the available column names before deciding which ones are relevant
mem_id_df.columns

In [None]:
# get summary statistics
# include='all' asks describe() to summarise every column, not only the numeric ones
mem_id_df.describe(include='all')

We need to determine which columns are relevant and how to map them using [this documentation](https://voteview.com/articles/data_help_members).
- Our trade data goes back to the 113th Congress, so we should keep only rows from the 113th onward using the `congress` column
- We should drop rows whose `chamber` is 'President'

In [None]:
# filter out relevant data
# Keep the 113th Congress onward (where the trade data starts) and drop rows
# whose chamber is 'President'.
keep_rows = (mem_id_df['congress'] >= 113) & (mem_id_df['chamber'] != 'President')
# .copy() makes the result an independent frame, so columns assigned to
# mem_id_df afterwards are not written onto a slice of the unfiltered frame
# (which raises SettingWithCopyWarning).
mem_id_df = mem_id_df.loc[keep_rows].copy()

In [None]:
# conform names
# --- VoteView members -------------------------------------------------------
# The code treats `bioname` as comma-separated with the last name first.
mem_id_df['bioname'] = mem_id_df['bioname'].str.title()
bioname_parts = mem_id_df['bioname'].str.split(',')
mem_id_df['last_name'] = bioname_parts.str[0]
# Take the segment right after the first comma (not the final segment), so a
# trailing suffix segment such as "Jr." is never picked up as the first name.
# Splitting on whitespace also discards the space that follows the comma.
mem_id_df['first_name'] = bioname_parts.str[1].str.split().str[0]
mem_id_df['name'] = (mem_id_df['first_name'] + ' ' + mem_id_df['last_name']).str.strip()

# --- Trade filers -----------------------------------------------------------
# Drop generational suffixes. regex=False makes the '.' in 'Jr.' a literal dot
# on every pandas version (older releases treat the pattern as a regex by
# default, where '.' matches any character).
trades_df['Name'] = (
    trades_df['Name']
    .str.replace('Iii', '', regex=False)
    .str.replace('Jr.', '', regex=False)
    .str.strip()
)
# Split on runs of whitespace: removing a suffix can leave a stray space, and
# splitting on a single ' ' would then produce an empty last token.
name_tokens = trades_df['Name'].str.split()
trades_df['first_name'] = name_tokens.str[0]
trades_df['last_name'] = name_tokens.str[-1]
trades_df['name_c'] = (trades_df['first_name'] + ' ' + trades_df['last_name']).str.strip()

In [None]:
# do a check on join with the trades dataframe
# An outer join keeps names that appear on only one side, so mismatches show up as nulls.
name_cols = ['Name', 'bioname', 'name_c', 'name']
join_check_df = (
    trades_df
    .merge(mem_id_df, left_on='name_c', right_on='name', how='outer')
    .loc[:, name_cols]
    .drop_duplicates()
)

In [None]:
# need to do more processing, probably just loop over both sets of names
#join_check_df[~(join_check_df['name'].isnull()) & ~(join_check_df['name_c'].isnull())]
# rows where the name is missing on either side of the join
missing_member = join_check_df['name'].isna()
missing_trade = join_check_df['name_c'].isna()
join_check_df[missing_member | missing_trade]

### Inspect ICPSR data
Data pulled down from [ICPSR](https://www.icpsr.umich.edu/web/ICPSR/studies/3371/summary).

In [None]:
# Load the ICPSR tab-separated data file; low_memory=False turns off the
# chunked parsing that can produce mixed-type columns.
comm_df = pd.read_csv(
    '../data/icpsr-congressional-historical-statistics/DS0010/03371-0010-Data.tsv',
    sep='\t',
    low_memory=False,
)

# inspect data
comm_df.head()

In [None]:
# Coerce YEAR_TERM to integers; values that cannot be parsed end up as 0
year_term = pd.to_numeric(comm_df['YEAR_TERM'], errors='coerce')
comm_df['YEAR_TERM'] = year_term.fillna(0).astype(int)

# only goes up to 1993, need to find other data
np.sort(comm_df['YEAR_TERM'].unique())

In [None]:
#years = [0, 99, 1006, 1111, 9179, 9891, 9911]
#comm_df[comm_df['YEAR_TERM'].isin(years)]

### Scrape data from Wikipedia
Testing getting members and committees for the 113th Congress from [here](https://en.m.wikipedia.org/wiki/113th_United_States_Congress).

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the 113th Congress article and parse it for scraping.
url = "https://en.m.wikipedia.org/wiki/113th_United_States_Congress"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# senate_section = soup.find("h3", id="Senate_3")
# senate_table = senate_section.find_next("table", class_="col-begin")

# senators = []

# for state_div in senate_table.find_all("div", class_="mw-heading4"):
#     state_name = state_div.h4.get_text(strip=True)
    
#     for senator_entry in state_div.find_next("dl").find_all("dd"):
#         senator_name = senator_entry.find("a").get_text(strip=True)
#         party_affiliation = senator_entry.get_text().split()[-1]
#         senators.append({"State": state_name, "Name": senator_name, "Party": party_affiliation})
        
# # looks good, just need to pull out house + committees and loop over the other congresses 
# senators[:10]

In [None]:
# house_section = soup.find("h3", id="House_of_Representatives_3")
# house_table = house_section.find_next("table", class_="col-begin")

# representatives = []

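# # NOTE(review): `specific_table` is not defined in this cell; `house_table` (assigned above) is presumably what was meant.
# for state_div in specific_table.find_all("div", class_="mw-heading4"):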
#     state_name = state_div.h4.get_text(strip=True)
    
#     # Find each representative under this state
#     for rep_entry in state_div.find_next("dl").find_all("dd"):
#         # Extract the district number
#         district_link = rep_entry.find("a")
#         if district_link:
#             district_number = district_link.get_text(strip=True)
#         else:
#             district_number = "At-large"  # Handle at-large districts

#         # Extract the representative's name and party affiliation
#         links = rep_entry.find_all("a")
#         if len(links) >= 2:
#             # Use the second link text for the representative's name
#             rep_name = links[1].get_text(strip=True)
#             # Party affiliation is typically the last part of the text
#             party_affiliation = rep_entry.get_text().split()[-1]
#         else:
#             continue  # Skip if the structure doesn't match
        
#         # Append the representative's details to the list
#         representatives.append({
#             "State": state_name,
#             "District": district_number,
#             "Name": rep_name,
#             "Party": party_affiliation})

# house_table

In [None]:
# senate_committees = []

# senate_section = soup.find("h3", id="Senate_5")
# if senate_section:
#     committee_list = senate_section.find_next("ul")
#     if committee_list:
#         for committee_item in committee_list.find_all("li", recursive=False):
#             committee_link = committee_item.find("a")
#             if committee_link:
#                 committee_name = committee_link.get_text(strip=True)
#                 committee_url = f"https://en.wikipedia.org{committee_link['href']}"
                
#                 senate_committees.append({
#                     "Committee": committee_name,
#                     "Link": committee_url
#                 })

# senate_committees

### Filter member vote data
Filtering member vote record from VoteView

In [19]:
# Load the VoteView member vote records.
# NOTE(review): the preview output shows an 'Unnamed: 0' column, presumably a
# row index that an earlier export saved into the CSV — confirm whether it is needed.
vote_df = pd.read_csv('../data/voteview_members_votes.csv')

# inspect data
vote_df.head()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,prob
0,1,House,1,154.0,6.0,61.1
1,1,House,1,259.0,9.0,99.6
2,1,House,1,379.0,1.0,100.0
3,1,House,1,649.0,1.0,59.2
4,1,House,1,786.0,1.0,97.7


In [23]:
# congresses to keep: the 113th through the 118th
nums = [113, 114, 115, 116, 117, 118]

# keep only votes from those congresses and renumber the rows from zero
in_scope = vote_df['congress'].isin(nums)
vote_df = vote_df.loc[in_scope].reset_index(drop=True)

In [26]:
# Persist the filtered vote records; index=False keeps the row index out of the file.
# NOTE(review): every column of vote_df is written, which includes the
# 'Unnamed: 0' column shown in the earlier preview unless it was dropped
# elsewhere — confirm that is intended.
vote_df.to_csv('../data/voteview_members_vote_record.csv', index=False)