In [17]:
# Imports
import json
import os
import re

import pandas as pd
import requests

In [None]:
# Base URL and API key.
# The key is read from the CONGRESS_API_KEY environment variable so the secret
# never has to be pasted into (and committed with) the notebook. When the
# variable is unset it falls back to an empty string, as before.
api_key = os.environ.get("CONGRESS_API_KEY", "")
base_url = "https://api.congress.gov/v3/member"
# Query-string parameters sent with every request to base_url.
params = {
    "api_key": api_key,
    "offset": 0,  # Start with the first record
    "limit": 250  # Maximum number of records per request
}

In [19]:
# Accumulates the member records from every page of results
all_data = []

# Page on a private copy of the query parameters. Mutating the shared `params`
# dict would leave its offset past the last record, so re-running this cell
# would start there and come back with nothing.
page_params = dict(params)

# Loop to fetch data with increasing offset
while True:
    # Bound the wait so a stalled connection raises instead of hanging the kernel
    response = requests.get(base_url, params=page_params, timeout=30)
    if response.status_code != 200:
        # Best effort: report the failure and keep whatever was fetched so far
        print(f"Failed request: {response.status_code}, {response.text}")
        break

    # The records of the current page sit under the "members" key of the JSON body
    data = response.json()
    members = data.get("members", [])

    # An empty page means the previous request returned the last records
    if not members:
        break

    # Append the current batch of members to the list
    all_data.extend(members)

    # Advance by the number of records actually received, so nothing is skipped
    # if a page comes back shorter than the requested limit
    page_params["offset"] += len(members)
    print(f"Fetched {len(members)} records, total so far: {len(all_data)}")

Fetched 250 records, total so far: 250
Fetched 250 records, total so far: 500
Fetched 250 records, total so far: 750
Fetched 250 records, total so far: 1000
Fetched 250 records, total so far: 1250
Fetched 250 records, total so far: 1500
Fetched 250 records, total so far: 1750
Fetched 250 records, total so far: 2000
Fetched 250 records, total so far: 2250
Fetched 250 records, total so far: 2500
Fetched 26 records, total so far: 2526


In [20]:
# Flatten the nested records: one output row per entry found under each
# member's terms -> item list, with the member-level fields listed in `meta`
# repeated on every row. errors="ignore" fills a meta field with NaN when a
# record lacks it instead of raising KeyError.
df = pd.json_normalize(
    data=all_data,
    record_path=["terms", "item"],
    meta=["name", "bioguideId", "partyName", "state", "district"],
    errors="ignore",
)


In [21]:
# Replace missing end years with 2024.
# NOTE(review): presumably a missing endYear marks a term still in progress and
# 2024 stands in for "now" - confirm, and update the year when re-running later.
df['endYear'] = df['endYear'].fillna(value=2024)

In [22]:
# Store end years as pandas' nullable 64-bit integer dtype
df['endYear'] = df['endYear'].astype(pd.Int64Dtype())


In [23]:
def get_congress_number(year):
    """Return the number of the Congress that a calendar year falls in.

    Numbering starts at 1 in 1789 and advances by one every two years, so
    1789 and 1790 map to 1, 1791 and 1792 map to 2, and so on.

    Parameters
    ----------
    year : int
        Calendar year.

    Returns
    -------
    int
        The Congress number for ``year``.
    """
    first_congress_year = 1789  # Start year of the 1st Congress
    years_elapsed = year - first_congress_year
    return years_elapsed // 2 + 1

# Derive the Congress number for the first and last year of every term.
# NOTE(review): an odd endYear maps to the Congress that begins in that year
# (e.g. 1985 -> 99); confirm that is intended for terms that ended in January.
for year_col, congress_col in (('startYear', 'startCongress'), ('endYear', 'endCongress')):
    df[congress_col] = df[year_col].apply(get_congress_number)
df

Unnamed: 0,chamber,startYear,endYear,name,bioguideId,partyName,state,district,startCongress,endCongress
0,House of Representatives,2024,2024,"Wied, Tony",W000829,Republican,Wisconsin,8,118,118
1,House of Representatives,2024,2024,"Lee Carter, Erica",L000605,Democratic,Texas,18,118,118
2,House of Representatives,1975,1985,"Patterson, Jerry M.",P000121,Democratic,California,38,94,99
3,House of Representatives,1977,1993,"Ireland, Andrew P.",I000029,Republican,Florida,10,95,103
4,House of Representatives,1991,2009,"Hobson, David L.",H000666,Republican,Ohio,7,102,111
...,...,...,...,...,...,...,...,...,...,...
2787,House of Representatives,1975,1991,"Florio, James J.",F000215,Democratic,New Jersey,1,94,102
2788,House of Representatives,1945,1947,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,79,80
2789,House of Representatives,1949,1953,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,81,83
2790,House of Representatives,1955,1981,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,84,97


In [27]:
# Function to split names
def split_name(name):
    """Split a "Last, First Middle" style name into its components.

    The name is processed in three passes:

    1. Aliases written in parentheses or double quotes are collected
       (joined with ", " when there are several) and removed from the name.
    2. A trailing ", <suffix>" is extracted, where the suffix is one of
       Jr., Sr., II, III, IV or V.
    3. What remains is read as "Last, First Middle": the text before the
       first comma is the last name, the first space-separated word after it
       is the first name, and everything after that is the middle name.

    Parameters
    ----------
    name : str
        Name to split. Any non-string value (e.g. None or NaN) is treated as
        missing.

    Returns
    -------
    pandas.Series
        Five positional values: last name, first name, middle name, aka,
        suffix. A component that is absent or could not be parsed is None.
    """
    # Missing / non-text values cannot be parsed; return an all-empty result
    # explicitly rather than relying on a caught regex TypeError.
    if not isinstance(name, str):
        return pd.Series([None, None, None, None, None])

    # Regular expression patterns
    aka_pattern = r"\((.*?)\)|\"(.*?)\""  # Matches AKA in parentheses or quotes
    # "II" must be listed too, otherwise "Last, First, II" silently loses its suffix
    suffix_pattern = r",\s*(Jr\.|Sr\.|III|II|IV|V)$"  # Matches suffix after a comma
    main_pattern = r"([^,]+),\s*([^\(,]+)"  # Matches Last, First Middle

    # Each findall hit is a (parenthesised, quoted) pair with one side empty
    aliases = [paren or quoted for paren, quoted in re.findall(aka_pattern, name)]
    aka = ", ".join(filter(None, aliases)) if aliases else None

    # Remove the aliases so they do not interfere with the remaining passes
    name = re.sub(aka_pattern, "", name).strip()

    # Extract the suffix, then remove it from the name
    suffix_match = re.search(suffix_pattern, name)
    suffix = suffix_match.group(1) if suffix_match else None
    name = re.sub(suffix_pattern, "", name).strip()

    # Extract main parts (Last, First Middle)
    last_name = first_name = middle_name = None
    main = re.match(main_pattern, name)
    if main:
        last_name = main.group(1).strip()
        first_middle = main.group(2).strip()
        if first_middle:
            # The first word is the first name; any remainder is the middle name
            first_name, _, remainder = first_middle.partition(" ")
            middle_name = remainder.strip() or None

    return pd.Series([last_name, first_name, middle_name, aka, suffix])

# Parse every name once, then attach the five resulting parts as new columns
name_columns = ["lastName", "firstName", "middleName", "aka", "suffix"]
name_parts = df["name"].apply(split_name)
df[name_columns] = name_parts

df

Unnamed: 0,chamber,startYear,endYear,name,bioguideId,partyName,state,district,startCongress,endCongress,lastName,firstName,middleName,aka,suffix
0,House of Representatives,2024,2024,"Wied, Tony",W000829,Republican,Wisconsin,8,118,118,Wied,Tony,,,
1,House of Representatives,2024,2024,"Lee Carter, Erica",L000605,Democratic,Texas,18,118,118,Lee Carter,Erica,,,
2,House of Representatives,1975,1985,"Patterson, Jerry M.",P000121,Democratic,California,38,94,99,Patterson,Jerry,M.,,
3,House of Representatives,1977,1993,"Ireland, Andrew P.",I000029,Republican,Florida,10,95,103,Ireland,Andrew,P.,,
4,House of Representatives,1991,2009,"Hobson, David L.",H000666,Republican,Ohio,7,102,111,Hobson,David,L.,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2787,House of Representatives,1975,1991,"Florio, James J.",F000215,Democratic,New Jersey,1,94,102,Florio,James,J.,,
2788,House of Representatives,1945,1947,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,79,80,Flood,Daniel,J.,,
2789,House of Representatives,1949,1953,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,81,83,Flood,Daniel,J.,,
2790,House of Representatives,1955,1981,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,84,97,Flood,Daniel,J.,,


In [28]:
# Load the existing legislator table and keep only the identifier and
# biographical columns used in the merge below
legislator_columns = ['bioguide', 'govtrack', 'icpsr', 'wikipedia', 'birthday', 'gender']
rep_df = pd.read_excel('../data/legislator_table.xlsx')[legislator_columns]
rep_df

Unnamed: 0,bioguide,govtrack,icpsr,wikipedia,birthday,gender
0,B000944,400050,29389.0,,1952-11-09,M
1,C000127,300018,39310.0,,1958-10-13,F
2,C000141,400064,15408.0,,1943-10-05,M
3,C000174,300019,15015.0,,1947-01-23,M
4,C001070,412246,40703.0,,1960-04-13,M
...,...,...,...,...,...,...
12681,B001297,412619,21510.0,,1959-02-16,M
12682,G000579,412731,21720.0,,1984-03-03,M
12683,J000032,400199,29573.0,,1950-01-12,F
12684,P000096,400309,29741.0,,1937-01-25,M


In [29]:
# Left-join the legislator table onto the term rows by bioguide id, so every
# term row is kept whether or not it has a match. The right-hand key column is
# dropped afterwards because it only duplicates bioguideId.
merged_df = (
    df.merge(rep_df, how='left', left_on='bioguideId', right_on='bioguide')
      .drop(columns=['bioguide'])
)
merged_df

Unnamed: 0,chamber,startYear,endYear,name,bioguideId,partyName,state,district,startCongress,endCongress,lastName,firstName,middleName,aka,suffix,govtrack,icpsr,wikipedia,birthday,gender
0,House of Representatives,2024,2024,"Wied, Tony",W000829,Republican,Wisconsin,8,118,118,Wied,Tony,,,,,,,,
1,House of Representatives,2024,2024,"Lee Carter, Erica",L000605,Democratic,Texas,18,118,118,Lee Carter,Erica,,,,,,,,
2,House of Representatives,1975,1985,"Patterson, Jerry M.",P000121,Democratic,California,38,94,99,Patterson,Jerry,M.,,,408529.0,14266.0,,1934-10-25,M
3,House of Representatives,1977,1993,"Ireland, Andrew P.",I000029,Republican,Florida,10,95,103,Ireland,Andrew,P.,,,405887.0,14428.0,,1930-08-23,M
4,House of Representatives,1991,2009,"Hobson, David L.",H000666,Republican,Ohio,7,102,111,Hobson,David,L.,,,400180.0,29136.0,,1936-10-17,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2787,House of Representatives,1975,1991,"Florio, James J.",F000215,Democratic,New Jersey,1,94,102,Florio,James,J.,,,404170.0,14223.0,,1937-08-29,M
2788,House of Representatives,1945,1947,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,79,80,Flood,Daniel,J.,,,404164.0,3224.0,,1903-11-26,M
2789,House of Representatives,1949,1953,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,81,83,Flood,Daniel,J.,,,404164.0,3224.0,,1903-11-26,M
2790,House of Representatives,1955,1981,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11,84,97,Flood,Daniel,J.,,,404164.0,3224.0,,1903-11-26,M


In [30]:
# Write the enriched table to disk. merged_df (not df) is the frame that
# carries the govtrack / icpsr / wikipedia / birthday / gender columns joined
# in from rep_df; writing df would drop them from the output file.
merged_df.to_excel('../data/updated_legislator_table.xlsx', index=False)