In [98]:
import pdfplumber
import pandas as pd
import re
from datetime import datetime
import unicodedata

In [252]:
def _split_rank(committee):
    """Separate a leadership title from a committee name.

    Returns ``(name, rank)`` where ``rank`` is ``"Chair"``, ``"Vice Chair"``
    or ``None``. The title text is removed from ``name`` and the result is
    stripped of surrounding whitespace.
    """
    # ", Chair" is tested first; it is not a substring of ", Vice Chair",
    # so the two markers cannot shadow one another.
    for marker, rank in ((", Chair", "Chair"), (", Vice Chair", "Vice Chair")):
        if marker in committee:
            return committee.replace(marker, "").strip(), rank
    return committee.strip(), None


def committee_parser(file, congress_number):
    """Extract representative/committee rows from a committee-assignment PDF.

    The text of every page is scanned line by line:

    * lines containing a known header/footer marker, or consisting only of
      digits, are skipped;
    * a line ending in "of" or "," is held back and prefixed to the next line;
    * a line containing a run of two or more dots starts a new representative;
      text after the first dot run is recorded as that representative's first
      committee;
    * other non-empty lines are accumulated into a committee name until the
      accumulated text ends with a period;
    * an empty line flushes any pending committee and clears the current
      representative.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pdfplumber.open``.
    congress_number : int
        Value written to the ``Congress`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Representative``, ``Committee``, ``Rank`` (``"Chair"``,
        ``"Vice Chair"`` or ``None``) and ``Congress``.
    """
    # Substrings that identify headers, footers and other layout residue.
    skip_markers = (
        'ASSIGNMENTS OF REPRESENTATIVES', 'COMMISSIONER, AND DELEGATES',
        '[Democrats', 'and Delegates', 'Representative Committees', 'VerDate',
        'htiw', 'Congressional Directory', '3G4F', 'BOJ_', 'House Committee Assignments'
    )

    data = []
    partial_name = None        # line fragment waiting to be joined with the next line
    ongoing_committee = None   # committee name still missing its closing period
    representative = None      # representative the current committee lines belong to

    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # extract_text() may return None for a page without extractable
            # text (behaviour differs between pdfplumber releases); treat
            # that the same way as an empty page instead of crashing.
            text = page.extract_text() or ""

            for line in text.split('\n'):
                if any(marker in line for marker in skip_markers) or re.match(r'^\d+$', line.strip()):
                    continue

                # Re-attach a fragment held back from the previous line.
                if partial_name:
                    line = f"{partial_name} {line.strip()}"
                    partial_name = None

                stripped = line.strip()

                # A trailing "of" or comma means the entry continues on the next line.
                if stripped.endswith(("of", ",")):
                    partial_name = stripped
                    continue

                # Dot leaders separate the representative from the first committee.
                if stripped and '..' in stripped:
                    parts = re.split(r'\.{2,}', stripped)
                    representative = parts[0].strip()
                    if len(parts) > 1 and parts[1].strip():
                        committee, rank = _split_rank(parts[1])
                        data.append([representative, committee, rank])
                    # NOTE(review): a committee still pending in ongoing_committee
                    # is not flushed here and will be merged into the next
                    # committee line - confirm this cannot occur in the PDFs.
                    continue

                # Further committee lines for the current representative.
                if representative and stripped:
                    if ongoing_committee:
                        ongoing_committee = f"{ongoing_committee} {stripped}"
                    else:
                        ongoing_committee = stripped

                    # A closing period marks the end of a committee name.
                    if ongoing_committee.endswith("."):
                        committee, rank = _split_rank(ongoing_committee)
                        data.append([representative, committee, rank])
                        ongoing_committee = None

                # An empty line closes the current representative's block.
                if not stripped:
                    if representative and ongoing_committee:
                        committee, rank = _split_rank(ongoing_committee)
                        data.append([representative, committee, rank])
                        ongoing_committee = None
                    representative = None

    df = pd.DataFrame(data, columns=["Representative", "Committee", "Rank"])
    df['Congress'] = congress_number
    return df

In [255]:
# Parse one committee-assignment PDF per Congress (the 115th has two files),
# in the same order as the names on the left-hand side.
df_117, df_116, df_115, df_115_2, df_114, df_113, df_112 = (
    committee_parser(pdf_path, congress)
    for pdf_path, congress in (
        ("../data/committee_assignment_pdfs/COMMITTEEASSIGNMENTS-117.pdf", 117),
        ("../data/committee_assignment_pdfs/COMMITTEEASSIGNMENTS-116.pdf", 116),
        ("../data/committee_assignment_pdfs/COMMITTEEASSIGNMENTS-115.pdf", 115),
        ("../data/committee_assignment_pdfs/COMMITTEEASSIGNMENTS-115_2.pdf", 115),
        ("../data/committee_assignment_pdfs/COMMITTEEASSIGNMENTS-114.pdf", 114),
        ("../data/committee_assignment_pdfs/COMMITTEEASSIGNMENTS-113.pdf", 113),
        ("../data/committee_assignment_pdfs/COMMITTEEASSIGNMENTS-112.pdf", 112),
    )
)

In [296]:
# Stack the per-Congress frames row-wise under a fresh 0..n-1 index.
df_committee = pd.concat(
    (df_117, df_116, df_115, df_115_2, df_114, df_113, df_112),
    ignore_index=True,
)
df_committee

Unnamed: 0,Representative,Committee,Rank,Congress
0,Adams,Agriculture.,,117
1,Adams,Education and Labor.,,117
2,Adams,Financial Services.,,117
3,Aderholt,Appropriations.,,117
4,Aguilar,Appropriations.,,117
...,...,...,...,...
5884,"Young, C. W. Bill, of Florida",Appropriations.,,112
5885,"Young, Don, of Alaska",Natural Resources.,,112
5886,"Young, Don, of Alaska",Transportation and Infrastructure.,,112
5887,"Young, Todd C., of Indiana",Armed Services.,,112


In [257]:
# Load the legislator reference table. The displayed columns include
# bioguideId, lastName/firstName, state and startCongress/endCongress.
rep_df = pd.read_excel('../data/updated_legislator_table.xlsx')
rep_df

Unnamed: 0,chamber,startYear,endYear,name,bioguideId,partyName,state,district,startCongress,endCongress,lastName,firstName,middleName,aka,suffix
0,House of Representatives,2024,2024,"Wied, Tony",W000829,Republican,Wisconsin,8.0,118,118,Wied,Tony,,,
1,House of Representatives,2024,2024,"Lee Carter, Erica",L000605,Democratic,Texas,18.0,118,118,Lee Carter,Erica,,,
2,House of Representatives,1975,1985,"Patterson, Jerry M.",P000121,Democratic,California,38.0,94,99,Patterson,Jerry,M.,,
3,House of Representatives,1977,1993,"Ireland, Andrew P.",I000029,Republican,Florida,10.0,95,103,Ireland,Andrew,P.,,
4,House of Representatives,1991,2009,"Hobson, David L.",H000666,Republican,Ohio,7.0,102,111,Hobson,David,L.,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2787,House of Representatives,1975,1991,"Florio, James J.",F000215,Democratic,New Jersey,1.0,94,102,Florio,James,J.,,
2788,House of Representatives,1945,1947,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11.0,79,80,Flood,Daniel,J.,,
2789,House of Representatives,1949,1953,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11.0,81,83,Flood,Daniel,J.,,
2790,House of Representatives,1955,1981,"Flood, Daniel J.",F000209,Democratic,Pennsylvania,11.0,84,97,Flood,Daniel,J.,,


In [297]:
def parse_representative_name(name):
    """Split a directory-style representative string into name components.

    Two input shapes are handled:

    * ``"Last"`` - only the last name is returned;
    * ``"Last, First M., of State"`` - the text after the first comma is
      split on whitespace; the first token (minus commas) is the first name,
      a second token ending in ``"."`` or ``".,"`` contributes its first
      character as the middle initial, and every token after ``"of"`` is
      joined into the state.

    Parameters
    ----------
    name : str
        Raw value from the ``Representative`` column. Non-string values
        (e.g. ``None``/NaN) yield four ``None`` results.

    Returns
    -------
    tuple
        ``(last_name, first_name, middle_initial, state)``; components that
        cannot be determined are ``None``.
    """
    last_name = first_name = middle_initial = state = None

    # Missing values cannot be parsed; report every component as unknown
    # rather than raising inside a DataFrame.apply.
    if not isinstance(name, str):
        return last_name, first_name, middle_initial, state

    # No comma: the directory lists this member by last name only.
    if ',' not in name:
        return name.strip(), first_name, middle_initial, state

    last_name, rest = name.split(',', 1)
    last_name = last_name.strip()

    parts = rest.split()
    # Nothing after the comma (e.g. "Adams,"): only the last name is known.
    if not parts:
        return last_name, first_name, middle_initial, state

    first_name = parts[0].strip(",")

    # A second token such as "C." or "C.," is treated as a middle initial.
    if len(parts) >= 2 and parts[1].endswith(('.', '.,')):
        middle_initial = parts[1][0]

    # Everything after "of" is the state, which may span several words.
    if 'of' in parts:
        state = " ".join(parts[parts.index('of') + 1:])

    return last_name, first_name, middle_initial, state

# Parse every "Representative" entry and spread the resulting 4-tuple across
# the name/state columns of df_committee.
parsed_names = df_committee['Representative'].apply(
    lambda rep: pd.Series(parse_representative_name(rep))
)
df_committee[['Last Name', 'First Name', 'Middle Initial', 'State']] = parsed_names

# Preview the first 20 rows to confirm the new columns
df_committee.head(20)


Unnamed: 0,Representative,Committee,Rank,Congress,Last Name,First Name,Middle Initial,State
0,Adams,Agriculture.,,117,Adams,,,
1,Adams,Education and Labor.,,117,Adams,,,
2,Adams,Financial Services.,,117,Adams,,,
3,Aderholt,Appropriations.,,117,Aderholt,,,
4,Aguilar,Appropriations.,,117,Aguilar,,,
5,Aguilar,House Administration.,,117,Aguilar,,,
6,Aguilar,Select Committee to Investigate the January 6t...,,117,Aguilar,,,
7,Allen,Agriculture.,,117,Allen,,,
8,Allen,Education and Labor.,,117,Allen,,,
9,Allred,Foreign Affairs.,,117,Allred,,,


In [298]:
# Letters to correct: maps a base letter written next to a spacing acute
# accent (´), on either side, to the single accented letter, and maps a
# typographic apostrophe after "O" to a plain one.
# (Presumably these sequences come from PDF text extraction - confirm.)
# NOTE(review): every vowel has both orderings except upper-case O, which
# only has '´O'; there is no 'O´' entry. Confirm whether that is intentional.
replacements = {
    'a´': 'á',
    '´a': 'á',
    'e´': 'é',
    '´e': 'é',
    'i´': 'í',
    '´i': 'í',
    'ı´': 'í',  # Dotless i
    '´ı': 'í',
    'o´': 'ó',
    '´o': 'ó',
    'u´': 'ú',
    '´u': 'ú',
    'A´': 'Á',
    '´A': 'Á',
    'E´': 'É',
    '´E': 'É',
    'I´': 'Í',
    '´I': 'Í',
    '´O': 'Ó',
    'U´': 'Ú',
    '´U': 'Ú',
    'O’': "O'"
}


# Apply the replacement mapping to both name columns. regex=True makes the
# keys match inside each name instead of requiring the whole cell to equal
# the key.
df_committee["Last Name"] = df_committee["Last Name"].replace(replacements, regex=True)
df_committee["First Name"] = df_committee["First Name"].replace(replacements, regex=True)

In [302]:
# Manual Correction to committee df
def correct_name(df, first_name=None, middle_initial=None, last_name=None, update_values=None):
    """Overwrite column values, in place, on rows matching the given name parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'First Name'``, ``'Middle Initial'`` and ``'Last Name'``
        columns; it is modified in place.
    first_name : str, optional
        Literal substring that must occur in ``'First Name'`` (missing first
        names never match).
    middle_initial : str, optional
        Value ``'Middle Initial'`` must equal exactly.
    last_name : str, optional
        Value ``'Last Name'`` must equal exactly.
    update_values : dict, optional
        ``{column: new_value}`` written to every matching row. Columns that
        do not exist in ``df`` are reported and skipped.

    Returns
    -------
    None
        A message naming the criteria is printed when no row matches.

    Notes
    -----
    If no name criterion is supplied, every row matches.
    """
    if update_values is None:
        update_values = {}

    # Start from "all rows" and narrow with each supplied criterion.
    mask = pd.Series(True, index=df.index)

    if first_name is not None:
        # Literal (non-regex) substring match; NaN/None first names count as no match.
        mask &= df['First Name'].str.contains(first_name, regex=False, na=False)
    if middle_initial is not None:
        mask &= df['Middle Initial'] == middle_initial
    if last_name is not None:
        mask &= df['Last Name'] == last_name

    if not mask.any():
        # Name the criteria so repeated calls produce distinguishable output.
        criteria = {
            'first_name': first_name,
            'middle_initial': middle_initial,
            'last_name': last_name,
        }
        described = ", ".join(
            f"{key}={value!r}" for key, value in criteria.items() if value is not None
        )
        print(f"No matching records found for {described or 'the given criteria'}.")
        return

    for column, value in update_values.items():
        if column in df.columns:
            df.loc[mask, column] = value
        else:
            print(f"Column '{column}' does not exist in the DataFrame.")


# Manual fixes for names that were garbled or spelled differently in the
# parsed data. Each entry is (criteria passed to correct_name, columns to set).
# Entries are applied in the listed order.
#
# The 'CathyMcMorris,of' entry must come BEFORE the plain 'Cathy' entry:
# correct_name matches first names by substring, so the 'Cathy' entry also
# matches those rows and renames their Last Name away from 'Rodgers', after
# which the more specific entry could never match and the garbled First Name
# would be left in place.
name_corrections = [
    ({'first_name': 'CathyMcMorris,of', 'last_name': 'Rodgers'},
     {'Last Name': 'McMorris Rodgers', 'First Name': 'Cathy'}),
    ({'first_name': 'Cathy', 'last_name': 'Rodgers'}, {'Last Name': 'McMorris Rodgers'}),
    ({'last_name': 'AMASH'}, {'Last Name': 'Amash'}),
    ({'last_name': 'BluntRochester'}, {'Last Name': 'Blunt Rochester'}),
    ({'last_name': 'HerreraBeutler'}, {'Last Name': 'Herrera Beutler'}),
    ({'last_name': 'JacksonLee'}, {'Last Name': 'Jackson Lee'}),
    ({'last_name': 'Luján'}, {'Last Name': 'Lujan'}),
    ({'last_name': 'SanNicolas'}, {'Last Name': 'San Nicolas'}),
    ({'last_name': 'TorresSmall'}, {'Last Name': 'Torres Small'}),
    ({'last_name': 'VanDrew'}, {'Last Name': 'Van Drew'}),
    ({'last_name': 'WassermanSchultz'}, {'Last Name': 'Wasserman Schultz'}),
    ({'last_name': 'WatsonColeman'}, {'Last Name': 'Watson Coleman'}),
    ({'last_name': 'LujanGrisham'}, {'Last Name': 'Lujan Grisham'}),
    ({'last_name': 'McMorrisRodgers'}, {'Last Name': 'McMorris Rodgers'}),
    ({'last_name': 'VanHollen'}, {'Last Name': 'Van Hollen'}),
    ({'last_name': 'NegreteMcLeod'}, {'Last Name': 'Negrete McLeod'}),
    ({'last_name': 'Barragan'}, {'Last Name': 'Barragán'}),
    ({'last_name': 'Gonzalez-Colon'}, {'Last Name': 'González-Colón'}),
    ({'last_name': 'Gutiérrez'}, {'Last Name': 'Gutierrez'}),
    ({'last_name': 'Cardenas'}, {'Last Name': 'Cárdenas'}),
    ({'last_name': 'Velazquez'}, {'Last Name': 'Velázquez'}),
]

for _criteria, _new_values in name_corrections:
    correct_name(df_committee, update_values=_new_values, **_criteria)

No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.
No matching records found.


In [None]:
def find_bioguide(rep_df, df_row, index):
    """Look up the bioguide ID for one committee-assignment row.

    Candidates are the ``rep_df`` rows whose ``startCongress``..``endCongress``
    range contains the row's Congress and whose ``lastName`` equals the row's
    last name. The candidates are then resolved, in order, by:

    1. last name alone,
    2. last name plus exact ``firstName`` match,
    3. last name plus exact ``state`` match (only when the row has a state
       and ``rep_df`` has a ``state`` column).

    A step resolves the lookup when it leaves exactly one distinct
    ``bioguideId``; several reference rows for the same ID therefore do not
    count as an ambiguity.

    Parameters
    ----------
    rep_df : pandas.DataFrame
        Reference table with ``startCongress``, ``endCongress``, ``lastName``,
        ``firstName`` and ``bioguideId`` columns.
    df_row : mapping or pandas.Series
        Must provide ``'Congress'``, ``'Last Name'``, ``'First Name'`` and
        ``'State'``.
    index : hashable
        Row label, used only in diagnostic output.

    Returns
    -------
    str or None
        The matching ``bioguideId``, or ``None`` when there is no candidate
        or the candidates cannot be narrowed to one ID (a diagnostic is
        printed in both cases).
    """
    congress = df_row['Congress']
    last = df_row['Last Name']
    first = df_row['First Name']
    state = df_row['State']

    serving = rep_df.loc[(rep_df['startCongress'] <= congress) & (rep_df['endCongress'] >= congress)]
    by_last = serving.loc[serving['lastName'] == last]

    if by_last.empty:
        print(f"No match for : {last}")
        print(index)
        print(len(by_last))
        return None

    # Progressively narrower candidate sets; the first one that pins down a
    # single distinct ID wins.
    candidate_sets = [by_last, by_last.loc[by_last['firstName'] == first]]
    if pd.notna(state) and 'state' in by_last.columns:
        candidate_sets.append(by_last.loc[by_last['state'] == state])

    for candidates in candidate_sets:
        ids = candidates['bioguideId'].drop_duplicates()
        if len(ids) == 1:
            return ids.iloc[0]

    print(f"Multiple for : {last}")
    print(index)
    print(len(by_last))
    return None
# Resolve the bioguide ID for every row first, then attach the results as one
# column, so df_committee is not modified while iterrows() is walking it.
df_committee['bioguideID'] = [
    find_bioguide(rep_df, row, index) for index, row in df_committee.iterrows()
]

Representative              Velazquez
Committee         Financial Services.
Rank                             None
Congress                          115
Last Name                   Velázquez
First Name                       None
Middle Initial                   None
State                            None
bioguideID                       None
Name: 2530, dtype: object


In [306]:
# Drop rows that are identical across every column; the surviving rows keep
# their original index labels (the index is not reset).
df_committee = df_committee.drop_duplicates()
df_committee

Unnamed: 0,Representative,Committee,Rank,Congress,Last Name,First Name,Middle Initial,State,bioguideID
0,Adams,Agriculture.,,117,Adams,,,,A000370
1,Adams,Education and Labor.,,117,Adams,,,,A000370
2,Adams,Financial Services.,,117,Adams,,,,A000370
3,Aderholt,Appropriations.,,117,Aderholt,,,,A000055
4,Aguilar,Appropriations.,,117,Aguilar,,,,A000371
...,...,...,...,...,...,...,...,...,...
5884,"Young, C. W. Bill, of Florida",Appropriations.,,112,Young,C.,W,Florida,Y000031
5885,"Young, Don, of Alaska",Natural Resources.,,112,Young,Don,,Alaska,Y000033
5886,"Young, Don, of Alaska",Transportation and Infrastructure.,,112,Young,Don,,Alaska,Y000033
5887,"Young, Todd C., of Indiana",Armed Services.,,112,Young,Todd,C,Indiana,Y000064


In [307]:
# Save the final table to Excel, omitting the DataFrame index column.
df_committee.to_excel('../data/committee_assignments.xlsx',index=False)