In [None]:
# Input table containing the 'Speaker' column; loaded with pd.read_csv.
input_csv_path = "./preprocessing/agg_outputs"
# Destination of the party-annotated table written with DataFrame.to_csv.
# NOTE(review): this is an absolute path at the filesystem root and has no
# file extension, unlike the relative input path — confirm it is intended.
output_csv_path = "/agg_party"
# Lookup table providing the 'Name' and 'Party Name' columns used for matching.
party_lookup_path = "metadata_mapping_16_17_18.csv"

In [None]:
import pandas as pd
import re
from rapidfuzz import process, fuzz

# Load the speaker table and the name -> party lookup table from the
# configured paths.
agg_file = pd.read_csv(input_csv_path)
metadata_mapper = pd.read_csv(party_lookup_path)

In [None]:
# NOTE(review): this install cell comes after the cell that imports rapidfuzz;
# on a fresh environment it has to run before that import can succeed.
%pip install rapidfuzz


In [None]:
import unicodedata

# Clean name function with Unicode normalization
def clean_name(name):
    """Normalize a raw name into a canonical key for matching.

    The text before the first ``|`` is kept (the first of several
    pipe-separated variants), diacritics are stripped via NFKD
    decomposition, the result is upper-cased, every character that is not
    A-Z or whitespace is dropped, and whitespace runs are collapsed to a
    single space.

    Args:
        name: Raw name. Non-string values are converted with ``str``.

    Returns:
        The cleaned upper-case name, or ``""`` when ``name`` is ``None`` or
        a float NaN (e.g. a missing CSV cell).
    """
    # Missing values would otherwise be stringified to "NONE"/"NAN" and then
    # treated as real names. `name != name` is true only for NaN.
    if name is None or (isinstance(name, float) and name != name):
        return ''

    first_variant = str(name).split('|')[0]
    decomposed = unicodedata.normalize('NFKD', first_variant)
    # Dropping combining marks removes the diacritics (e.g. "Ā" -> "A").
    without_marks = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    upper = without_marks.upper()
    # Keep all whitespace here (not only ' ') so tabs and newlines still
    # separate words instead of being deleted and gluing neighbours together.
    letters_only = re.sub(r'[^A-Z\s]', '', upper)
    return re.sub(r'\s+', ' ', letters_only).strip()

# Manual speaker -> party overrides, keyed by names as written before cleaning.
# NOTE(review): party labels mix full names ("Bharatiya Janata Party") and
# abbreviations ("BJP"); confirm which form the lookup table's 'Party Name'
# column uses.
# NOTE(review): "DEPUTY SPEAKER" maps to the string "None", not the None
# object, so it does not count as null in an isnull() check — confirm intended.
raw_manual_party_map = {
    "M VENKAIAH NAIDU": "Bharatiya Janata Party",
    "BHARTRUHARI MAHTAB": "Biju Janata Dal",
    "DEPUTY SPEAKER": "None",
    "ĒMA VAIṀKAIYYĀ NĀYAḌŪ | ĒMAVAIṀKAIYYĀ NĀYAḌŪ": "BJP",
    "NEIPHIU RIO": "NDPP",
    "RĀJĪVA PRATĀPA RŪḌĪ": "BJP",
    "NAVANĪTA RAVI RĀṆĀ": "BJP",
    "MĪNĀKṢĪ LĒKHĪ": "BJP"
}

# Re-key the overrides by cleaned name so they can be looked up with
# cleaned speaker names.
manual_party_map = dict(
    (clean_name(raw_key), party)
    for raw_key, party in raw_manual_party_map.items()
)

# Derive a normalized name column on each table so both sides are compared
# in the same cleaned form.
agg_file['Cleaned_Speaker'] = agg_file['Speaker'].map(clean_name)
metadata_mapper['Cleaned_Name'] = metadata_mapper['Name'].map(clean_name)

# fuzzy match function with manual override
def get_party(speaker_name, min_score=65):
    """Look up the party for a cleaned speaker name.

    Manual overrides in ``manual_party_map`` take precedence. Otherwise the
    name is fuzzy-matched (token-sort ratio) against
    ``metadata_mapper['Cleaned_Name']`` and the best match is accepted only
    when its score is strictly greater than ``min_score``.

    Args:
        speaker_name: Name already normalized with ``clean_name``.
        min_score: Score the best match must exceed to be accepted.
            Defaults to 65.

    Returns:
        The 'Party Name' value of the first lookup row whose cleaned name
        equals the best match, or ``None`` when there is no override and no
        acceptable match.
    """
    if speaker_name in manual_party_map:
        return manual_party_map[speaker_name]

    # An empty name carries no information; without this guard it can score
    # as a perfect match against an empty cleaned name in the lookup table.
    if not speaker_name:
        return None

    best = process.extractOne(
        speaker_name,
        metadata_mapper['Cleaned_Name'],
        scorer=fuzz.token_sort_ratio
    )
    # extractOne returns None when there is nothing to match against
    # (e.g. an empty lookup table); unpacking it directly would raise.
    if best is None:
        return None

    match, score, _ = best
    if score <= min_score:
        return None

    # Several lookup rows may share the same cleaned name; the first one wins.
    return metadata_mapper.loc[
        metadata_mapper['Cleaned_Name'] == match, 'Party Name'
    ].values[0]

# Resolve a party for every row from its cleaned speaker name.
agg_file['Party Name'] = agg_file['Cleaned_Speaker'].map(get_party)

# Report the original speaker names for which no party was found.
unmatched = agg_file.loc[agg_file['Party Name'].isna()]
if len(unmatched) > 0:
    print("Unmatched speakers:")
    print(unmatched['Speaker'].unique())

# The cleaned column is only a matching aid; remove it before writing out.
del agg_file['Cleaned_Speaker']
agg_file.to_csv(output_csv_path, index=False)


Unmatched speakers:
['CHAIRPERSON' 'SPEAKER']
