In [12]:
%pip install pandas openpyxl matplotlib rapidfuzz
import pandas as pd
import matplotlib.pyplot as plt
import os
import difflib
from difflib import get_close_matches


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 43.7 MB/s eta 0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
from rapidfuzz import process, fuzz

# Purpose: attach legacy St. Louis contact details to the accounts in the new
# AG List by fuzzy-matching each 'Account Name' against the legacy 'Firm Name',
# then write matched + unmatched accounts to a single workbook.
#
# Outputs left in the kernel: old_df, new_df (both unmodified, as loaded),
# matches, matched_indices, matched_df, unmatched_df, final_df.

# Define file paths
old_file_path = '../data/stl/St. Louis Contacts.xlsx'
new_file_path = '../data/stl/AG List.xlsx'
output_file_path = '../data/stl/Merged_Contacts.xlsx'

# Load the Excel files
old_df = pd.read_excel(old_file_path, engine='openpyxl')
new_df = pd.read_excel(new_file_path, engine='openpyxl')

# Build the normalized matching keys as standalone Series instead of
# overwriting the source columns: the frames keep their original casing, so
# nothing has to be "restored" later and no helper columns reach the output.
# Missing names are dropped *before* the string cast; otherwise NaN becomes the
# literal text 'nan' and every blank account matches every blank firm at 100.
firm_keys = old_df['Firm Name'].dropna().astype(str).str.strip().str.lower()
firm_keys = firm_keys[firm_keys != '']
account_keys = new_df['Account Name'].dropna().astype(str).str.strip().str.lower()
account_keys = account_keys[account_keys != '']

# Plain lists make the third element returned by extractOne an unambiguous
# position; firm_labels maps that position back to the old_df index label.
firm_labels = firm_keys.index.tolist()
firm_names = firm_keys.tolist()

# Columns present in both sheets get an '_old' suffix on the legacy side so the
# merged frame has unique column labels (and so 'Contact_old' actually exists
# for the fill step below).
shared_cols = old_df.columns.intersection(new_df.columns)
old_suffixed = old_df.rename(columns={col: f'{col}_old' for col in shared_cols})

# Fuzzy match with high threshold
match_threshold = 95
matches = []             # (new_df index label, old_df index label) pairs
matched_indices = set()  # new_df index labels that found a match

for idx, account_name in account_keys.items():
    # score_cutoff lets rapidfuzz abandon weak candidates early and makes
    # extractOne return None when nothing reaches the threshold.
    match_result = process.extractOne(
        account_name,
        firm_names,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=match_threshold,
    )
    if match_result is None:
        continue
    _, _, firm_pos = match_result
    matches.append((idx, firm_labels[firm_pos]))
    matched_indices.add(idx)

# Assemble matched rows in one shot (side-by-side column concat) rather than
# building one object-dtype Series per match; this also works with zero matches.
matched_new_labels = [new_label for new_label, _ in matches]
matched_old_labels = [old_label for _, old_label in matches]
matched_df = pd.concat(
    [
        new_df.loc[matched_new_labels].reset_index(drop=True),
        old_suffixed.loc[matched_old_labels].reset_index(drop=True),
    ],
    axis=1,
)

# Fill missing 'Contact' values with data from the old dataset if available
if 'Contact' in matched_df.columns and 'Contact_old' in matched_df.columns:
    matched_df['Contact'] = matched_df['Contact'].combine_first(matched_df['Contact_old'])

# Append unmatched rows (accounts with a blank name are always unmatched)
unmatched_df = new_df[~new_df.index.isin(matched_indices)]
final_df = pd.concat([matched_df, unmatched_df], ignore_index=True)

# Save result
final_df.to_excel(output_file_path, index=False)

print(f"Merged file saved with {len(matched_df)} matched rows and {len(unmatched_df)} unmatched rows.")




Merged file saved with 438 matched rows and 5015 unmatched rows.


In [10]:
import pandas as pd
from rapidfuzz import process, fuzz

# Purpose: attach legacy St. Louis contact details to the accounts on the
# 'STL Accounts' sheet of the 7.21.25 prospect list by fuzzy-matching the
# account-name column against the legacy 'Firm Name', then write matched +
# unmatched accounts to a single workbook.
#
# Outputs left in the kernel: old_df, new_df (column names cleaned, data
# untouched), account_col, matches, matched_indices, matched_df, unmatched_df,
# final_df.

# Define file paths
old_file_path = '../data/stl/St. Louis Contacts.xlsx'
new_file_path = '../data/stl/STL Prospect List 7.21.25.xlsx'
output_file_path = '../data/stl/Merged_Prospects_7.21.25.xlsx'

# Load the Excel files
old_df = pd.read_excel(old_file_path, engine='openpyxl')
new_df = pd.read_excel(new_file_path, sheet_name='STL Accounts', engine='openpyxl')

# Clean column names: trim, and turn non-breaking spaces / embedded newlines
# into ordinary spaces (regex=False: these are literal replacements).
old_df.columns = (
    old_df.columns.str.strip()
    .str.replace('\u00A0', ' ', regex=False)
    .str.replace('\n', ' ', regex=False)
)
new_df.columns = (
    new_df.columns.str.strip()
    .str.replace('\u00A0', ' ', regex=False)
    .str.replace('\n', ' ', regex=False)
)

# Identify the best match for 'Account Name' column. A minimum score stops an
# unrelated header from being silently used when the sheet layout changes.
column_match_threshold = 80
column_match = process.extractOne(
    'Account Name',
    list(new_df.columns),
    scorer=fuzz.ratio,
    score_cutoff=column_match_threshold,
)
if column_match is None:
    raise ValueError(
        f"No column resembling 'Account Name' found; available columns: {list(new_df.columns)}"
    )
account_col = column_match[0]
print(f"Using column '{account_col}' as 'Account Name'")

# Build the normalized matching keys as standalone Series so no helper columns
# ('... Lower', '... Original') are added to the frames or written to the
# output file. Missing names are dropped *before* the string cast; otherwise
# NaN becomes the literal text 'nan' and blank accounts match blank firms at 100.
firm_keys = old_df['Firm Name'].dropna().astype(str).str.strip().str.lower()
firm_keys = firm_keys[firm_keys != '']
account_keys = new_df[account_col].dropna().astype(str).str.strip().str.lower()
account_keys = account_keys[account_keys != '']

# Plain lists make the third element returned by extractOne an unambiguous
# position; firm_labels maps that position back to the old_df index label.
firm_labels = firm_keys.index.tolist()
firm_names = firm_keys.tolist()

# Columns present in both sheets get an '_old' suffix on the legacy side so the
# merged frame never carries duplicate column labels.
shared_cols = old_df.columns.intersection(new_df.columns)
old_suffixed = old_df.rename(columns={col: f'{col}_old' for col in shared_cols})

# Fuzzy match with high threshold
match_threshold = 95
matches = []             # (new_df index label, old_df index label) pairs
matched_indices = set()  # new_df index labels that found a match

for idx, account_name in account_keys.items():
    # score_cutoff lets rapidfuzz abandon weak candidates early and makes
    # extractOne return None when nothing reaches the threshold.
    match_result = process.extractOne(
        account_name,
        firm_names,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=match_threshold,
    )
    if match_result is None:
        continue
    _, _, firm_pos = match_result
    matches.append((idx, firm_labels[firm_pos]))
    matched_indices.add(idx)

# Assemble matched rows in one shot (side-by-side column concat) rather than
# building one object-dtype Series per match; this also works with zero matches.
matched_new_labels = [new_label for new_label, _ in matches]
matched_old_labels = [old_label for _, old_label in matches]
matched_df = pd.concat(
    [
        new_df.loc[matched_new_labels].reset_index(drop=True),
        old_suffixed.loc[matched_old_labels].reset_index(drop=True),
    ],
    axis=1,
)

# Append unmatched rows from new_df (accounts with a blank name are always unmatched)
unmatched_df = new_df[~new_df.index.isin(matched_indices)]
final_df = pd.concat([matched_df, unmatched_df], ignore_index=True)

# Save result
final_df.to_excel(output_file_path, index=False)

print(f"Merged file saved with {len(matched_df)} matched rows and {len(unmatched_df)} unmatched rows.")



Using column 'Account Name' as 'Account Name'
Merged file saved with 497 matched rows and 630 unmatched rows.
