In [15]:
import pandas as pd
import os
from src.utils.results_utils import mapper
from fuzzywuzzy import fuzz, process

In [3]:
folder = "static"
df = pd.DataFrame()
for file in os.listdir(folder):
    if file.endswith(".xlsx"):
        try:
            temp = pd.read_excel(os.path.join(folder, file), sheet_name="Data")
        except:
            temp = pd.read_excel(os.path.join(folder, file), sheet_name="Sheet 1")
        df = pd.concat([df, temp])


In [4]:
df.ffill(inplace=True)

In [10]:
target = df[["Modified Field", "Maid’s Nationality", "Agent Value"]]

In [11]:
places = target[target["Modified Field"] == "Birth Place"]

In [16]:
# Get unique places and nationalities
places_unique = places[["Maid’s Nationality", "Agent Value"]].drop_duplicates()

# Normalize nationality to country code using mapper
def get_country_code(nationality):
    # Use fuzzy matching to find the closest key in mapper
    matched_country, score = process.extractOne(nationality, mapper.keys())
    return mapper.get(matched_country, "XXX")

places_unique["country_code"] = places_unique["Maid’s Nationality"].apply(get_country_code)

# Prepare DataFrame for custom_cities.csv format
places_to_add = places_unique[["Agent Value", "country_code"]].rename(columns={"Agent Value": "name"})

# Load existing custom_cities.csv
custom_cities_path = os.path.join(folder, "custom_cities.csv")
if os.path.exists(custom_cities_path):
    existing_cities = pd.read_csv(custom_cities_path)
else:
    existing_cities = pd.DataFrame(columns=["name", "country_code"])

# Find new places that are not in the existing custom_cities.csv
existing_places = set(existing_cities["name"].str.upper())
places_to_add["name"] = places_to_add["name"].str.upper()
new_places = places_to_add[~places_to_add["name"].isin(existing_places)]

# Combine existing and new places
combined_cities = pd.concat([existing_cities, new_places]).drop_duplicates(subset=["name"])

# Save updated custom_cities.csv
combined_cities.to_csv(custom_cities_path, index=False)

# Display how many new places were added
len(new_places)

3913

In [17]:
# Identify similar city names within each country code
from collections import defaultdict

# Set similarity threshold
similarity_threshold = 85  # Adjust as needed (higher = stricter matching)

# Group places by country code
country_groups = places_to_add.groupby('country_code')

# Dictionary to store similar place clusters for each country
similar_places = defaultdict(list)

# Process each country's places
for country_code, group in country_groups:
    places_list = group['name'].tolist()
    
    # Skip processing if too few places
    if len(places_list) < 2:
        continue
    
    # Find similar places within this country
    processed = set()
    
    for i, place1 in enumerate(places_list):
        if place1 in processed:
            continue
            
        cluster = [place1]
        processed.add(place1)
        
        for place2 in places_list[i+1:]:
            if place2 in processed:
                continue
                
            # Calculate similarity ratio
            similarity = fuzz.ratio(place1, place2)
            
            # If similar enough, add to cluster
            if similarity >= similarity_threshold:
                cluster.append(place2)
                processed.add(place2)
        
        # Store cluster if it contains multiple places
        if len(cluster) > 1:
            similar_places[country_code].append(cluster)

# Preview results
for country, clusters in similar_places.items():
    print(f"Country: {country}")
    for i, cluster in enumerate(clusters, 1):
        print(f"  Cluster {i}: {cluster}")
    print()

# Create a DataFrame with all similar place clusters for easier inspection
similar_df_rows = []
for country, clusters in similar_places.items():
    for cluster in clusters:
        for place in cluster:
            similar_df_rows.append({
                'country_code': country,
                'place_name': place,
                'cluster_repr': cluster[0]  # Using first item as cluster representative
            })

similar_df = pd.DataFrame(similar_df_rows)

# Export to CSV for manual review
similar_df.to_csv(os.path.join(folder, "similar_places.csv"), index=False)

# Count total clusters found
total_clusters = sum(len(clusters) for clusters in similar_places.values())
print(f"Found {total_clusters} clusters of similar place names across {len(similar_places)} countries")

Country: ETH
  Cluster 1: ['ARSI', 'ARSSI']
  Cluster 2: ['TIGRAY', 'TIGIRAY']
  Cluster 3: ['SHIRE', 'SIRE', 'SHIRIE']
  Cluster 4: ['KATA', 'KAFTA']
  Cluster 5: ['HOSSAENA', 'HOSANA', 'HOSSANA', 'HOSSAHNA']
  Cluster 6: ['ADDIS ABABA', 'ADDIS ABEBA']
  Cluster 7: ['ASELA', 'ASSELA', 'ASELLA']
  Cluster 8: ['DEBREZEIT', 'DEBREZIET', 'DEBRE ZEIT', 'DEBRE ZIT', 'DEBREZEYT']
  Cluster 9: ['SHOA', 'SHOWA']
  Cluster 10: ['MOJO', 'MODJO']
  Cluster 11: ['EAST WOLLEGA', 'WEST WOLLEGA']
  Cluster 12: ['GUNCHIRE', 'GUNCHRE', 'GUNCHERE']
  Cluster 13: ['DEJEN', 'DEJJEN']
  Cluster 14: ['NORTH SHOA', 'NORTH SHEWA']
  Cluster 15: ['BISHOFTU', 'BISHEFTU']
  Cluster 16: ['DEBREBIRHAN', 'DEBREBRHAN', 'DEBRE BRHAN', 'DEBRE BIRHAN', 'DEBREBREHAN']
  Cluster 17: ['DIRE DAWA', 'DIREDAWA']
  Cluster 18: ['DOYO GENA', 'DOYOGENA']
  Cluster 19: ['GIMBCHU', 'GIMBICHU']
  Cluster 20: ['WADLA', 'WADA']
  Cluster 21: ['TACH ARMACHIHO', 'TACHARMACHIHO']
  Cluster 22: ['ADAMITULU', 'ADAMI TULU']
  Cluster 23: 

In [None]:
# After reviewing similar_places.csv, create a mapping of misspelled names to correct names
# For example:
corrections = {
    # 'MISSPELLED': 'CORRECT',
    # 'NAIROBE': 'NAIROBI',
    # Add your corrections here
    "ADDIS ABEBA": "ADDIS ABABA",
}

# Apply corrections to the places_to_add DataFrame
places_to_add['name'] = places_to_add['name'].apply(lambda x: corrections.get(x, x))

# Remove duplicates after corrections
places_to_add = places_to_add.drop_duplicates(subset=['name', 'country_code'])

# Update combined_cities with corrected names
combined_cities = pd.concat([existing_cities, places_to_add]).drop_duplicates(subset=['name'])

# Save updated custom_cities.csv
combined_cities.to_csv(custom_cities_path, index=False)

print(f"Updated custom_cities.csv with corrections")