# Location Normalization for full_df_final.csv

This notebook applies the location normalization rules from `LLMtoDatabase.py` to the `event_loc` column of `full_df_final.csv`.

In [1]:
import pandas as pd
import re
import os

In [2]:
class LocationNormalizer:
    """Normalize free-text location strings against a table of provinces and cities.

    The lookup table is read from a CSV file with a '도' (province) column and a
    '시' (city) column. Each name is reduced to a suffix-less search key, and a
    location string is matched by checking which keys occur in it as substrings.
    """

    # Administrative-unit suffixes removed from a name to form its search key.
    # At most one suffix is removed per name.
    _ADMIN_SUFFIXES = ('구역', '도', '시', '군')

    def __init__(self, cities_csv_path='data/nk_cities.csv'):
        """Load the cities table and build the province/city lookup maps.

        Args:
            cities_csv_path: Path to the cities CSV, read with 'euc-kr' encoding.

        Any failure while loading or building is reported with a printed
        warning and leaves the instance with empty maps; in that state
        `map_location_normalized` passes strings through unchanged.
        """
        # Load nk_cities and build maps for normalization
        try:
            self.nk_cities = pd.read_csv(cities_csv_path, encoding='euc-kr')
            # Broad region name -> the specific provinces it covers.
            self.BROAD_TERMS_MAP = {
                "평안도": ["평안남도", "평안북도"],
                "함경도": ["함경남도", "함경북도"],
                "황해도": ["황해남도", "황해북도"]
            }
            self.provinces_map, self.cities_map = self._build_maps()
            print("Normalization maps built successfully.")
        except Exception as e:
            # Deliberate best-effort: report the problem and fall back to empty maps.
            print(f"Warning: Failed to load nk_cities.csv or build maps. Normalization will be skipped. Error: {e}")
            self.nk_cities = None
            self.provinces_map = {}
            self.cities_map = {}
            self.BROAD_TERMS_MAP = {}

    def _get_search_keys(self, name):
        """Return `(search_keys, canonical_name)` for one province or city name.

        Parentheses separate alternative spellings, e.g. "나선시(라선시)" gives
        the parts ["나선시", "라선시"]; the first part is the canonical name.
        Each part becomes a search key after one trailing administrative
        suffix ('구역', '도', '시' or '군') is removed.

        Returns `([], None)` when `name` is missing, not a string, or contains
        no usable text.
        """
        if not isinstance(name, str):
            return [], None

        parts = [p.strip() for p in re.split(r'[()]', name) if p.strip()]
        if not parts:
            # Empty or parenthesis-only name: nothing to index.
            return [], None

        canonical_name = parts[0]  # The first part is the canonical name

        keys = []
        for part in parts:
            key = part
            for suffix in self._ADMIN_SUFFIXES:
                if key.endswith(suffix):
                    # Remove the whole suffix (two characters for '구역').
                    key = key[:-len(suffix)]
                    break
            # An empty key would be a substring of every location string.
            if key:
                keys.append(key)
        return keys, canonical_name

    def _build_maps(self):
        """Build the lookup maps from `self.nk_cities`.

        Returns:
            A tuple `(provinces_map, cities_map)` where
            `provinces_map` maps search_key -> canonical province name and
            `cities_map` maps search_key -> {'full': canonical city name,
            'province': canonical province name or None}.
        """
        provinces_map = {}
        cities_map = {}

        for _, row in self.nk_cities.iterrows():
            # Process Province
            p_keys, p_canon = self._get_search_keys(row['도'])
            for k in p_keys:
                provinces_map[k] = p_canon

            # Process City
            c_keys, c_canon = self._get_search_keys(row['시'])
            for k in c_keys:
                cities_map[k] = {
                    'full': c_canon,
                    'province': p_canon  # None when the row has no province
                }

        # Manual additions for abbreviations and broader terms
        abbr_map = {
            '평남': '평안남도',
            '평북': '평안북도',
            '함남': '함경남도',
            '함북': '함경북도',
            '황남': '황해남도',
            '황북': '황해북도',
            '양강': '양강도',
            '자강': '자강도',
            '강원': '강원도',
            '평안도': '평안도',  # Broader term
            '황해도': '황해도',  # Broader term
            '함경도': '함경도',  # Broader term
            '평안': '평안도'  # Bare "평안" is treated as the broader term
        }
        provinces_map.update(abbr_map)

        return provinces_map, cities_map

    def map_location_normalized(self, loc_str):
        """Normalize a free-text location string.

        Args:
            loc_str: Raw location text; missing or non-string values are accepted.

        Returns:
            A ', '-joined, sorted string of the matched locations, where a city
            is written "Province City" when its province is known and "City"
            otherwise. Returns None for missing/non-string input or when
            nothing matches. When no lookup maps are available (the cities
            table failed to load) the input is returned unchanged, so a failed
            load skips normalization instead of blanking every value.
        """
        if not isinstance(loc_str, str):
            return None

        if not self.provinces_map and not self.cities_map:
            # Nothing to normalize against: pass the value through untouched.
            return loc_str

        # 1. Provinces whose search key occurs in the text
        found_provinces = {
            full_name
            for key, full_name in self.provinces_map.items()
            if key in loc_str
        }

        # 2. Cities whose search key occurs in the text
        found_cities = [
            info for key, info in self.cities_map.items() if key in loc_str
        ]

        # 3a. Provinces implied by the matched cities
        implied_provinces = {
            c['province'] for c in found_cities if pd.notna(c['province'])
        }

        # 3b. A province already carried by one of its cities is not listed alone
        standalone_provinces = found_provinces - implied_provinces

        # 3c. Drop a broad term when one of its specific provinces is present
        present_provinces = standalone_provinces | implied_provinces
        final_provinces = {
            p for p in standalone_provinces
            if not any(
                specific in present_provinces
                for specific in self.BROAD_TERMS_MAP.get(p, ())
            )
        }

        # 4. Format output: remaining provinces plus "Province City" / "City"
        final_results = set(final_provinces)
        for c in found_cities:
            if pd.notna(c['province']):
                final_results.add(f"{c['province']} {c['full']}")
            else:
                final_results.add(c['full'])

        if not final_results:
            return None

        return ', '.join(sorted(final_results))

In [3]:
# Input CSV to normalize and the path the normalized copy is written to
input_file = 'data/full_df_final.csv'
output_file = 'data/event_loc_normalized.csv'

# Read the input when it is present; otherwise report it and leave df as None
# so later cells can detect that there is nothing to process.
if not os.path.exists(input_file):
    print(f"Error: {input_file} not found.")
    df = None
else:
    df = pd.read_csv(input_file)
    print(f"Loaded {input_file} with {len(df)} rows.")

Loaded data/full_df_final.csv with 19141 rows.


In [4]:
if df is not None:
    # Build the normalizer (loads the cities table from its default path)
    normalizer = LocationNormalizer()

    # Normalize every value; the method itself returns None for missing or
    # non-string entries, so it is passed to apply directly.
    print("Normalizing 'event_loc' column...")
    df['event_loc_normalized'] = df['event_loc'].apply(normalizer.map_location_normalized)

    # Show raw and normalized values side by side before the raw column is replaced
    print("Sample of normalized locations:")
    comparison_columns = ['event_loc', 'event_loc_normalized']
    print(df[comparison_columns].head(10))

    # The saved file carries the normalized values in 'event_loc' itself:
    # pop removes the temporary column from df and hands it back for assignment.
    df['event_loc'] = df.pop('event_loc_normalized')

    # Write the result to a separate file, leaving the input CSV untouched
    df.to_csv(output_file, index=False)
    print(f"Saved normalized data to {output_file}")

Normalization maps built successfully.
Normalizing 'event_loc' column...
Sample of normalized locations:
                        event_loc              event_loc_normalized
0                       평안북도, 신의주                         평안북도 신의주시
1                             NaN                              None
2                             개성시                               개성시
3             평양시, 혜산시, 원산시, 사리원시  강원도 원산시, 양강도 혜산시, 평양시, 황해북도 사리원시
4  평안남도, 황해북도, 강원도, 양강도, 나선시, 개성시    강원도, 개성시, 나선시, 양강도, 평안남도, 황해북도
5                            평안남도                              평안남도
6                             평양시                               평양시
7                             양강도                               양강도
8              평양시, 함경도, 평안도, 양강도                양강도, 평안도, 평양시, 함경도
9                        강원도, 고성시                               강원도
Saved normalized data to data/full_df_final_normalized.csv
