# Entity Name Normalization Tool

This notebook helps normalize the 'relevant_entity' field across all JSON datapoint files by:
1. Collecting all unique entity names
2. Allowing review and mapping creation
3. Applying the normalization to all files

In [2]:
import os
import json
import pandas as pd
from collections import Counter
import glob
from IPython.display import display, HTML

## 1. Collect All Unique Entity Names

In [3]:
# Define the base directory containing all datapoint files
base_dir = "../Data/Ports"

# Function to find all JSON datapoint files
def find_json_files(base_dir):
    json_files = []
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith("_Datapoints.json"):
                json_files.append(os.path.join(root, file))
    return json_files

# Get all JSON files
json_files = find_json_files(base_dir)
print(f"Found {len(json_files)} JSON datapoint files")

Found 8 JSON datapoint files


In [4]:
# Extract all unique entity names and their counts
entity_counter = Counter()
file_entities = {}

for json_file in json_files:
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        # Track entities in this file
        file_entities[json_file] = []
        
        for item in data:
            if isinstance(item, dict) and 'relevant_entity' in item:
                entity = item['relevant_entity']
                if entity:  # Skip empty values
                    entity_counter[entity] += 1
                    file_entities[json_file].append(entity)
    except Exception as e:
        print(f"Error processing {json_file}: {e}")
        
# Convert to DataFrame for better visualization
entities_df = pd.DataFrame({
    'Entity Name': list(entity_counter.keys()),
    'Count': list(entity_counter.values())
}).sort_values('Count', ascending=False).reset_index(drop=True)

print(f"Found {len(entities_df)} unique entity names across all files")
display(entities_df)

Found 664 unique entity names across all files


Unnamed: 0,Entity Name,Count
0,Seller,116
1,Buyer,107
2,Authority,22
3,"Ship Masters, Agents",22
4,General,21
...,...,...
659,Pontoon,1
660,Tug Request,1
661,Land-Based Fumigation,1
662,Ship-Based Fumigation,1


## 2. Analyze Entity Names and Create Normalization Mapping

In [5]:
# Create a normalization mapping DataFrame
normalization_df = entities_df.copy()
normalization_df['Normalized Name'] = normalization_df['Entity Name']

# Add suggestions for similar names based on string matching
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def get_similar_entities(entity, threshold=0.8):
    similar_entities = []
    for other_entity in entities_df['Entity Name']:
        if entity != other_entity and similar(entity, other_entity) > threshold:
            similar_entities.append(other_entity)
    return similar_entities

normalization_df['Similar Entities'] = normalization_df['Entity Name'].apply(get_similar_entities)
display(normalization_df[['Entity Name', 'Count', 'Normalized Name', 'Similar Entities']])

Unnamed: 0,Entity Name,Count,Normalized Name,Similar Entities
0,Seller,116,Seller,[]
1,Buyer,107,Buyer,[]
2,Authority,22,Authority,[]
3,"Ship Masters, Agents",22,"Ship Masters, Agents","[Ship Masters, Ship Agents]"
4,General,21,General,[]
...,...,...,...,...
659,Pontoon,1,Pontoon,[]
660,Tug Request,1,Tug Request,[]
661,Land-Based Fumigation,1,Land-Based Fumigation,[Ship-Based Fumigation]
662,Ship-Based Fumigation,1,Ship-Based Fumigation,[Land-Based Fumigation]


### 2.2. Analyze Entities dictionary with LLM

In [6]:
# Export the normalization DataFrame to CSV
import os

# Create the System directory if it doesn't exist
os.makedirs("../Data/System", exist_ok=True)

# Export the full normalization DataFrame with all columns
csv_path = "../Data/System/entity_normalization_full.csv"
normalization_df.to_csv(csv_path, index=False)
print(f"Full normalization data exported to {csv_path}")

# Export a simplified version with just the mapping information
mapping_csv_path = "../Data/System/entity_normalization_mapping.csv"
normalization_df[['Entity Name', 'Count', 'Normalized Name']].to_csv(mapping_csv_path, index=False)
print(f"Simplified normalization mapping exported to {mapping_csv_path}")

# Create a JSON version of the mapping for easy import
import json
mapping_dict = dict(zip(normalization_df['Entity Name'], normalization_df['Normalized Name']))
json_path = "../Data/System/entity_normalization_mapping.json"
with open(json_path, 'w') as f:
    json.dump(mapping_dict, f, indent=2)
print(f"JSON mapping exported to {json_path}")

# Display the first few rows of the mapping
mapping_df = normalization_df[['Entity Name', 'Count', 'Normalized Name']]
display(mapping_df.head(10))

Full normalization data exported to ../Data/System/entity_normalization_full.csv
Simplified normalization mapping exported to ../Data/System/entity_normalization_mapping.csv
JSON mapping exported to ../Data/System/entity_normalization_mapping.json


Unnamed: 0,Entity Name,Count,Normalized Name
0,Seller,116,Seller
1,Buyer,107,Buyer
2,Authority,22,Authority
3,"Ship Masters, Agents",22,"Ship Masters, Agents"
4,General,21,General
5,Vessel,21,Vessel
6,Shipping Agent/Vessel,20,Shipping Agent/Vessel
7,Port Master,16,Port Master
8,Yangshan Port,13,Yangshan Port
9,Vessel Operator (Departing),12,Vessel Operator (Departing)


### Manual Normalization

Review the suggestions and edit the 'Normalized Name' column based on your domain knowledge. Below are common patterns that might need normalization:

1. Case differences (VESSEL vs Vessel)
2. Abbreviations (Ship Agent vs SA)
3. Synonym variations (Client vs Customer)
4. With/without qualifiers (Vessel vs Container Vessel)

**Instructions:**
- Review the similar entities and determine proper normalized names
- Edit the code cell below to create your normalization mapping

In [8]:
# Replace the existing manual_mapping section with your comprehensive mapping

# Copy and paste your entire mapping dictionary here
manual_mapping = {
    # --- Core Roles ---
    'Seller': 'Seller',
    'Buyer': 'Buyer',
    'Customer': 'Customer',
    'CUSTOMER': 'Customer', # Case normalization
    'Client': 'Customer',  # Synonym normalization
    'Shipper': 'Shipper',
    'Shippers': 'Shipper', # Plural normalization
    'Consignee': 'Consignee',
    'Consignees': 'Consignee', # Plural normalization
    'Carrier': 'Carrier',
    'Importer': 'Importer',
    'Exporter': 'Exporter',
    'Importer/Exporter': 'Importer/Exporter',
    'Importers/Exporters': 'Importer/Exporter', # Variation normalization
    'Importer, Exporter': 'Importer/Exporter', # Variation normalization
    'Seller/Buyer': 'Seller/Buyer',
    'Trader/Business': 'Trader/Business',
    'Trader at Exit': 'Trader', # Simplified
    'Shipping Agent': 'Shipping Agent',
    'Shipping Agency': 'Shipping Agent', # Synonym normalization
    'Shipper/Agent': 'Shipper/Agent',
    'Vessel Agent': 'Vessel Agent',
    'Vessel/Agent': 'Vessel Agent', # Variation normalization
    'Vessel Agent/Importer': 'Vessel Agent/Importer',
    'Shipping Agent/Vessel': 'Shipping Agent/Vessel',
    'Shipping Agent/Terminal': 'Shipping Agent/Terminal',
    'Shipper/Freight Forwarder': 'Shipper/Freight Forwarder',
    'Shippers/Freight Forwarders': 'Shipper/Freight Forwarder', # Variation normalization
    'Shipper, Freight Forwarder': 'Shipper/Freight Forwarder', # Variation normalization
    'Shippers/Forwarders': 'Shipper/Freight Forwarder', # Variation normalization
    'Trucker': 'Trucker',
    'Truck Driver': 'Trucker', # Synonym normalization
    'Haulage Company': 'Haulage Company',
    'Pilot': 'Pilot',
    'Authorised Pilot': 'Pilot', # Qualification grouping
    'Unauthorized Pilot': 'Unauthorized Pilot', # Keep distinct - important status
    'Owner (Vessel)': 'Vessel Owner', # Reworded for clarity
    'Shipowners': 'Vessel Owner', # Synonym normalization
    'Shipowners, Managers, Operators': 'Shipowner/Manager/Operator', # Compound role
    'Master': 'Master', # Assuming Ship Master
    'Vessel Master': 'Master', # Synonym normalization
    'Ship Masters': 'Master', # Plural normalization
    'Captain/Skipper': 'Master', # Synonym normalization
    'Seafarer': 'Seafarer',
    'Seafarers': 'Seafarer', # Plural normalization
    'All Seafarers': 'Seafarer', # Qualification grouping
    'Seaman': 'Seafarer', # Synonym normalization (modern term)
    'Crew Member': 'Crew Member',
    'Crew Members': 'Crew Member', # Plural normalization
    'Vessel Crews': 'Crew Member', # Variation normalization
    'Harbour Craft Crew': 'Crew Member (Harbour Craft)', # Keep specific context
    'Passenger': 'Passenger',
    'Declarant': 'Declarant',
    'Declarants': 'Declarant', # Plural normalization
    'Cargo Declarers': 'Declarant', # Synonym normalization
    'Shippers, Cargo Declarers': 'Shipper/Declarant', # Compound Role
    'Cargo Owners/Operators': 'Cargo Owner/Operator',
    'Cargo Operators': 'Cargo Operator',
    'Dangerous Cargo Operators': 'Cargo Operator (Dangerous Goods)',
    'Terminal Operator': 'Terminal Operator',
    'Terminal Operators, Stevedores': 'Terminal Operator/Stevedore', # Compound Role
    'Terminal Operators, Yard Management': 'Terminal Operator (Yard Management)', # Specific function
    'Terminal Operators, Yard Management, Security Personnel': 'Terminal Operator (Yard/Security)', # Compound function
    'Terminal Operators, Truckers, Gate Operations': 'Terminal Operator/Trucker (Gate)', # Compound Role/Location
    'Wharf/Dock Operators': 'Wharf/Dock Operator',
    'Wharf/Dock Owner': 'Wharf/Dock Owner',
    'Wharf/Premises Owner/Operator': 'Wharf/Premises Owner/Operator',
    'Bunker Supplier': 'Bunker Supplier',
    'Bunker Company': 'Bunker Supplier', # Synonym normalization
    'LNG Bunker Supplier': 'Bunker Supplier (LNG)', # Specific type
    'Bunker Operator': 'Bunker Operator',
    'Bunker Permit Holder/Vessel': 'Bunker Permit Holder/Vessel',
    'Contractor': 'Contractor',
    'Contractor Employees': 'Contractor Employee',
    'Logistics Professional': 'Logistics Professional',
    'Logistics Professionals': 'Logistics Professional', # Plural normalization
    'Port User': 'Port User',
    'Port Users': 'Port User', # Plural normalization
    'EPC User': 'EPC User', # Keep specific system user
    'KING Berthing System Users': 'KING Berthing System User', # Specific system user
    'Various Port Stakeholders': 'Port Stakeholder', # General term
    'Port Call Stakeholders': 'Port Stakeholder', # General term
    'Port Call Stakeholders (Optional)': 'Port Stakeholder', # General term

    # --- Authorities and Governance ---
    'Authority': 'Authority',
    'Relevant Authorities': 'Authority', # General term
    'Port Authority': 'Port Authority',
    'Port Authorities': 'Port Authority', # Plural normalization
    'Harbour Master': 'Harbour Master', # Specific role within Port Authority
    'Port Master': 'Harbour Master', # Synonym normalization
    'Port Authority - Environmental Compliance': 'Port Authority (Environmental)', # Specific function
    'Port Supervisory Authorities': 'Port Authority', # General term
    'Freeport of Riga Authority (FPRA)': 'Freeport of Riga Authority (FPRA)', # Specific Authority
    'Port of Rotterdam Authority': 'Port of Rotterdam Authority', # Specific Authority
    'MPA': 'MPA', # Specific Authority (likely Singapore)
    'Shanghai MSA': 'Shanghai MSA', # Specific Authority
    'Latvian Customs (VID)': 'Latvian Customs (VID)', # Specific Authority
    'Dutch Customs': 'Dutch Customs', # Specific Authority
    'German Customs': 'German Customs', # Specific Authority
    'Belgian Customs': 'Belgian Customs', # Specific Authority
    'Customs': 'Customs Authority', # General term
    'Customs Authorities': 'Customs Authority', # General term
    'Customs Administrations': 'Customs Authority', # General term
    'Customs/Trade': 'Customs Authority/Trade', # Compound
    'Customs/Importer': 'Customs Authority/Importer', # Compound
    'BMEL (Federal Ministry of Food and Agriculture)': 'BMEL', # Specific German Ministry
    'BAFA (Federal Office for Economic Affairs and Export Control)': 'BAFA', # Specific German Office
    'Immigration Authorities': 'Immigration Authority',
    'Immigration and Security Authorities': 'Immigration/Security Authority', # Compound
    'Port Health Authorities': 'Port Health Authority',
    'Flag States': 'Flag State',
    'Port States': 'Port State',
    'Contracting Governments': 'Contracting Government',
    'Municipal Executive': 'Municipal Executive',
    'Court': 'Court',
    'High Court': 'High Court',
    'Director of Marine': 'Director of Marine', # Specific title
    'Deputy Chairperson': 'Deputy Chairperson', # Specific title
    'Chairperson': 'Chairperson', # Specific title
    'Chief Executive': 'Chief Executive', # Specific title
    'Member': 'Member', # Generic, context needed
    'Authority Officer': 'Authority Officer',
    'Authority Employee': 'Authority Employee',
    'Appointed Officer': 'Appointed Officer',
    'Berthing Master': 'Berthing Master',
    'Port State Control Officers': 'Port State Control Officer',

    # --- Vessels and Crafts ---
    'Vessel': 'Vessel',
    'Vessels': 'Vessel', # Plural normalization
    'Ship': 'Vessel', # Synonym normalization
    'Ships': 'Vessel', # Synonym normalization
    'Every Vessel': 'Vessel', # Qualification grouping
    'All Vessels': 'Vessel', # Qualification grouping
    'Certain Vessels': 'Vessel', # Qualification grouping
    'Larger Vessels': 'Vessel', # Qualification grouping
    'High-Risk Vessels': 'Vessel (High Risk)', # Keep qualifier - important status
    'Foreign Vessels': 'Vessel (Foreign)', # Keep qualifier
    'Moored Vessel': 'Vessel', # State grouping
    'Detained Vessel': 'Vessel (Detained)', # Keep qualifier - important status
    'Arrested Vessel': 'Vessel (Arrested)', # Keep qualifier - important status
    'Vessel in Distress': 'Vessel (In Distress)', # Keep qualifier - important status
    'Vessel (Arriving)': 'Vessel', # State grouping
    'Vessel (Departing)': 'Vessel', # State grouping
    'Vessel (First Call)': 'Vessel', # State grouping
    'Vessel (Calling at Port)': 'Vessel', # State grouping
    'Vessel (Using Port)': 'Vessel', # State grouping
    'Vessel (in Pilotage District)': 'Vessel', # State grouping
    'Vessel (Moving within Pilotage District)': 'Vessel', # State grouping
    'Vessel (First Call/Changed Certificates)': 'Vessel', # State grouping
    'Vessel with Height Change': 'Vessel', # State grouping
    'Vessel Class (Minister Exempted)': 'Vessel (Exempted)', # Keep qualifier - important status
    'Vessel from Non-Consular Country': 'Vessel (Non-Consular Country)', # Keep qualifier
    'Vessel of Charged Owner/Master': 'Vessel (Charged Owner/Master)', # Keep qualifier - important status
    'Non-Compliant Owner/Master Vessel': 'Vessel (Non-Compliant Owner/Master)', # Keep qualifier - important status
    'Vessel Suspected of Cable Damage': 'Vessel (Suspected Cable Damage)', # Keep qualifier - specific event
    'Vessels with Machinery Failures': 'Vessel (Machinery Failure)', # Keep qualifier - important status
    'Vessels with Non-Compliant Equipment': 'Vessel (Non-Compliant Equipment)', # Keep qualifier - important status
    'Ships Arriving at Port': 'Vessel', # State grouping
    'Ships Arriving and Departing Ports': 'Vessel', # State grouping
    'Ships at Yangshan Port': 'Vessel', # Location grouping (redundant if context is Yangshan)
    'Vessels Operating at Yangshan': 'Vessel', # Location grouping
    'Vessels in Yangshan Port': 'Vessel', # Location grouping
    'Vessels Entering Yangshan Port': 'Vessel', # Location grouping
    'Vessels Navigating Yangshan Port': 'Vessel', # Location grouping
    'Vessels Transiting Yangshan Waters': 'Vessel', # Location grouping
    'Vessels Calling at Shanghai Ports': 'Vessel', # Location grouping
    'Vessels in Channels/Fairways': 'Vessel', # Location grouping
    'Vessels in Narrow Channels': 'Vessel', # Location grouping
    'Vessels using Traffic Separation Schemes': 'Vessel', # Location grouping
    'Vessel in TSS': 'Vessel', # Location grouping
    'Vessels Crossing Main Fairway': 'Vessel', # Action grouping
    'Vessels Crossing Jinshan Fairway': 'Vessel', # Action grouping
    'Vessels Not Using Channels': 'Vessel', # Action grouping
    'Vessels Navigating Precautionary Area': 'Vessel', # Location grouping
    'Vessels Navigating in Poor Visibility': 'Vessel', # Condition grouping
    'Vessels in Main Fairway and West of Huangzeyang': 'Vessel', # Location grouping
    'Vessels West of Tangnaoshan to Jinshan': 'Vessel', # Location grouping
    'Vessels in Jiaxing Waters': 'Vessel', # Location grouping
    'Vessels East of Huangzeyang Light-vessel': 'Vessel', # Location grouping
    'Vessel Approaching from West': 'Vessel', # Action grouping
    'Vessel Approaching from East': 'Vessel', # Action grouping
    'Vessel in Petroleumhaven': 'Vessel (Petroleumhaven)', # Keep specific location context
    'Ships involved in International Trade': 'Vessel (International Trade)', # Keep qualifier
    'International Voyage Ships': 'Vessel (International Voyage)', # Keep qualifier
    'Ships (certain sizes, international voyages)': 'Vessel (International Voyage)', # Keep qualifier
    'Vessel > 300 GT': 'Vessel (>300 GT)', # Keep qualifier
    'Deep-Draught Vessels': 'Vessel (Deep Draught)',
    'Deep-Draft Vessels': 'Vessel (Deep Draught)', # Variation normalization
    'Deep-Draught Vessels (Geulers/Channel Vessels)': 'Vessel (Deep Draught)', # Qualification grouping
    'Overtaking Vessel': 'Vessel (Overtaking)', # Keep qualifier - specific action/rule context
    'Power-driven Vessels': 'Vessel (Power-driven)', # Keep qualifier - specific rule context
    'Tanker': 'Tanker',
    'Single Hulled Tanker': 'Tanker (Single Hulled)',
    'Single-Hulled Tanker': 'Tanker (Single Hulled)', # Variation normalization
    'Sea-going Tanker (Outside Petroleumhaven)': 'Tanker (Sea-going)', # Location grouping
    'Tankers & Tank Barges (Hazardous/Polluting Goods)': 'Tanker/Tank Barge (Hazardous/Polluting)',
    'Container Vessel': 'Container Vessel',
    'Container Vessels': 'Container Vessel', # Plural normalization
    'Container Ships': 'Container Vessel', # Synonym normalization
    'Seagoing Container Vessel': 'Container Vessel (Sea-going)',
    'General Cargo Vessels (Container Ships)': 'Container Vessel', # Simplify
    'Inland Vessel': 'Inland Vessel',
    'Inland Vessels': 'Inland Vessel', # Plural normalization
    'Inland Vessels (Clean Engines)': 'Inland Vessel (Clean Engine)',
    'Inland Vessels (Clean Engines - NOx Reduction)': 'Inland Vessel (Clean Engine NOx Reduction)',
    'Inland Vessels (Clean Engines - Renewable Fuels)': 'Inland Vessel (Clean Engine Renewable Fuel)',
    'Green Award Certified Inland Vessels': 'Inland Vessel (Green Award)',
    'Non-Compliant Inland Vessels': 'Inland Vessel (Non-Compliant)',
    'Seagoing Vessel': 'Sea-going Vessel',
    'Seagoing Vessel (including Large Container Vessels)': 'Sea-going Vessel', # Qualification grouping
    'Warship': 'Warship',
    'Foreign Armed Forces': 'Foreign Armed Forces Vessel', # Clarify it's likely a vessel
    'Singapore Armed Forces Vessel': 'Singapore Armed Forces Vessel',
    'Government Vessel (Non-Commercial)': 'Government Vessel (Non-Commercial)',
    'Passenger Vessel': 'Passenger Vessel',
    'Ships with Passengers': 'Passenger Vessel', # Synonym normalization
    'Bunker Ship': 'Bunker Ship',
    'Harbour Craft': 'Harbour Craft',
    'Pontoon': 'Pontoon',
    'Vehicle': 'Vehicle', # Likely land vehicle
    'Heavy Trucks': 'Heavy Truck',

    # --- Cargo and Goods ---
    'Goods': 'Goods',
    'Cargo': 'Cargo', # Assuming distinct from Goods if both used
    'Container': 'Container',
    'Containerized Cargo (Import)': 'Containerized Cargo (Import)',
    'Containerized Cargo (Import/Export)': 'Containerized Cargo (Import/Export)',
    'Containerized Hazardous Goods Shipment (IMDG Class X)': 'Hazardous Goods Shipment (Containerized, IMDG Class X)', # Reworded
    'Dangerous Cargoes': 'Dangerous Goods', # Standard term
    'Hazardous Goods Shipment': 'Hazardous Goods Shipment',
    'Hazardous Chemical Shipments': 'Hazardous Chemical Shipment',
    'Hazardous Chemical Containers': 'Hazardous Chemical Container',
    'IMDG Code Chemicals': 'IMDG Code Chemical',
    'Catalog of Hazardous Chemicals (2015)': 'Catalog of Hazardous Chemicals (2015)', # Specific document
    'Companies Violating Chemical Ban': 'Company (Violating Chemical Ban)',
    'Individuals Responsible for Chemical Ban Violations': 'Individual (Chemical Ban Violation)',
    'Transportation of Prohibited Chemicals': 'Transportation (Prohibited Chemicals)',
    'Prohibition on Chemical Transportation': 'Prohibition (Chemical Transportation)',
    'Transportation of Highly Toxic/Dangerous Chemicals': 'Transportation (Highly Toxic/Dangerous Chemicals)',
    'Ships carrying packaged harmful substances': 'Vessel (Carrying Packaged Harmful Substances)',
    'Vessel Carrying Hazardous Cargo': 'Vessel (Carrying Hazardous Cargo)',
    'Vessel Carrying Dangerous Cargo': 'Vessel (Carrying Hazardous Cargo)', # Grouping Hazardous/Dangerous
    'Vessels Carrying Dangerous Bulk Goods': 'Vessel (Carrying Bulk Dangerous Goods)',
    'Vessel Discharging Residues': 'Vessel (Discharging Residues)',
    'Vessel with Oil Residues': 'Vessel (Oil Residues)',
    'Consignees/Agents of Hazardous Chemicals': 'Consignee/Agent (Hazardous Chemicals)',
    'Hazardous Cargo Shippers': 'Shipper (Hazardous/Dangerous Goods)', # Grouping Hazardous/Dangerous
    'Shippers of Dangerous Goods': 'Shipper (Hazardous/Dangerous Goods)', # Grouping Hazardous/Dangerous
    'Cargo Declarers for DG': 'Declarant (Dangerous Goods)',
    'Declarants (Hazardous Goods)': 'Declarant (Hazardous Goods)', # Keep distinct? Or group DG/Haz? Group for now. -> 'Declarant (Hazardous/Dangerous Goods)'
    'Declarant (Hazardous Goods)': 'Declarant (Hazardous/Dangerous Goods)', # Grouping
    'Personnel Preparing DGDs': 'Personnel (Preparing DGD)',
    'Shippers, DGD Preparers': 'Shipper/DGD Preparer', # Compound role
    'Personal Items Customs Declaration': 'Customs Declaration (Personal Items)',
    'Goods in Bonded Zone': 'Goods (Bonded Zone)',
    'Goods in Customs Sea Port': 'Goods (Customs Sea Port)',
    'Transit Cargo': 'Transit Cargo',
    'Excise Goods, Transit Cargo': 'Transit Cargo (Excise Goods)',
    'Cargo in Cabotage Pilot': 'Cargo (Cabotage Pilot)',
    'Cargo to Northern Ports': 'Cargo (Destination Northern Ports)',
    'Import Shipment': 'Import Shipment',
    'Export Shipment': 'Export Shipment',
    'Containerized Cargo (Import)': 'Containerized Cargo (Import)',
    'Recipient (Import Container)': 'Recipient (Import Container)',
    'Container Shipments Documentation List': 'Documentation List (Container Shipment)',
    'Container Shipments Customs Declaration': 'Customs Declaration (Container Shipment)',
    'Imports to Dalian, Tianjin, and Qingdao': 'Import (To Dalian/Tianjin/Qingdao)',
    'General Goods (Non-Hazardous) in Transit Warehouse (Non-Front Quay)': 'Goods (General, Non-Front Quay Warehouse)',
    'Packaged Hazardous Goods in Front Quay Transit Warehouse': 'Hazardous Goods (Packaged, Front Quay Warehouse)',

    # --- Locations and Facilities ---
    'Port': 'Port',
    'Ports': 'Port', # Plural normalization
    'Future Ports': 'Port (Future)', # Keep qualifier
    'Yangshan Port': 'Yangshan Port', # Specific Port
    'Port of Singapore': 'Port of Singapore', # Specific Port
    'Port of Rotterdam': 'Port of Rotterdam', # Specific Port
    'Port of Shanghai': 'Port of Shanghai', # Specific Port
    'Port of Riga': 'Port of Riga', # Specific Port
    'Port of Hamburg': 'Port of Hamburg', # Specific Port
    'Port of Antwerp-Bruges': 'Port of Antwerp-Bruges', # Specific Port
    'Chinese Ports': 'Port (China)', # General Location
    'Yangtze River Ports': 'Port (Yangtze River)', # General Location
    'Pilotage District': 'Pilotage District',
    'Wharf': 'Wharf',
    'Dock': 'Dock',
    'Berth': 'Berth', # Assuming 'Ship, Berth Usage' implies Berth
    'Ship, Berth Usage': 'Vessel/Berth Usage', # Standardized terms
    'Ship, Berth Usage Overtime': 'Vessel/Berth Usage (Overtime)', # Standardized terms
    'Premises': 'Premises',
    'Container Terminal': 'Container Terminal',
    'Container Terminals': 'Container Terminal', # Plural normalization
    'Automated Container Terminal': 'Container Terminal (Automated)',
    'Port of Riga Container Terminals': 'Container Terminal (Port of Riga)', # Specific Location
    'Riga Container Terminal (RCT) Gate': 'RCT Gate', # Specific Location/Facility
    'Riga Universal Terminal (ROT) Gate': 'ROT Gate', # Specific Location/Facility
    'Baltic Container Terminal (BCT) Gate': 'BCT Gate', # Specific Location/Facility
    'RCT': 'RCT', # Specific Terminal
    'ROT': 'ROT', # Specific Terminal (Riga Universal Terminal)
    'BCT': 'BCT', # Specific Terminal (Baltic Container Terminal)
    'Yangshan Port Terminals': 'Yangshan Port Terminal',
    'Container Terminal Facilities': 'Container Terminal Facility',
    'Terminal Operators, Yard Management': 'Terminal Operator (Yard Management)',
    'Export Container, Terminal Operator, Yard': 'Export Container/Terminal Operator/Yard', # Compound interaction
    'Import Container, Terminal Operator, Yard Cranes': 'Import Container/Terminal Operator/Yard Crane', # Compound interaction
    'Inland Container Terminal (ICT) Network': 'Inland Container Terminal Network',
    'Eurogeul Channel': 'Eurogeul Channel', # Specific Channel
    'Specialized Petroleumhavens (Nijlhaven, Yukonhaven)': 'Specialized Petroleumhaven', # Group specific names
    'Waste Reception Facility': 'Waste Reception Facility',
    'Port Reception Facilities': 'Port Reception Facility', # Plural normalization
    'Hazardous Goods Storage Facility': 'Hazardous Goods Storage Facility',
    'Front Quay Transit Warehouse': 'Front Quay Transit Warehouse',
    'Lighthouse': 'Lighthouse',
    'Beacon': 'Beacon',
    'Buoy': 'Buoy',
    'Navigation Aid Operators': 'Navigation Aid Operator',
    'Pontoon/Mooring Operators': 'Pontoon/Mooring Operator',
    'Private Mooring Operators': 'Private Mooring Operator',
    'Ship-to-Ship Transhipment Berths': 'Ship-to-Ship Transhipment Berth',
    'Shanghai Port Hinterland': 'Shanghai Port Hinterland',
    'Hinterland Connections': 'Hinterland Connection',

    # --- Processes, Services, and Concepts ---
    'Port Operations': 'Port Operation',
    'Berthing Operations': 'Berthing Operation',
    'Vessel Operations': 'Vessel Operation',
    'Ship Operations': 'Vessel Operation', # Synonym normalization
    'Maritime Activities at Yangshan Port': 'Maritime Activity (Yangshan)',
    'Maritime Operations at Yangshan Port': 'Maritime Operation (Yangshan)',
    'Port Activities': 'Port Activity',
    'Customs Clearance': 'Customs Clearance',
    'Customs Clearance Process': 'Customs Clearance', # Simplify
    'Customs Clearance, Importer': 'Customs Clearance/Importer', # Compound Process/Role
    'Customs Transit Procedure': 'Customs Transit Procedure',
    'Transit Procedure, Rail Transit': 'Transit Procedure (Rail)',
    'Rail Transit, Railway Undertaking': 'Rail Transit/Railway Undertaking', # Compound Process/Role
    'Bunkering Operation': 'Bunkering Operation',
    'LNG Bunker Operation': 'Bunkering Operation (LNG)',
    'Parties Involved in Bunkering': 'Party (Bunkering)', # General Role
    'Ship-to-Ship Transhipment Operations': 'Ship-to-Ship Transhipment',
    'Container Transhipment Operation': 'Container Transhipment',
    'Water-to-Water Transhipment': 'Water-to-Water Transhipment',
    'Dangerous Cargo Handling': 'Dangerous Goods Handling',
    'Hazardous Chemical Handling': 'Hazardous Chemical Handling',
    'Container Handling at Yangshan': 'Container Handling (Yangshan)',
    'Empty Container Logistics': 'Empty Container Logistics',
    'Empty Container Logistics Optimization': 'Empty Container Logistics Optimization',
    'ROT Services': 'ROT Service (General)', # Distinguish general from specific
    'ROT Services - Forklift': 'ROT Service (Specific)', # Group specific types
    'ROT Services - Portal Crane': 'ROT Service (Specific)', # Group specific types
    'ROT Services - Front Loader': 'ROT Service (Specific)', # Group specific types
    'ROT Services - Mobile Crane': 'ROT Service (Specific)', # Group specific types
    'ROT Services - Customs Formalities': 'ROT Service (Customs)', # Specific type by function
    'ROT Services - Ship Loading/Unloading': 'ROT Service (Ship Loading/Unloading)', # Specific type by function
    'ROT Services - Unforeseen Services': 'ROT Service (Unforeseen)', # Specific type by status
    'ROT Services - Downtime': 'ROT Service (Downtime)', # Specific type by status
    'ROT Services - Other': 'ROT Service (Other)', # Specific type
    'Port Services and Facilities': 'Port Service/Facility',
    'Marine Services and Facilities': 'Marine Service/Facility',
    'Intermodal Rail Services': 'Intermodal Rail Service',
    'Laytime': 'Laytime',
    'Contract of Carriage': 'Contract of Carriage',
    'International Maritime Transportation': 'International Maritime Transportation',
    'Domestic Maritime Transportation': 'Domestic Maritime Transportation',
    'Maritime Transport': 'Maritime Transport',
    'Maritime Shipping': 'Maritime Shipping',
    'Transportation between Mainland China and HK/Macau': 'Transportation (China Mainland-HK/Macau)',
    'International Container Transport': 'International Container Transport',
    'Inland Container Shipping': 'Inland Container Shipping',
    'Digitalization': 'Digitalization',
    'Digitalization Initiatives': 'Digitalization Initiative',
    'Blockchain Technology': 'Blockchain Technology',
    'Blockchain Cargo Release System': 'Blockchain Cargo Release System',
    'Automation': 'Automation', # General concept
    'Port Automation': 'Port Automation',
    'Yangshan Port Automation': 'Port Automation (Yangshan)',
    'Automation Benefits': 'Automation Benefit',
    'Automation Level at Yangshan Port': 'Automation Level (Yangshan)',
    'Automation at Yangshan Port': 'Automation (Yangshan)',
    'Automated Equipment Operations': 'Automated Equipment Operation',
    'Automated Cranes': 'Automated Crane',
    'Automated Guided Vehicles (AGVs)': 'Automated Guided Vehicle (AGV)',
    'Automated Machines': 'Automated Machine',
    'Intelligent Production Control System': 'Intelligent Production Control System',
    'Intelligent Production Management Control System': 'Intelligent Production Control System', # Variation normalization
    'Nextlogic': 'Nextlogic', # Specific System
    'APICS System': 'APICS System', # Specific System
    'TR02 System': 'TR02 System', # Specific System
    'PACT Ballast Water Treatment System': 'PACT Ballast Water Treatment System', # Specific System
    'Defect Reporting System': 'Defect Reporting System',
    'Advance Notification System': 'Advance Notification System',
    'Port Collection and Distribution System': 'Port Collection and Distribution System',
    'Safety Self-Check': 'Safety Self-Check',
    'Safety Self-Check Checklist': 'Safety Self-Check Checklist',
    'Safety Self-Check Requirement': 'Safety Self-Check Requirement',
    'SOLAS VI/2 Compliance': 'SOLAS VI/2 Compliance', # Specific Regulation Compliance
    'Customs Compliance': 'Customs Compliance',
    'Vessel Safety Verification': 'Vessel Safety Verification',
    'Defect Reporting Requirement': 'Defect Reporting Requirement',
    'Defect Reporting Protocol': 'Defect Reporting Protocol',
    'Failure to Report Defects': 'Failure to Report Defects',
    'Unreported Non-Compliance': 'Unreported Non-Compliance',
    'PSC Inspection Scheduling': 'PSC Inspection Scheduling',
    'PSC Boarding Inspections': 'PSC Boarding Inspection',
    'Rescheduled PSC Inspections': 'Rescheduled PSC Inspection',
    'Rescheduled PSC Inspection Initiative': 'Rescheduled PSC Inspection Initiative',
    'Ship Emergency Response': 'Ship Emergency Response',
    'Yangshan Port Emergency Response': 'Emergency Response (Yangshan)',
    'Machinery Failure Written Reports': 'Machinery Failure Report',
    'Machinery Failures Posing Safety Risks': 'Machinery Failure (Posing Safety Risk)',
    'Vessel Position Reports': 'Vessel Position Report',
    'Ship-Source Pollution': 'Ship-Source Pollution',
    'Ship Pollution Prevention and Control': 'Ship Pollution Prevention and Control',
    'Yangshan Port Environmental Compliance': 'Environmental Compliance (Yangshan)',
    'Port-Based BWT vs Shipboard Systems': 'Ballast Water Treatment (Port vs Ship)',
    'Port-Based BWT Facilities': 'Port-Based Ballast Water Treatment Facility',
    'Port-Based BWT Technology': 'Port-Based Ballast Water Treatment Technology',
    'Ballast Water Management Strategies': 'Ballast Water Management Strategy',
    'Cleaned Ballast Water': 'Cleaned Ballast Water',
    'PACT System Rapid Treatment': 'PACT System Rapid Treatment',
    'PACT System Performance': 'PACT System Performance',
    'PACT System Technology': 'PACT System Technology',
    'PACT System Certification': 'PACT System Certification',
    'PACT System Development': 'PACT System Development',
    'Land-Based Fumigation': 'Fumigation (Land-Based)',
    'Ship-Based Fumigation': 'Fumigation (Ship-Based)',
    'Fumigation Company': 'Fumigation Company',
    'Tug Request': 'Tug Request',

    # --- Documents and Data ---
    'Entry Summary Declaration (ENS)': 'Entry Summary Declaration (ENS)',
    'Single Administrative Document (SAD)': 'Single Administrative Document (SAD)',
    'SAD Copy 1': 'SAD Copy 1', # Specific document part
    'Dangerous Goods Declaration Form': 'Dangerous Goods Declaration', # Simplify
    'Hazardous Goods Declaration': 'Dangerous Goods Declaration', # Group Hazardous/Dangerous
    'Equipment Interchange Receipt (EIR)': 'Equipment Interchange Receipt (EIR)',
    'Equipment Interchange Receipt (EIR), Trucker': 'EIR/Trucker', # Compound Document/Role
    'Notice of Readiness (NOR)': 'Notice of Readiness (NOR)', # Implied from context
    'CUSTOMER, Ship, Notice of Readiness (NOR)': 'Customer/Vessel/NOR', # Compound Roles/Document
    'Public Licence': 'Public Licence',
    'Customs Guarantee': 'Customs Guarantee',
    'International Trade Declarations': 'International Trade Declaration',

    # --- Regulations and Policies ---
    'Regulations': 'Regulation',
    'Future Regulations': 'Regulation (Future)',
    'Regulations on International Maritime Transportation': 'Regulation (International Maritime Transportation)',
    'Maritime Transport Regulations': 'Regulation (Maritime Transport)',
    'Shanghai Municipality Ship Pollution Regulations': 'Regulation (Shanghai Ship Pollution)',
    'Marine Environment Protection Law': 'Marine Environment Protection Law',
    'Maritime Code': 'Maritime Code', # Specific Code
    'Cabotage Pilot Program': 'Cabotage Pilot Program',
    'Regional Cooperation Framework': 'Regional Cooperation Framework',
    'Regional Cooperation Mechanism': 'Regional Cooperation Framework', # Synonym normalization
    'Port Cooperation Agreements': 'Port Cooperation Agreement',
    'Regulatory Environment': 'Regulatory Environment',
    'Container Logistics Regulation': 'Container Logistics Regulation',
    'China\'s Shipping Regulation Approach': 'Shipping Regulation Approach (China)',
    'Yangshan Free Port Policies': 'Free Port Policy (Yangshan)',
    'Yangshan Free Port Status': 'Free Port Status (Yangshan)',
    'Yangshan Port Special Status': 'Special Status (Yangshan)',
    'Penalties for Chemical Ban Violations': 'Penalty (Chemical Ban Violation)',

    # --- Incoterms ---
    'FOB': 'FOB',
    'CFR': 'CFR',
    'CIF': 'CIF',
    'DPU': 'DPU',
    'CPT': 'CPT',
    'CIP': 'CIP',
    'DAP': 'DAP',
    'FAS': 'FAS',
    'FCA': 'FCA',
    'EXW': 'EXW',
    'DDP': 'DDP',

    # --- Metrics and Concepts ---
    'Rates': 'Rates',
    'Port Dues': 'Port Dues',
    'Dues': 'Dues',
    'Shipping Costs': 'Shipping Cost',
    'Container Storage Cost': 'Container Storage Cost',
    'Container Throughput Growth': 'Container Throughput Growth',
    'International Transshipment Volume': 'International Transshipment Volume',
    'Intermodal Rail Throughput': 'Intermodal Rail Throughput',
    'Intermodal Rail Throughput Growth': 'Intermodal Rail Throughput Growth',
    'Water-to-Water Transshipment Ratio': 'Water-to-Water Transshipment Ratio',
    'Yangshan Port Capacity': 'Port Capacity (Yangshan)',
    'Yangshan Port Competitiveness': 'Port Competitiveness (Yangshan)',
    'Yangshan Port Development': 'Port Development (Yangshan)',
    'Yangshan Port Expansion': 'Port Expansion (Yangshan)',
    'Yangshan Port Success Factors': 'Port Success Factors (Yangshan)',
    'Yangshan Port Operational Efficiency': 'Port Operational Efficiency (Yangshan)',
    'Yangshan Port Performance Ranking': 'Port Performance Ranking (Yangshan)',
    'CPPI Ranking': 'CPPI Ranking', # Specific Index
    'Yangshan Port Efficiency Benchmarks': 'Port Efficiency Benchmark (Yangshan)',
    'Yangshan Port Operational Model': 'Port Operational Model (Yangshan)',
    'Shanghai Port Connectivity': 'Port Connectivity (Shanghai)',
    'Shanghai Port Transshipment Role': 'Port Transshipment Role (Shanghai)',
    'Shanghai Port Transshipment Strategy': 'Port Transshipment Strategy (Shanghai)',
    'Yangshan Port Transshipment Business': 'Port Transshipment Business (Yangshan)',
    'Yangshan Port Transshipment Volume in 2024': 'Port Transshipment Volume (Yangshan 2024)',
    'Cargo Movement Efficiency': 'Cargo Movement Efficiency',
    'Logistical Efficiency': 'Logistical Efficiency',
    'Overall Supply Chain Network': 'Supply Chain Network',
    'Yangshan Port Cybersecurity': 'Cybersecurity (Yangshan)',
    'Yangshan Port Safety Systems': 'Safety System (Yangshan)',
    'General (Maritime Industry)': 'General (Maritime Industry)',
    'General (Shipping of Dangerous Goods)': 'General (Dangerous Goods Shipping)',
    'General (Shipping)': 'General (Shipping)',
    'General': 'General',
    'Authority Liability': 'Authority Liability',
    'Vessel Transit': 'Vessel Transit',

    # --- Compound Roles/Interactions (Kept specific) ---
    'Ship Masters, Agents': 'Master/Agent',
    'Shippers, Consignees, Banks': 'Shipper/Consignee/Bank',
    'Ships, Port Authorities, Immigration': 'Vessel/Port Authority/Immigration',
    'Ships, Port Authorities, Customs': 'Vessel/Port Authority/Customs',
    'Vessels, Vessel Traffic Service Center': 'Vessel/VTSC',
    'Harbour Master, Freeport Authority, Vessels': 'Harbour Master/Freeport Authority/Vessel',
    'Harbour Master, Vessels': 'Harbour Master/Vessel',
    'CUSTOMER, Ship Master, Shipping Agent': 'Customer/Master/Shipping Agent',
    'Vessel Master, Ship Agent': 'Master/Shipping Agent',
    'CUSTOMER, Ship': 'Customer/Vessel',
    'CUSTOMER, ROT Services': 'Customer/ROT Service',
    'CUSTOMER, Ship Master': 'Customer/Master',
    'Logistics Professional, Shipping Agent': 'Logistics Professional/Shipping Agent',
    'Importer, Exporter, Customs Broker': 'Importer/Exporter/Customs Broker',
    'CUSTOMER, ROT': 'Customer/ROT',
    'Trucker, Gate Attendant': 'Trucker/Gate Attendant',
    'Trucker, Crane Operator': 'Trucker/Crane Operator',
    'Trucker, Chassis': 'Trucker/Chassis',
    'Shipping Agent, RCT': 'Shipping Agent/RCT',
    'CUSTOMER, ROT Services - Downtime': 'Customer/ROT Service (Downtime)',
    'ROT Services, CUSTOMER': 'ROT Service/Customer',
    'Importer, Latvian Customs (VID)': 'Importer/Latvian Customs (VID)',
    'EU Businesses, Importers, Exporters': 'EU Business/Importer/Exporter',
    'CUSTOMER, Waste Management': 'Customer/Waste Management',
    'Latvian Customs (VID), Transit Cargo': 'Latvian Customs (VID)/Transit Cargo',
    'RCT, CUSTOMER, Customs': 'RCT/Customer/Customs Authority',
    'Laytime, Ship': 'Laytime/Vessel',
    'Port Master/Authority Officer/Police': 'Harbour Master/Authority Officer/Police',
    'Ships, Crew Members, Customs': 'Vessel/Crew Member/Customs Authority',
    'Customs Authorities, Importers/Exporters': 'Customs Authority/Importer/Exporter',
    'Shippers, Carriers, Port Authorities': 'Shipper/Carrier/Port Authority',
    'Carriers, Terminal Operators, IT Systems': 'Carrier/Terminal Operator/IT System',
    'Shippers, Terminal Operators': 'Shipper/Terminal Operator',
    'Carriers, Shipping Lines': 'Carrier/Shipping Line',
    'Carriers, Shippers, Terminal Operators': 'Carrier/Shipper/Terminal Operator',
    'Carriers, Terminal Operators, Stevedores': 'Carrier/Terminal Operator/Stevedore',
    'Carriers, Terminal Operators': 'Carrier/Terminal Operator',
    'Carriers, Terminal Operators, Shippers': 'Carrier/Terminal Operator/Shipper',
    'Consignees, Banks, Traders': 'Consignee/Bank/Trader',
    'Crew Members, Shipping Companies': 'Crew Member/Shipping Company',
    'Carriers, Shippers': 'Carrier/Shipper',
    'Port Authorities, Ship Agents': 'Port Authority/Shipping Agent',
    'Ship Suppliers, Shipping Companies': 'Ship Supplier/Shipping Company',
    'Ship Masters, Authorized Officers': 'Master/Authorized Officer',
    'Port Authorities, Technology Providers': 'Port Authority/Technology Provider',
    'Port Health Authorities, Ships': 'Port Health Authority/Vessel',
    'Global Shipping, Public Health Authorities': 'Global Shipping/Public Health Authority',
    'Ship Masters, Ship Agents': 'Master/Shipping Agent',
    'Ship Masters, Ship Surgeons': 'Master/Ship Surgeon',
    'Ship Masters, Health Authorities': 'Master/Health Authority',
    'Port Health Authorities, Public Health Organizations': 'Port Health Authority/Public Health Organization',
    'Ships, Port Health Authorities': 'Vessel/Port Health Authority',
    'Shippers, Carriers': 'Shipper/Carrier',
    'Shippers, Carriers, Consignees': 'Shipper/Carrier/Consignee',
    'Carriers and Terminals': 'Carrier/Terminal',
    'Export Container, Terminal Operator, Quay Cranes': 'Export Container/Terminal Operator/Quay Crane',
    'Import Container, Importer, Customs Authorities': 'Import Container/Importer/Customs Authority',
    'Import Container, Terminal Operator, Quay Cranes': 'Import Container/Terminal Operator/Quay Crane',
    'Trucker, BCT Gate': 'Trucker/BCT Gate',
    'Trucker, RCT Gate Access': 'Trucker/RCT Gate Access',
    'Importer, Customs Broker': 'Importer/Customs Broker',
    'Emergency Services, Berth Operator': 'Emergency Service/Berth Operator',
    'Exporter, Cultural Objects': 'Exporter (Cultural Objects)',
    'Importer, Food Products': 'Importer (Food Products)',
    'Importer, Hazardous Goods': 'Importer (Hazardous Goods)',
    'Every Vessel, Seafarers': 'Vessel/Seafarer',
    'Port Authorities, Governments': 'Port Authority/Government',
    'Developing Countries, WCO': 'Developing Country/WCO',
    'Businesses, Customs Administrations': 'Business/Customs Authority',
    'Customs Administrations, AEOs': 'Customs Authority/AEO',
    'Customs Administrations, Businesses': 'Customs Authority/Business',
    'Vessel Masters and Navigational Officers': 'Master/Navigational Officer',
    'Shanghai and Jiangsu Provinces': 'Shanghai/Jiangsu Province', # Geographic Entity
    'Contractor & HHLA Coordinator': 'Contractor/HHLA Coordinator',
    'Shipping Companies, Freight Forwarders, Cargo Owners': 'Shipping Company/Freight Forwarder/Cargo Owner',
    'Logistics Planners, Haulage Companies': 'Logistics Planner/Haulage Company',
    'Customers, Potentially Smaller Companies': 'Customer (Potentially Small)',
    'Truck Drivers, Smaller Haulage Companies': 'Trucker/Haulage Company (Small)',
    'Customers of EUROGATE': 'Customer (EUROGATE)',
    'Clients of HHLA': 'Customer (HHLA)',
    'Port Area Companies (Specific Types)': 'Port Area Company (Specific Types)',
    'Port Users (Witnessing/Causing Oil Spill)': 'Port User (Oil Spill Witness/Cause)',
    'Port Users & Port Authority (General Benefit of APICS)': 'Port User/Port Authority (APICS Benefit)',
    'Bunker Companies & Harbour Master\'s Office': 'Bunker Company/Harbour Master Office',
    'Port Users (General)': 'Port User', # Qualification grouping
    'Port Users (Violating Category 1 & 2 Violations)': 'Port User (Violation Cat 1/2)',

    # --- Remaining / Specific / Less Frequent ---
    'Shipping Industry in China': 'Shipping Industry (China)',
    'Shanghai International Shipping Center': 'Shanghai International Shipping Center',
    'International Operators': 'International Operator',
    'International Shipping Operators': 'International Shipping Operator',
    'Foreign Shipping Lines': 'Foreign Shipping Line',
    'Shipping Companies': 'Shipping Company',
    'Logistics Operations': 'Logistics Operation',
    'Port and Shipping Logistics': 'Port and Shipping Logistics',
    'Yangshan Free Port Area': 'Yangshan Free Port Area',
    'Container Storage': 'Container Storage',
    'Yangshan Port Facilities': 'Yangshan Port Facility',
    'Chassis Line': 'Chassis Line', # Specific term? Keep as is.
    'Equipment': 'Equipment',
    'Automated Guided Vehicles (AGVs)': 'Automated Guided Vehicle (AGV)', # Already covered, consistency check
    'Reach Stackers': 'Reach Stacker',
    'Quay Cranes, STS Cranes': 'Quay Crane/STS Crane',
    'RTG Cranes, Yard Cranes': 'RTG Crane/Yard Crane',
    'Terminal Tractors, Trailers': 'Terminal Tractor/Trailer',
    'Harbour Craft Design & Equipment': 'Harbour Craft Design/Equipment',
    'Participating Vessels in Cabotage Pilot': 'Vessel (Cabotage Pilot Participant)',
    'International Participation in Shipping': 'International Participation (Shipping)',
    'Party Requesting Change of Depositary (Paper Fallback)': 'Party (Requesting Depositary Change - Paper)',
    'Party Requesting Change of Depositary': 'Party (Requesting Depositary Change)',
    'Client/Deliverer (Hazardous Cargo IMO Class 1 & 7)': 'Client/Deliverer (Hazardous Cargo Class 1&7)',
    'Customer (Hazardous Cargo Shipper)': 'Customer (Hazardous Cargo Shipper)', # Keep specific context? Or group to Customer? Let's group. -> 'Customer'
    'Customer (Hazardous Cargo Shipper)': 'Customer', # Grouping
    'Client/Deliverer': 'Client/Deliverer',
    'Delivering Party': 'Delivering Party',
    'Customer/Deliverer': 'Customer/Deliverer',
    'Vessel Duty Officers': 'Vessel Duty Officer',
    'Human Personnel in Automated Operations': 'Human Personnel (Automated Operations)',
    'Manual Labor in Ports': 'Manual Labor (Ports)',
    'Human Personnel in Automated Ports': 'Human Personnel (Automated Ports)',
    'Businesses Importing to EU': 'Business (Importing to EU)',
    'Businesses Needing EORI Number': 'Business (Needing EORI)',
    'Businesses seeking AEO status': 'Business (Seeking AEO)',
    'Authorized Economic Operators (AEOs)': 'Authorized Economic Operator (AEO)',
    'Import-Export Enterprises': 'Import-Export Enterprise',
    'Humanitarian Aid Organizations': 'Humanitarian Aid Organization',
    'Humanitarian Aid Organizations, Importer': 'Humanitarian Aid Organization/Importer',
    'Logistics Professionals, Legal Teams': 'Logistics Professional/Legal Team',
    'Logistics Professional (Seeking Tariff Info)': 'Logistics Professional (Seeking Tariff)',
    'Logistics Professional (Cargo Agent)': 'Logistics Professional (Cargo Agent)',
    'Issuer of Receiving Note/Quay Order (Export Container)': 'Issuer (Receiving Note/Quay Order)',
    'Customer (Transhipment Container)': 'Customer', # Grouping
    'Customer (Container Subject to Customs Inspection)': 'Customer', # Grouping
    'Client (Shipping OOG Containers)': 'Customer', # Grouping
    'Customer (Requesting After-Hours Service)': 'Customer', # Grouping
    'Customer (Reefer Container User)': 'Customer', # Grouping
    'Client (Reefer Container User)': 'Customer', # Grouping
    'Importer/Agent': 'Importer/Agent',
    'Importer/Agent (Empty Containers)': 'Importer/Agent (Empty Containers)',
    'Importer/Customs Broker': 'Importer/Customs Broker',
    'Shippers, Consignees': 'Shipper/Consignee',
    'Vessel Operator': 'Vessel Operator',
    'Vessel Operators': 'Vessel Operator', # Plural normalization
    'Vessel Operator (Departing)': 'Vessel Operator', # State grouping
    'Vessel Operator (Arriving)': 'Vessel Operator', # State grouping
    'Vessel Operator (Transit)': 'Vessel Operator', # State grouping
    'Vessel Operator (Compulsory Pilotage)': 'Vessel Operator (Compulsory Pilotage)', # Keep specific context
    'Vessel Operator (Fire Incident)': 'Vessel Operator (Fire Incident)', # Keep specific event context
    'Vessel Operator (Departing without Clearance)': 'Vessel Operator (Departing without Clearance)', # Keep specific status context
    'High-Risk Vessel Operators': 'Vessel Operator (High Risk)',
    'Vessel Operators Seeking Reporting Guidance': 'Vessel Operator (Seeking Reporting Guidance)',
    'Inland Barge Operators': 'Inland Barge Operator',
    'Harbour Craft Operators': 'Harbour Craft Operator',
    'Offender (Direction Disobedience)': 'Offender (Direction Disobedience)',
    'Reporting Line': 'Reporting Line', # Technical term? Keep.
    'AIS-Equipped Vessels': 'Vessel (AIS Equipped)',
    'Haulage Company Collecting T1 Goods': 'Haulage Company (Collecting T1 Goods)',
    'Truck Driver Arriving Late': 'Trucker (Arriving Late)',
    'Party Conducting Activities on Sea-going Vessels': 'Party (Activities on Sea-going Vessel)',
    'Persons in Petroleumhaven': 'Person (In Petroleumhaven)',
    'Seafarers with Certificates of Competency': 'Seafarer (Certified)',
    'Pilotage Committee': 'Pilotage Committee',
    'Master of Vessel from Foreign Port': 'Master (From Foreign Port)',
    'All Parties in Transport Chain': 'Party (Transport Chain)',
    'All Parties in Container Release': 'Party (Container Release)',
}

# Apply manual mapping
for idx, row in normalization_df.iterrows():
    original_name = row['Entity Name']
    if original_name in manual_mapping:
        normalization_df.at[idx, 'Normalized Name'] = manual_mapping[original_name]
    
# Print normalization stats
original_count = len(normalization_df['Entity Name'].unique())
normalized_count = len(normalization_df['Normalized Name'].unique())
reduction = original_count - normalized_count

print(f"Original unique entities: {original_count}")
print(f"Normalized unique entities: {normalized_count}")
print(f"Reduction: {reduction} ({reduction/original_count:.1%} fewer unique entities)")

# Display updated mapping
display(normalization_df[['Entity Name', 'Count', 'Normalized Name']].sort_values('Count', ascending=False))

Original unique entities: 664
Normalized unique entities: 564
Reduction: 100 (15.1% fewer unique entities)


Unnamed: 0,Entity Name,Count,Normalized Name
0,Seller,116,Seller
1,Buyer,107,Buyer
2,Authority,22,Authority
3,"Ship Masters, Agents",22,Master/Agent
4,General,21,General
...,...,...,...
328,Importer/Exporter,1,Importer/Exporter
327,Importer/Customs Broker,1,Importer/Customs Broker
326,Dutch Customs,1,Dutch Customs
325,Customs/Importer,1,Customs Authority/Importer


## 3. Review and Export Normalization Mapping

In [9]:
# Export the normalized mapping
import os
import json

# Create the System directory if it doesn't exist
os.makedirs("../Data/System", exist_ok=True)

# Export the mapping as JSON
json_path = "../Data/System/entity_normalization_mapping.json"
with open(json_path, 'w') as f:
    json.dump(manual_mapping, f, indent=2)
print(f"Mapping exported to {json_path}")

# Export normalized table as CSV for reference
csv_path = "../Data/System/entity_normalization_table.csv"
normalization_df[['Entity Name', 'Count', 'Normalized Name']].to_csv(csv_path, index=False)
print(f"Normalization table exported to {csv_path}")

# Create a markdown summary for documentation
md_path = "../Data/System/entity_normalization_summary.md"
with open(md_path, 'w') as f:
    f.write("# Entity Name Normalization Summary\n\n")
    f.write(f"Original unique entities: {original_count}\n\n")
    f.write(f"Normalized unique entities: {normalized_count}\n\n")
    f.write(f"Reduction: {reduction} ({reduction/original_count:.1%} fewer unique entities)\n\n")
    f.write("## Top 20 most frequent entities\n\n")
    f.write("| Original Entity | Count | Normalized Entity |\n")
    f.write("|----------------|-------|-------------------|\n")
    
    for _, row in normalization_df.sort_values('Count', ascending=False).head(20).iterrows():
        f.write(f"| {row['Entity Name']} | {row['Count']} | {row['Normalized Name']} |\n")
    
    f.write("\n\nSee complete mapping in `entity_normalization_table.csv`")
    
print(f"Summary report exported to {md_path}")

Mapping exported to ../Data/System/entity_normalization_mapping.json
Normalization table exported to ../Data/System/entity_normalization_table.csv
Summary report exported to ../Data/System/entity_normalization_summary.md


## 4. Apply Normalization to All JSON Files

In [10]:
# Apply normalization to JSON files but keep originals intact

import os
import json
import shutil
from datetime import datetime
from pathlib import Path

# Create a new directory for normalized datapoints
normalized_base_dir = "../Data/Datapoints"
os.makedirs(normalized_base_dir, exist_ok=True)
print(f"Created directory for normalized datapoints: {normalized_base_dir}")

Created directory for normalized datapoints: ../Data/Datapoints


In [11]:
# Find all JSON datapoint files
def find_datapoint_files(base_dir="../Data/Ports"):
    datapoint_files = []
    for root, _, files in os.walk(base_dir):
        for file in files:
            if file.endswith("_Datapoints.json"):
                datapoint_files.append(os.path.join(root, file))
    return datapoint_files

datapoint_files = find_datapoint_files()
print(f"Found {len(datapoint_files)} datapoint files to process")

# Stats tracking
files_processed = 0
total_changes = 0
entity_changes = {}  # Track each unique entity change

# Process each file
for file_path in datapoint_files:
    try:
        # Read the original file
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # Determine target path in normalized directory
        rel_path = os.path.relpath(file_path, "../Data/Ports")
        normalized_path = os.path.join(normalized_base_dir, rel_path)
        
        # Create subdirectories if needed
        os.makedirs(os.path.dirname(normalized_path), exist_ok=True)
        
        file_changes = 0
        
        # Process each datapoint
        for item in data:
            if isinstance(item, dict) and 'relevant_entity' in item and item['relevant_entity']:
                original = item['relevant_entity']
                if original in manual_mapping:
                    normalized = manual_mapping[original]
                    if normalized != original:
                        # Update the entity name
                        item['relevant_entity'] = normalized
                        file_changes += 1
                        
                        # Track the change
                        change_key = f"{original} → {normalized}"
                        entity_changes[change_key] = entity_changes.get(change_key, 0) + 1
        
        # Save the normalized file
        with open(normalized_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
        
        files_processed += 1
        total_changes += file_changes
        
        if file_changes > 0:
            print(f"Normalized {file_changes} entities in {os.path.basename(file_path)}")
        
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Display summary statistics
print("\n--- Normalization Complete ---")
print(f"Files processed: {files_processed}")
print(f"Files with entity changes: {sum(1 for k,v in entity_changes.items() if v > 0)}")
print(f"Total entity name changes: {total_changes}")
print(f"Normalized files saved to: {normalized_base_dir}")

Found 8 datapoint files to process
Normalized 35 entities in Rotterdam_Datapoints.json
Normalized 180 entities in Yangshan_Datapoints.json
Normalized 41 entities in Hamburg_Datapoints.json
Normalized 37 entities in Antwerp_Datapoints.json
Normalized 122 entities in Singapore_Datapoints.json
Normalized 103 entities in Riga_Datapoints.json
Normalized 153 entities in IMO_Datapoints.json

--- Normalization Complete ---
Files processed: 8
Files with entity changes: 433
Total entity name changes: 671
Normalized files saved to: ../Data/Datapoints


In [12]:
# Create a report of changes made
report_path = "../Data/System/entity_normalization_changes.md"
with open(report_path, 'w') as f:
    f.write("# Entity Name Normalization Changes\n\n")
    f.write(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    f.write(f"**Files processed:** {files_processed}\n\n")
    f.write(f"**Files with entity changes:** {sum(1 for k,v in entity_changes.items() if v > 0)}\n\n")
    f.write(f"**Total entity name changes:** {total_changes}\n\n")
    f.write(f"**Normalized files location:** {normalized_base_dir}\n\n")
    
    f.write("## Changes Made\n\n")
    f.write("| Original Entity | Normalized Entity | Count |\n")
    f.write("|----------------|-------------------|-------|\n")
    
    # Sort changes by count (most frequent first)
    sorted_changes = sorted(entity_changes.items(), key=lambda x: x[1], reverse=True)
    for change, count in sorted_changes:
        if count > 0:  # Only show actual changes
            original, normalized = change.split(" → ")
            f.write(f"| {original} | {normalized} | {count} |\n")

print(f"Change report saved to {report_path}")

# Create a README file explaining the normalized directory
readme_path = os.path.join(normalized_base_dir, "README.md")
with open(readme_path, 'w') as f:
    f.write("# Normalized Datapoints\n\n")
    f.write("This directory contains normalized versions of the original datapoint files.\n\n")
    f.write("## Normalization Process\n\n")
    f.write(f"- **Date of normalization:** {datetime.now().strftime('%Y-%m-%d')}\n")
    f.write(f"- **Total files normalized:** {files_processed}\n")
    f.write(f"- **Entity name changes made:** {total_changes}\n\n")
    f.write("## Directory Structure\n\n")
    f.write("The directory structure mirrors the original structure in `Data/Ports/`, but with normalized entity names.\n\n")
    f.write("## Reference\n\n")
    f.write("See `Data/System/entity_normalization_changes.md` for details on specific changes made.\n")
    f.write("The normalization mapping is available at `Data/System/entity_normalization_mapping.json`.\n")

print(f"Created README at {readme_path}")

# Provide instructions for further use
print("\n--- Next Steps ---")
print("1. Review the normalized files in the new directory")
print("2. If everything looks good, you can use these normalized files for your vector database")
print("3. The original files remain untouched in their original location")
print(f"4. See change report at {report_path} for a detailed breakdown of normalizations")

Change report saved to ../Data/System/entity_normalization_changes.md
Created README at ../Data/Datapoints/README.md

--- Next Steps ---
1. Review the normalized files in the new directory
2. If everything looks good, you can use these normalized files for your vector database
3. The original files remain untouched in their original location
4. See change report at ../Data/System/entity_normalization_changes.md for a detailed breakdown of normalizations
