In [18]:
from collections import Counter
from pathlib import Path

import pandas as pd

In [57]:

# Country-name standardization table.
# Written "canonical name -> every variant spelling seen in the data" so that
# all aliases of one country sit together; it is inverted below into the flat
# variant -> canonical lookup used for counting.
_VARIANTS_BY_CANONICAL_NAME = {
    "Türkiye": (
        "Turkey",
        "Turkiye",
        "Republic of Turkey",
        "the Republic of Turkey",
        "Republic of Türkiye",
    ),
    "United Kingdom": ("UK",),
    "United States": (
        "U.S.A.",
        "U.S.",
        "US",
        "United States of America",
        "USA",
        "the United States",
    ),
    "Northern Cyprus": (
        "Turkish Republic of Northern Cyprus (TRNC)",
        "TRNC",
        "Turkish Republic of Northern Cyprus",
    ),
    # Both the typographic (’) and the ASCII (') apostrophe occur in the data.
    "China": (
        "People’s Republic of China",
        "People's Republic of China",
    ),
    "Iran": ("Islamic Republic of Iran",),
    "Afghanistan": ("Islamic Republic of Afghanistan",),
    "Palestine": (
        "State of Palestine",
        "Palestinian territories",
        "Palestinian",
    ),
    "United Arab Emirates": (
        "UAE",
        "United Arab Emirates (UAE)",
    ),
    "Sudan": ("Republic of Sudan",),
    "Russia": ("Russian Federation",),
    "Kyrgyzstan": ("Kyrgyz Republic",),
    "Germany": ("Federal Republic of Germany",),
    "North Korea": (
        "Democratic People’s Republic of Korea (DPRK)",
        "Democratic People’s Republic of Korea",
        "Democratic People's Republic of Korea",
    ),
    "South Korea": (
        "Republic of Korea",
        "the Republic of Korea",
        "Korea",
    ),
    "Netherlands": ("the Netherlands",),
    "Saudi Arabia": ("Kingdom of Saudi Arabia",),
    "North Macedonia": (
        "Macedonia",
        "Republic of Macedonia",
    ),
    # ... add more canonical names / variants as you discover variations
}

# Key: the variant name found in the data.
# Value: the standardized name to be used for counting.
COUNTRY_MAPPING = {
    variant: canonical_name
    for canonical_name, variants in _VARIANTS_BY_CANONICAL_NAME.items()
    for variant in variants
}

file_path = 'tur.jsonl'

# 1. Read the JSONL file (one JSON record per line) into a pandas DataFrame.
# The path is checked explicitly instead of relying on read_json to raise
# FileNotFoundError, and the failure is raised rather than passed to exit():
# inside a notebook exit() asks the kernel to shut down, which discards all
# state instead of just reporting the problem in this cell.
data_path = Path(file_path)
if not data_path.is_file():
    raise FileNotFoundError(f"Error: The file '{file_path}' was not found.")
df = pd.read_json(data_path, lines=True)

# 2. Count every country mention under its standardized name.
# Counter is a dict subclass, so country_frequency_dict can still be used
# anywhere a plain {name: count} dict is expected.
country_frequency_dict = Counter()
skipped_records = 0

for entities_dict in df['entities']:
    # A record without an 'entities' object arrives as NaN/None, and an
    # 'entities' object may lack the 'countries' key; skip (and tally) those
    # records instead of aborting the whole run on the first one.
    countries_list = (
        entities_dict.get('countries') if isinstance(entities_dict, dict) else None
    )
    if not isinstance(countries_list, list):
        skipped_records += 1
        continue

    # 3. Normalize each name via the mapping; names that have no entry in
    # COUNTRY_MAPPING are counted unchanged.
    country_frequency_dict.update(
        COUNTRY_MAPPING.get(country, country) for country in countries_list
    )

# --- Output the results ---
print(f"Successfully processed {len(df)} records from '{file_path}'.")
if skipped_records:
    print(f"Skipped {skipped_records} records without a usable 'countries' list.")
print("\nNormalized Country Frequency Dictionary:")

# most_common() orders by descending count and keeps first-seen order among
# ties, which is the same ordering a stable sorted(..., reverse=True) yields.
sorted_counts = dict(country_frequency_dict.most_common())
for country, count in sorted_counts.items():
    print(f"- {country}: {count}")



Successfully processed 5673 records from 'tur.jsonl'.

Normalized Country Frequency Dictionary:
- Türkiye: 3902
- Greece: 328
- United States: 325
- Israel: 313
- Syria: 311
- Iraq: 293
- Afghanistan: 267
- Pakistan: 195
- Russia: 178
- Palestine: 175
- Libya: 161
- Azerbaijan: 161
- Germany: 160
- Egypt: 148
- Cyprus: 148
- Iran: 146
- Somalia: 145
- France: 136
- Northern Cyprus: 130
- Ukraine: 129
- Yemen: 113
- Armenia: 105
- Saudi Arabia: 99
- Nigeria: 81
- Italy: 74
- Qatar: 73
- United Kingdom: 72
- Lebanon: 64
- Kazakhstan: 61
- Sudan: 56
- Netherlands: 55
- India: 54
- China: 53
- Mali: 53
- Kyrgyzstan: 51
- Indonesia: 49
- United Arab Emirates: 48
- Bosnia and Herzegovina: 48
- Georgia: 47
- Serbia: 47
- Austria: 45
- North Macedonia: 44
- Spain: 43
- Jordan: 43
- Switzerland: 43
- Belgium: 41
- Japan: 40
- Sweden: 40
- Uzbekistan: 40
- Kosovo: 38
- Tunisia: 37
- South Korea: 37
- Poland: 36
- Denmark: 36
- Burkina Faso: 35
- Romania: 33
- Finland: 33
- Brazil: 32
- Ethiopia: