In [None]:
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm  # specific for progress bars

# 1. SETUP: The Mapping Dictionary
# Maps each raw country-name variant to the single canonical name it is
# counted under. Names that are not listed here are counted as-is.
# Compiling this list by hand was a long and tiring job.
COUNTRY_MAPPING = {
    "Turkey": "Türkiye",
    "Turkiye": "Türkiye",
    "UK": "United Kingdom",
    "U.S.A.": "United States",
    "U.S.": "United States",
    "US": "United States",
    "United States of America": "United States",
    "USA": "United States",
    "the United States": "United States",
    "Republic of Turkey": "Türkiye",
    "the Republic of Turkey": "Türkiye",
    "Republic of Türkiye": "Türkiye",
    "Turkish Republic of Northern Cyprus (TRNC)": "Northern Cyprus",
    "TRNC": "Northern Cyprus",
    "Turkish Republic of Northern Cyprus": "Northern Cyprus",
    # Both the typographic (’) and the plain (') apostrophe spellings are listed.
    "People’s Republic of China": "China",
    "People's Republic of China": "China",
    "Republic of China": "Taiwan",
    "Republic of China (Taiwan)": "Taiwan",
    "China (Taiwan)": "Taiwan",
    "Islamic Republic of Iran": "Iran",
    "Islamic Republic of Afghanistan": "Afghanistan",
    "State of Palestine": "Palestine",
    "Palestinian territories": "Palestine",
    "Palestinian": "Palestine",
    "Syrian Arab Republic": "Syria",
    "UAE": "United Arab Emirates",
    "United Arab Emirates (UAE)": "United Arab Emirates",
    "Republic of Sudan": "Sudan",
    # NOTE(review): the Democratic Republic of the Congo and the Republic of
    # the Congo are two different states, yet both are merged under "Congo"
    # here - confirm that this is intended.
    "Democratic Republic of the Congo": "Congo",
    "Democratic Republic of Congo": "Congo",
    "Democratic Republic of Congo (DRC)": "Congo",
    "Republic of Congo": "Congo",
    "Russian Federation": "Russia",
    "Republic of Armenia": "Armenia",
    "Republic of Azerbaijan": "Azerbaijan",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Republic of Tajikistan": "Tajikistan",
    "Republic of Kazakhstan": "Kazakhstan",
    "Republic of Uzbekistan": "Uzbekistan",
    "Federal Republic of Germany": "Germany",
    "Czech Republic": "Czechia",
    "Democratic People’s Republic of Korea (DPRK)": "North Korea",
    "Democratic People’s Republic of Korea": "North Korea",
    "Democratic People's Republic of Korea": "North Korea",
    "Republic of Korea": "South Korea",
    "Republic of Korea (ROK)": "South Korea",
    "the Republic of Korea": "South Korea",
    "ROK": "South Korea",
    "Korea": "South Korea",
    "the Netherlands": "Netherlands",
    "Kingdom of Saudi Arabia": "Saudi Arabia",
    "Macedonia": "North Macedonia",
    "Bosnia": "Bosnia and Herzegovina",
    "Bosnia-Herzegovina": "Bosnia and Herzegovina",
    "Federation of Bosnia and Herzegovina": "Bosnia and Herzegovina",
    "Federation of BiH": "Bosnia and Herzegovina",
    "Federation of Bosnia and Herzegovina (FBiH)": "Bosnia and Herzegovina",
    "BiH": "Bosnia and Herzegovina",
    "Republic of Moldova": "Moldova",
    "Republic of Macedonia": "North Macedonia",
    "Slovak Republic": "Slovakia",
    "Republic of Slovenia": "Slovenia",
    "Great Britain": "United Kingdom",
    "Britain": "United Kingdom",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "The Netherlands": "Netherlands",
    "Brunei Darussalam": "Brunei",
    # "Dominica" is deliberately NOT mapped: it is a separate country from the
    # Dominican Republic, so the two must not share one count.
    "Republic of Equatorial Guinea": "Equatorial Guinea",

    # Add any new mappings you find here
}

# 2. CONFIGURATION
ROOT_DIRECTORY = 'downloaded_data'  # The folder containing CHN_exec, etc.
cumulative_counts = Counter()       # Running tally of country mentions

# 3. LOCATE FILES
# Visit every directory below ROOT_DIRECTORY and record the path of its
# "news.jsonl" file when it has one.
files_to_process = []
for dirpath, _subdirs, filenames in os.walk(ROOT_DIRECTORY):
    if "news.jsonl" in filenames:
        files_to_process.append(os.path.join(dirpath, "news.jsonl"))

print(f"Found {len(files_to_process)} news files to process.")

# 4. PROCESS FILES
# Read every located file, normalize each listed country name through
# COUNTRY_MAPPING and add it to cumulative_counts.
for file_path in tqdm(files_to_process, desc="Processing Files"):
    # Only the read is guarded, so a problem in one row can no longer abort
    # a file that has already been partially counted.
    try:
        # Each line of the file is parsed as one JSON record
        df = pd.read_json(file_path, lines=True)
    except ValueError:
        print(f"Skipping {file_path}: Invalid JSON format.")
        continue
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        continue

    # Files without an 'entities' column contribute nothing
    if 'entities' not in df.columns:
        continue

    for entities_dict in df['entities']:
        # Rows whose 'entities' value is not a dictionary are skipped
        if not isinstance(entities_dict, dict):
            continue

        countries_list = entities_dict.get('countries')
        # A null value would raise on iteration and a bare string would be
        # counted character by character, so only real sequences are accepted.
        if not isinstance(countries_list, (list, tuple)):
            continue

        # Normalize and count; non-string entries are ignored
        cumulative_counts.update(
            COUNTRY_MAPPING.get(country, country)
            for country in countries_list
            if isinstance(country, str)
        )

# 5. OUTPUT RESULTS
print("\n--- Cumulative Frequency Results ---")

# Order the (country, count) pairs from the most to the least frequent;
# pairs with equal counts keep the order in which they were first counted.
sorted_counts = sorted(
    cumulative_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

for name, total in sorted_counts:
    print(f"- {name}: {total}")

# Optional: Save to a CSV file for safekeeping
# pd.DataFrame(sorted_counts, columns=['Country', 'Count']).to_csv("total_country_counts.csv", index=False)

Found 248 news files to process.


Processing Files: 100%|██████████| 248/248 [00:43<00:00,  5.75it/s]


--- Cumulative Frequency Results ---
- United States: 172622
- Russia: 113772
- United Kingdom: 62493
- China: 61547
- Ukraine: 61369
- Kyrgyzstan: 53538
- New Zealand: 49726
- South Africa: 47001
- Syria: 42274
- Germany: 42221
- India: 42127
- Türkiye: 40479
- France: 39649
- Colombia: 38816
- Japan: 36432
- Philippines: 35095
- Iran: 30670
- Israel: 30483
- Canada: 30385
- Hungary: 30347
- South Korea: 30272
- Uruguay: 29630
- United Arab Emirates: 29419
- Spain: 27840
- Italy: 26604
- Indonesia: 26140
- Afghanistan: 25440
- Brazil: 25288
- Georgia: 24210
- Iraq: 24202
- Mexico: 24107
- Ecuador: 23732
- Venezuela: 23362
- Slovakia: 23318
- Bosnia and Herzegovina: 22401
- Peru: 22092
- Equatorial Guinea: 21584
- Cuba: 21419
- Australia: 21100
- Egypt: 20109
- Guatemala: 20004
- Argentina: 18771
- Panama: 18621
- Qatar: 18419
- Romania: 18327
- Netherlands: 18075
- Armenia: 17957
- Poland: 17951
- Azerbaijan: 17536
- Kazakhstan: 17262
- Serbia: 17242
- Pakistan: 17085
- Jordan: 17033




To continue with the next steps, I discard the countries whose frequency is below 1000.

In [5]:
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm

# 1. SETUP: The Updated Mapping Dictionary (From your latest file)
# Maps each raw country-name variant to the single canonical name it is
# counted under. Names that are not listed here are counted as-is.
COUNTRY_MAPPING = {
    "Turkey": "Türkiye",
    "Turkiye": "Türkiye",
    "UK": "United Kingdom",
    "U.S.A.": "United States",
    "U.S.": "United States",
    "US": "United States",
    "United States of America": "United States",
    "USA": "United States",
    "the United States": "United States",
    "Republic of Turkey": "Türkiye",
    "the Republic of Turkey": "Türkiye",
    "Republic of Türkiye": "Türkiye",
    "Turkish Republic of Northern Cyprus (TRNC)": "Northern Cyprus",
    "TRNC": "Northern Cyprus",
    "Turkish Republic of Northern Cyprus": "Northern Cyprus",
    # Both the typographic (’) and the plain (') apostrophe spellings are listed.
    "People’s Republic of China": "China",
    "People's Republic of China": "China",
    "Republic of China": "Taiwan",
    "Republic of China (Taiwan)": "Taiwan",
    "China (Taiwan)": "Taiwan",
    "Islamic Republic of Iran": "Iran",
    "Islamic Republic of Afghanistan": "Afghanistan",
    "State of Palestine": "Palestine",
    "Palestinian territories": "Palestine",
    "Palestinian": "Palestine",
    "Syrian Arab Republic": "Syria",
    "UAE": "United Arab Emirates",
    "United Arab Emirates (UAE)": "United Arab Emirates",
    "Republic of Sudan": "Sudan",
    # NOTE(review): the Democratic Republic of the Congo and the Republic of
    # the Congo are two different states, yet both are merged under "Congo"
    # here - confirm that this is intended.
    "Democratic Republic of the Congo": "Congo",
    "Democratic Republic of Congo": "Congo",
    "Democratic Republic of Congo (DRC)": "Congo",
    "Republic of Congo": "Congo",
    "Russian Federation": "Russia",
    "Republic of Armenia": "Armenia",
    "Republic of Azerbaijan": "Azerbaijan",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Republic of Tajikistan": "Tajikistan",
    "Republic of Kazakhstan": "Kazakhstan",
    "Republic of Uzbekistan": "Uzbekistan",
    "Federal Republic of Germany": "Germany",
    "Czech Republic": "Czechia",
    "Democratic People’s Republic of Korea (DPRK)": "North Korea",
    "Democratic People’s Republic of Korea": "North Korea",
    "Democratic People's Republic of Korea": "North Korea",
    "Republic of Korea": "South Korea",
    "Republic of Korea (ROK)": "South Korea",
    "the Republic of Korea": "South Korea",
    "ROK": "South Korea",
    "Korea": "South Korea",
    "the Netherlands": "Netherlands",
    "Kingdom of Saudi Arabia": "Saudi Arabia",
    "Macedonia": "North Macedonia",
    "Bosnia": "Bosnia and Herzegovina",
    "Bosnia-Herzegovina": "Bosnia and Herzegovina",
    "Federation of Bosnia and Herzegovina": "Bosnia and Herzegovina",
    "Federation of BiH": "Bosnia and Herzegovina",
    "Federation of Bosnia and Herzegovina (FBiH)": "Bosnia and Herzegovina",
    "BiH": "Bosnia and Herzegovina",
    "Republic of Moldova": "Moldova",
    "Republic of Macedonia": "North Macedonia",
    "Slovak Republic": "Slovakia",
    "Republic of Slovenia": "Slovenia",
    "Great Britain": "United Kingdom",
    "Britain": "United Kingdom",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "The Netherlands": "Netherlands",
    "Brunei Darussalam": "Brunei",
    # "Dominica" is deliberately NOT mapped: it is a separate country from the
    # Dominican Republic, so the two must not share one count.
    "Republic of Equatorial Guinea": "Equatorial Guinea",
}

# 2. CONFIGURATION
ROOT_DIRECTORY = 'downloaded_data'
MIN_FREQUENCY = 1000   # Countries counted fewer times than this are dropped
OUTPUT_FILE = 'filtered_country_counts.csv'

cumulative_counts = Counter()  # Running tally of country mentions

# 3. LOCATE FILES
# Visit every directory below ROOT_DIRECTORY and record the path of its
# "news.jsonl" file when it has one.
files_to_process = []
for dirpath, _subdirs, filenames in os.walk(ROOT_DIRECTORY):
    if "news.jsonl" in filenames:
        files_to_process.append(os.path.join(dirpath, "news.jsonl"))

print(f"Found {len(files_to_process)} news files to process.")

# 4. PROCESS FILES
# Read every located file, normalize each listed country name through
# COUNTRY_MAPPING and add it to cumulative_counts.
for file_path in tqdm(files_to_process, desc="Processing Files"):
    # Only the read is guarded, so a problem in one row can no longer abort
    # a file that has already been partially counted.
    try:
        # Each line of the file is parsed as one JSON record
        df = pd.read_json(file_path, lines=True)
    except ValueError:
        # Report the skipped file instead of hiding it: a silently dropped
        # file would make the saved totals incomplete without any trace.
        print(f"Skipping {file_path}: Invalid JSON format.")
        continue
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        continue

    # Files without an 'entities' column contribute nothing
    if 'entities' not in df.columns:
        continue

    for entities_dict in df['entities']:
        # Rows whose 'entities' value is not a dictionary are skipped
        if not isinstance(entities_dict, dict):
            continue

        countries_list = entities_dict.get('countries')
        # A null value would raise on iteration and a bare string would be
        # counted character by character, so only real sequences are accepted.
        if not isinstance(countries_list, (list, tuple)):
            continue

        # Normalize using the updated mapping; non-string entries are ignored
        cumulative_counts.update(
            COUNTRY_MAPPING.get(country, country)
            for country in countries_list
            if isinstance(country, str)
        )

# 5. FILTER AND SAVE
print("\n--- Filtering and Saving ---")

# Filter: keep only countries counted at least MIN_FREQUENCY times
filtered_data = {
    country: count
    for country, count in cumulative_counts.items()
    if count >= MIN_FREQUENCY
}

# Sort highest to lowest; equal counts keep their original relative order
sorted_filtered_data = dict(
    sorted(filtered_data.items(), key=lambda item: item[1], reverse=True)
)

if not sorted_filtered_data:
    # Report the configured threshold rather than a hard-coded number, so the
    # message stays correct when MIN_FREQUENCY is changed.
    print(f"No countries found with frequency >= {MIN_FREQUENCY}.")
else:
    # Convert to a two-column DataFrame and save it to OUTPUT_FILE as CSV
    df_result = pd.DataFrame(
        list(sorted_filtered_data.items()),
        columns=['Country', 'Frequency'],
    )
    df_result.to_csv(OUTPUT_FILE, index=False)

    print(f"Success! Saved {len(df_result)} countries to '{OUTPUT_FILE}'.")
    print(df_result.head(10))  # Preview top 10

Found 248 news files to process.


Processing Files: 100%|██████████| 248/248 [00:42<00:00,  5.84it/s]


--- Filtering and Saving ---
Success! Saved 189 countries to 'filtered_country_counts.csv'.
          Country  Frequency
0   United States     172622
1          Russia     113772
2  United Kingdom      62493
3           China      61547
4         Ukraine      61369
5      Kyrgyzstan      53538
6     New Zealand      49726
7    South Africa      47001
8           Syria      42274
9         Germany      42221



