# UN Country/Region Data Cleaning

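This notebook loads the UN Methodology (M49) dataset and writes a cleaned CSV with three columns: `country_name`, `country_code` (the ISO 3166-1 alpha-3 code), and `region`. The region is chosen by fallback, from most to least specific: Intermediate Region Name if present, else Sub-region Name, else Region Name. Rows without an alpha-3 code are dropped, duplicate codes are removed, and the result is sorted by `country_code`.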

In [17]:
import pandas as pd

In [None]:
# Load the UN Methodology file (semicolon-separated).
# pandas' default NA parsing converts the literal string "NA" to NaN, which
# would silently blank out a legitimate code such as Namibia's ISO alpha-2
# code "NA" in any code column the file carries. Restrict missing-value
# detection to genuinely empty fields so such codes survive the load.
un_m49_cleaned = pd.read_csv(
    'UNSD — Methodology.csv',
    delimiter=';',
    keep_default_na=False,
    na_values=[''],
)

In [19]:
# Function to select region with backoff
def select_region(row):
    """Return the most specific region name available for one row.

    The columns are tried from most to least specific: 'Intermediate Region
    Name', then 'Sub-region Name'. The first non-null value wins. If both are
    null, the 'Region Name' value is returned as-is (it may itself be null).

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the three region column names, e.g. a row
        passed by ``DataFrame.apply(..., axis=1)``.
    """
    for column in ('Intermediate Region Name', 'Sub-region Name'):
        candidate = row[column]
        if pd.notnull(candidate):
            return candidate
    # Last resort: the broadest level, returned without a null check.
    return row['Region Name']

In [None]:
# Build the cleaned country table in a single chain from the loaded frame:
# keep the name, code and region-level columns, derive `region` row by row,
# rename to the output schema, then filter, de-duplicate and sort.
cleaned = (
    un_m49_cleaned[
        ['Country or Area', 'ISO-alpha3 Code', 'Intermediate Region Name', 'Sub-region Name', 'Region Name']
    ]
    .assign(region=lambda frame: frame.apply(select_region, axis=1))
    .rename(columns={'Country or Area': 'country_name', 'ISO-alpha3 Code': 'country_code'})
    .loc[:, ['country_name', 'country_code', 'region']]
    .dropna(subset=['country_code'])  # rows without a country code are discarded
    .drop_duplicates(subset=['country_code'])  # keep the first row per code
    .sort_values('country_code')
)
cleaned.head()

Unnamed: 0,country_name,country_code,region
62,Aruba,ABW,Caribbean
141,Afghanistan,AFG,Southern Asia
29,Angola,AGO,Middle Africa
60,Anguilla,AIA,Caribbean
178,Åland Islands,ALA,Northern Europe


In [None]:
# Write the cleaned table to disk; the DataFrame index is not written,
# so the file contains only the columns of `cleaned`.
cleaned.to_csv(
    path_or_buf='un_m49_cleaned_cleaned.csv',
    index=False,
)