In [16]:
import pandas as pd
import re
import country_converter as coco
import numpy as np

# ==========================================
# 1. SETUP & DATA ACQUISITION
# ==========================================

# Workbook exported from the UNCTAD ISDS Navigator (the file name carries the
# 31 December 2023 cut-off date).
path = "UNCTAD-ISDS-Navigator-data-set-31December2023.xlsx"
# header=11: column names are read from the row at 0-based index 11; the rows
# above it are not used as data.
raw_df = pd.read_excel(path, header=11)

# Source column -> analysis column. The dict's insertion order also defines
# the column order of `df`, because the same mapping drives the selection.
COLUMN_RENAMES = {
    "RESPONDENT STATE": "respondent_state",
    "HOME STATE OF INVESTOR": "investor_home_state",
    "ECONOMIC SECTOR": "economic_sector",
    "ECONOMIC SUBSECTOR": "economic_subsector",
    "SUMMARY OF THE DISPUTE": "summary_of_dispute",
    "AMOUNT CLAIMED (EXPRESSED IN MILLIONS)": "amount_claimed_raw",
    "YEAR OF INITIATION": "year",
}

# Keep only the mapped columns (KeyError if any is missing), then rename them.
df = raw_df[list(COLUMN_RENAMES)].rename(columns=COLUMN_RENAMES)

# 2. FEATURE EXTRACTION & CLEANING

def parse_claim_amount(x):
    """Pull the USD figure out of a raw "amount claimed" cell.

    Thousands separators (commas) are removed first. A parenthesised USD
    amount such as ``"(120.30 USD)"`` takes precedence; otherwise the first
    number directly followed by ``USD`` is used.

    Parameters
    ----------
    x : Any
        Raw cell value; it is stringified before matching.

    Returns
    -------
    float or None
        The matched number as a float, or None when the value is missing or
        contains no USD amount.
    """
    if pd.isna(x):
        return None

    text = str(x).replace(",", "")

    # Ordered by priority: parenthesised USD amount first, bare amount second.
    usd_patterns = (
        r"\(\s*([0-9]+(?:\.[0-9]+)?)\s*USD\s*\)",
        r"([0-9]+(?:\.[0-9]+)?)\s*USD\b",
    )
    for pattern in usd_patterns:
        hit = re.search(pattern, text)
        if hit is not None:
            return float(hit.group(1))

    return None

# Numeric USD amount parsed from the raw text column (None/NaN when no USD figure is found).
df["amount_claimed_musd"] = df["amount_claimed_raw"].apply(parse_claim_amount)
# Non-numeric year values become NaN instead of raising.
df["year"] = pd.to_numeric(df["year"], errors="coerce")

# STRIP PRIMARY/SECONDARY/TERTIARY FROM ECONOMIC_SECTOR
# This changes "Primary - A - Agriculture..." to "A - Agriculture...".
# The prefix is removed at the start of the cell AND after every ';' separator,
# so each entry of a multi-sector cell ends up starting with its section letter.
# Group 1 (start-of-string or ';' plus whitespace) is written back unchanged,
# so the separators between entries are preserved.
df['economic_sector'] = df['economic_sector'].str.replace(
    r'(^|;\s*)(?:Primary|Secondary|Tertiary)\s*-\s*',
    r'\1',
    regex=True,
)

# 3. GEOSPATIAL HARMONIZATION (ISO3 CODES)

# Single converter instance, created once and reused by get_iso3_codes for every lookup.
cc = coco.CountryConverter()
# Memo of raw country name -> conversion result; get_iso3_codes fills it lazily
# so each distinct name is passed to the converter only once.
ISO_CACHE = {}

def get_iso3_codes(country_string):
    """Convert a raw state cell into a '; '-joined string of ISO3 codes.

    A cell may name several states separated by semicolons. Each name is
    resolved through the module-level ``cc`` converter and memoised in
    ``ISO_CACHE``. Placeholder entries and names the converter cannot
    resolve are dropped.

    Parameters
    ----------
    country_string : str or missing value
        Raw cell content: one state name, or several separated by ';'
        (with optional surrounding whitespace or newlines).

    Returns
    -------
    str or None
        The unique ISO3 codes, sorted and joined with '; ', or None when the
        input is missing, holds only placeholders, or nothing resolves.
    """
    if pd.isna(country_string):
        return None

    placeholders = {"data not available", "selection deleted", "european union", ""}
    not_found = "not found"

    # Split on ';' with any surrounding whitespace, so ";\n", "; " and
    # ";\r\n" are all accepted as separators.
    parts = (p.strip() for p in re.split(r"\s*;\s*", str(country_string)))
    # Placeholders are filtered per entry, so e.g. "European Union" inside a
    # multi-state cell is skipped rather than sent to the converter.
    names = [p for p in parts if p.lower() not in placeholders]

    codes = set()
    for name in names:
        if name not in ISO_CACHE:
            # An explicit sentinel is required here: with not_found=None the
            # converter hands the unmatched input back verbatim, which would
            # then be mistaken for a valid code.
            ISO_CACHE[name] = cc.convert(names=name, to="ISO3", not_found=not_found)
        cached = ISO_CACHE[name]
        # convert() can return a list rather than a string (e.g. when one name
        # matches several entries); normalise so both shapes are handled.
        candidates = cached if isinstance(cached, (list, tuple)) else [cached]
        codes.update(
            code for code in candidates
            if isinstance(code, str) and code and code != not_found
        )

    return "; ".join(sorted(codes)) if codes else None

# Add one "<column>_iso3" companion column per state column.
for state_col in ("respondent_state", "investor_home_state"):
    iso3_col = f"{state_col}_iso3"
    df[iso3_col] = df[state_col].apply(get_iso3_codes)

# 4. BINARY ENCODING FOR ECONOMIC SECTORS

# Broad ISIC Letters A through O
# NOTE(review): only sections A-O get a flag column; confirm that no row uses a
# later section letter, otherwise those rows end up with all flags set to 0.
ISDS_LETTERS = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O"]

for let in ISDS_LETTERS:
    # Column names are clean: 'sector_A', 'sector_B', etc.
    col_name = f"sector_{let}"
    # Match if an entry starts with the letter (e.g., "A - ...") either at the
    # beginning of the cell or right after a ';' separator. An optional
    # "Primary/Secondary/Tertiary - " tier prefix is tolerated before the letter
    # so that an entry whose prefix was not stripped is still recognised.
    # Non-capturing groups keep pandas from warning about match groups.
    pattern = rf"(?:^|;\s*)(?:(?:Primary|Secondary|Tertiary)\s*-\s*)?{let}\b"
    # Vectorised match; missing sector values count as "no match" (0).
    df[col_name] = (
        df['economic_sector']
        .str.contains(pattern, regex=True, na=False)
        .astype("int64")
    )

# 5. FINAL EXPORT
# Write the processed frame to disk without the pandas index column.
output_csv = "ISDS_processed_dataset_final.csv"
df.to_csv(output_csv, index=False)

# Two status lines: final (rows, columns) shape, then a fixed note.
print(
    f"Pipeline complete. Dimensions: {df.shape}",
    "GHTN classification removed. Dataset is now focused on cleaned NACE sectors.",
    sep="\n",
)

Pipeline complete. Dimensions: (1332, 25)
GHTN classification removed. Dataset is now focused on cleaned NACE sectors.
