# SDG Multilingual Media Narratives — Notebook 00: Setup & Project Config

This notebook sets up a lightweight, reproducible folder structure, environment checks, and shared configuration.

**Project context:** derived from your document “Cross-Cultural Media Narratives: A Multilingual Analysis of SDG Coverage”.

> Notes
- These notebooks are written to run without paid APIs. GDELT is used as a public source.
- Social media APIs (X/Twitter, Weibo, etc.) are left as placeholders because access varies.


In [1]:
import os, sys, platform
import pandas as pd
# Report the interpreter, OS and pandas versions so a run can be reproduced later.
for label, value in (
    ('Python', sys.version),
    ('Platform', platform.platform()),
    ('Pandas', pd.__version__),
):
    print(f'{label}: {value}')


Python: 3.13.5 (main, Jun 11 2025, 15:36:57) [Clang 17.0.0 (clang-1700.0.13.3)]
Platform: macOS-15.5-arm64-arm-64bit-Mach-O
Pandas: 2.3.3


In [2]:
# All project folders are resolved relative to the notebook's working directory.
PROJECT_DIR = os.path.abspath('.')
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
RAW_DIR, PROCESSED_DIR = (os.path.join(DATA_DIR, leaf) for leaf in ('raw', 'processed'))
FIG_DIR, REPORTS_DIR = (os.path.join(PROJECT_DIR, leaf) for leaf in ('figures', 'reports'))

# Single source of truth for the folders that are created and then reported.
project_folders = (DATA_DIR, RAW_DIR, PROCESSED_DIR, FIG_DIR, REPORTS_DIR)
for folder in project_folders:
    os.makedirs(folder, exist_ok=True)

print('Created/checked folders:')
print('\n'.join(f' - {folder}' for folder in project_folders))


Created/checked folders:
 - /Users/sergey/code/sdg-multilingual-media-narratives/data
 - /Users/sergey/code/sdg-multilingual-media-narratives/data/raw
 - /Users/sergey/code/sdg-multilingual-media-narratives/data/processed
 - /Users/sergey/code/sdg-multilingual-media-narratives/figures
 - /Users/sergey/code/sdg-multilingual-media-narratives/reports


## Shared helpers
We define a few helper functions used by later notebooks.


In [3]:

import os
import re
import json
import time
import math
import hashlib
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import pandas as pd
import numpy as np

def sha1_text(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``s`` encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped
    before hashing rather than raising.
    """
    digest = hashlib.sha1()
    digest.update(s.encode("utf-8", errors="ignore"))
    return digest.hexdigest()

def ensure_dir(path: str) -> None:
    """Create directory ``path`` together with any missing parents.

    An already existing directory is left untouched and is not an error.
    """
    os.makedirs(name=path, exist_ok=True)

def now_utc_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision).

    Uses a timezone-aware ``now(timezone.utc)`` instead of ``utcnow()``, which
    is deprecated since Python 3.12. The returned text format is unchanged.
    """
    import datetime

    current = datetime.datetime.now(datetime.timezone.utc)
    # Drop tzinfo so isoformat() does not append "+00:00"; the "Z" suffix marks UTC.
    return current.replace(microsecond=0, tzinfo=None).isoformat() + "Z"

def safe_get(d: dict, *keys, default=None):
    """Walk nested dictionaries along ``keys`` and return the value found.

    Args:
        d: Starting object, normally a dict.
        *keys: Keys to follow, outermost first.
        default: Value returned when a key is missing or an intermediate
            value is not a dict.

    Returns:
        The nested value, ``default`` if the path cannot be followed, or ``d``
        itself when no keys are given.
    """
    node = d
    for key in keys:
        if isinstance(node, dict) and key in node:
            node = node[key]
        else:
            return default
    return node

def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Non-breaking spaces (U+00A0) are turned into regular spaces first.
    Anything that is not a ``str`` yields an empty string.
    """
    if isinstance(text, str):
        without_nbsp = text.replace("\u00a0", " ")
        collapsed = re.sub(r"\s+", " ", without_nbsp)
        return collapsed.strip()
    return ""

def parse_gdelt_date(date_str: str):
    """Parse a GDELT timestamp string into a UTC-aware pandas ``Timestamp``.

    A 14-digit string is parsed with the explicit ``YYYYMMDDHHMMSS`` format;
    any other string goes through pandas' general parser. Non-string input and
    unparseable values give ``pd.NaT``.
    """
    if not isinstance(date_str, str):
        return pd.NaT

    parse_kwargs = {"errors": "coerce", "utc": True}
    # Only pin the format for the compact form; otherwise let pandas infer it.
    if re.fullmatch(r"\d{14}", date_str) is not None:
        parse_kwargs["format"] = "%Y%m%d%H%M%S"
    return pd.to_datetime(date_str, **parse_kwargs)

def language_bucket(lang: str) -> str:
    """Map a language label to a coarse, lower-cased bucket.

    Empty or non-string input gives ``"unknown"``. The regional Chinese tags
    ``zh-cn``, ``zh-tw`` and ``zh-hk`` are merged into ``"zh"``; every other
    label (including ``zh``, ``en``, ``es``, ``ru``) is returned lower-cased
    and otherwise unchanged.
    """
    if not (isinstance(lang, str) and lang):
        return "unknown"

    code = lang.lower()
    # Only the regional Chinese variants need remapping; the rest pass through.
    if code in ("zh-cn", "zh-tw", "zh-hk"):
        return "zh"
    return code


In [4]:
# Save helpers to a local python module so later notebooks can import it.
# The source below is a hand-maintained copy of the helper definitions; it is
# not generated from the live functions, so edit both places together.
# Change from the previous literal: now_utc_iso() no longer calls the
# deprecated datetime.utcnow() (Python 3.12+); its output format is unchanged.
# A raw string is used so backslashes (\u00a0, \s, \d) are written verbatim.
_HELPERS_SOURCE = r'''
import os
import re
import json
import time
import math
import hashlib
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import pandas as pd
import numpy as np

def sha1_text(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest()

def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)

def now_utc_iso() -> str:
    import datetime
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.replace(microsecond=0, tzinfo=None).isoformat() + "Z"

def safe_get(d: dict, *keys, default=None):
    cur = d
    for k in keys:
        if not isinstance(cur, dict) or k not in cur:
            return default
        cur = cur[k]
    return cur

def normalize_whitespace(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\s+", " ", text).strip()
    return text

def parse_gdelt_date(date_str: str):
    # GDELT often returns YYYYMMDDHHMMSS
    if not isinstance(date_str, str):
        return pd.NaT
    if re.fullmatch(r"\d{14}", date_str):
        return pd.to_datetime(date_str, format="%Y%m%d%H%M%S", errors="coerce", utc=True)
    # fallback
    return pd.to_datetime(date_str, errors="coerce", utc=True)

def language_bucket(lang: str) -> str:
    # normalize a few common labels
    if not isinstance(lang, str) or not lang:
        return "unknown"
    lang = lang.lower()
    mapping = {
        "zh-cn": "zh",
        "zh-tw": "zh",
        "zh-hk": "zh",
        "zh": "zh",
        "en": "en",
        "es": "es",
        "ru": "ru",
    }
    return mapping.get(lang, lang)
'''

helpers_path = os.path.join(PROJECT_DIR, 'sdg_helpers.py')
with open(helpers_path, 'w', encoding='utf-8') as f:
    f.write(_HELPERS_SOURCE)
print('Wrote', helpers_path)


Wrote /Users/sergey/code/sdg-multilingual-media-narratives/sdg_helpers.py


## Parquet I/O (pyarrow-first)

On some Python 3.13 + pandas/pyarrow combos, `pd.read_parquet()` / `df.to_parquet()` can fail
due to Arrow extension-type registry issues. We therefore use **pyarrow** read/write helpers.


In [None]:
import os
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

def write_parquet(df: pd.DataFrame, path: str, compression: str = "snappy") -> None:
    """Save ``df`` to ``path`` as parquet using pyarrow directly.

    The frame is converted to an Arrow table without its index. The write is
    first attempted with ``compression``; if that attempt raises any
    ``Exception`` (e.g. the codec is unavailable), the table is written again
    uncompressed. An error from the second attempt propagates to the caller.
    """
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)

    def _store(codec) -> None:
        pq.write_table(arrow_table, path, compression=codec)

    try:
        _store(compression)
    except Exception:
        # Best-effort fallback: retry without compression.
        _store(None)

def read_parquet(path: str) -> pd.DataFrame:
    """Load the parquet file at ``path`` with pyarrow and convert it to a DataFrame."""
    arrow_table = pq.read_table(path)
    return arrow_table.to_pandas()

# Persist the parquet helpers as an importable module for the other notebooks.
# The module text is kept as one literal so it reads like the file it produces.
_PARQUET_MODULE_SOURCE = """\
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

def write_parquet(df: pd.DataFrame, path: str, compression: str = 'snappy') -> None:
    table = pa.Table.from_pandas(df, preserve_index=False)
    try:
        pq.write_table(table, path, compression=compression)
    except Exception:
        pq.write_table(table, path, compression=None)

def read_parquet(path: str) -> pd.DataFrame:
    return pq.read_table(path).to_pandas()
"""

parquet_path = os.path.join(os.path.abspath("."), "sdg_parquet.py")
with open(parquet_path, "w", encoding="utf-8") as f:
    f.write(_PARQUET_MODULE_SOURCE)
print("Wrote", parquet_path)


## Keywords (starter)
Keyword tagging is a **baseline** (weak supervision). You can replace it with a multilingual classifier later.


In [5]:

# Minimal starter keyword sets (extend as you like).
# IMPORTANT: keyword lists are imperfect; treat as weak supervision.
#
# Structure: each key is an SDG label of the form "SDG<n>_<Short_Name>" and each
# value is a flat list of keyword strings mixing English, Russian, Chinese and
# Spanish terms for that goal.
# NOTE(review): several entries look like truncated word stems (e.g. "бедност",
# "здоров", "femin"), presumably meant for substring matching — confirm against
# the tagging code in the later notebooks.
# NOTE(review): short or generic tokens such as "UN", "poor" or "water" may
# over-match depending on how the matcher treats case and word boundaries — verify.
SDG_KEYWORDS = {
    "SDG1_No_Poverty": ["poverty", "poor", "low-income", "homeless", "соцзащита", "бедност", "贫困", "pobreza"],
    "SDG2_Zero_Hunger": ["hunger", "food security", "malnutrition", "famine", "голод", "饥饿", "hambre"],
    "SDG3_Good_Health": ["health", "pandemic", "hospital", "vaccine", "well-being", "covid", "здоров", "疫苗", "salud"],
    "SDG4_Quality_Education": ["education", "school", "university", "literacy", "образован", "教育", "educación"],
    "SDG5_Gender_Equality": ["gender", "women", "girls", "equality", "femin", "гендер", "妇女", "igualdad de género"],
    "SDG6_Clean_Water": ["water", "sanitation", "wastewater", "clean drinking", "вода", "卫生", "agua potable"],
    "SDG7_Clean_Energy": ["renewable", "solar", "wind power", "clean energy", "能源转型", "возобновляем", "energía renovable"],
    "SDG8_Decent_Work": ["jobs", "employment", "labor", "wages", "economic growth", "занятость", "就业", "empleo"],
    "SDG9_Industry_Innovation": ["innovation", "infrastructure", "industry", "technology", "инфраструктур", "创新", "infraestructura"],
    "SDG10_Reduced_Inequalities": ["inequality", "inequalities", "migration", "minorities", "неравенств", "不平等", "desigualdad"],
    "SDG11_Sustainable_Cities": ["cities", "urban", "housing", "transport", "resilience", "город", "城市", "ciudades sostenibles"],
    "SDG12_Responsible_Consumption": ["recycling", "waste", "circular economy", "consumption", "отход", "循环经济", "consumo responsable"],
    "SDG13_Climate_Action": ["climate change", "global warming", "carbon emissions", "net zero", "парников", "气候变化", "cambio climático"],
    "SDG14_Life_Below_Water": ["ocean", "marine", "fishery", "plastic pollution", "海洋", "океан", "océano"],
    "SDG15_Life_On_Land": ["biodiversity", "forest", "deforestation", "wildlife", "生物多样性", "лес", "biodiversidad"],
    "SDG16_Peace_Justice": ["corruption", "justice", "conflict", "rule of law", "коррупц", "法治", "justicia"],
    "SDG17_Partnerships": ["partnership", "multilateral", "UN", "SDGs", "cooperation", "партнерств", "合作", "alianzas"],
}


In [6]:
# Save default SDG keywords to JSON for reuse/editing
import json
kw_path = os.path.join(PROJECT_DIR, 'sdg_keywords.json')
# ensure_ascii=False keeps the Cyrillic/Chinese/accented keywords readable in the file.
keywords_json = json.dumps(SDG_KEYWORDS, ensure_ascii=False, indent=2)
with open(kw_path, 'w', encoding='utf-8') as f:
    f.write(keywords_json)
print('Wrote', kw_path)


Wrote /Users/sergey/code/sdg-multilingual-media-narratives/sdg_keywords.json
