# SDG Multilingual Media Narratives — Notebook 02: Preprocess & SDG Tagging

We:
1) load raw articles,
2) clean text,
3) normalize language labels,
4) assign SDG tags via keyword matching.

This corresponds to the preprocessing/annotation pipeline described in the project document.


In [12]:
import os, json, re
import pandas as pd
import numpy as np

from sdg_helpers import normalize_whitespace, ensure_dir, language_bucket

# Prefer the project's parquet helpers; if that module cannot be imported,
# define equivalent helpers directly on top of pyarrow.
try:
    from sdg_parquet import read_parquet, write_parquet
except ImportError:
    # Only a failed import triggers the fallback; any other error raised while
    # importing sdg_parquet is a real bug and should surface.
    import warnings

    import pyarrow as pa
    import pyarrow.parquet as pq

    def write_parquet(df: pd.DataFrame, path: str, compression: str = "snappy") -> None:
        """Write ``df`` to ``path`` as a parquet file, dropping the index.

        Args:
            df: DataFrame to persist.
            path: Destination file path.
            compression: Codec name handed to ``pyarrow.parquet.write_table``.

        If the write with ``compression`` fails, it is retried without
        compression and a ``RuntimeWarning`` reports the fallback. An error
        from the retry propagates to the caller.
        """
        table = pa.Table.from_pandas(df, preserve_index=False)
        try:
            pq.write_table(table, path, compression=compression)
        except Exception as exc:
            # Best-effort retry, but make the downgrade visible to the user.
            warnings.warn(
                f"Writing {path!r} with compression={compression!r} failed "
                f"({exc!r}); retrying uncompressed.",
                RuntimeWarning,
                stacklevel=2,
            )
            pq.write_table(table, path, compression=None)

    def read_parquet(path: str) -> pd.DataFrame:
        """Read the parquet file at ``path`` into a pandas DataFrame."""
        return pq.read_table(path).to_pandas()

# Resolve the project folders relative to the current working directory.
PROJECT_DIR = os.path.abspath('.')
RAW_DIR, PROCESSED_DIR = (
    os.path.join(PROJECT_DIR, 'data', leaf) for leaf in ('raw', 'processed')
)
ensure_dir(PROCESSED_DIR)

# Load the deduplicated articles (unique by URL) produced by notebook 01.
dedup_path = os.path.join(RAW_DIR, 'gdelt_articles_dedup_2024_2025.parquet')
df = read_parquet(dedup_path)
df.shape



(56000, 10)

In [13]:
# Build one text field per article: "<title>. <snippet>", with missing parts
# treated as empty strings, then passed through normalize_whitespace.
titles = df['title'].fillna('')
snippets = df['snippet'].fillna('')
df['text'] = (titles + '. ' + snippets).apply(normalize_whitespace)

# Map the raw language label through language_bucket (missing -> '').
df['lang'] = df['language'].fillna('').apply(language_bucket)

# Basic cleanup: drop rows whose combined text is shorter than 40 characters.
long_enough = df['text'].str.len() >= 40
df = df.loc[long_enough].copy()
df.shape


(50330, 12)

In [14]:
# Read the SDG keyword sets from the JSON file in the project root.
kw_path = os.path.join(PROJECT_DIR, 'sdg_keywords.json')
with open(kw_path, encoding='utf-8') as fh:
    SDG_KEYWORDS = json.load(fh)

# Peek at the first five SDG names.
list(SDG_KEYWORDS)[:5]


['SDG1_No_Poverty',
 'SDG2_Zero_Hunger',
 'SDG3_Good_Health',
 'SDG4_Quality_Education',
 'SDG5_Gender_Equality']

## SDG tagging
We allow **multi-label** assignment. Each item can map to multiple SDGs.


In [15]:
def compile_keyword_patterns(sdg_keywords: dict, *, whole_words: bool = False) -> dict:
    """Compile one case-insensitive alternation regex per SDG.

    Args:
        sdg_keywords: Mapping of SDG name to an iterable of keyword strings.
            Entries that are not strings, or that are empty once surrounding
            whitespace is stripped, are ignored. A ``None`` keyword list is
            treated as empty.
        whole_words: When True, a keyword only matches if it is not directly
            preceded or followed by a word character, so ``"aid"`` no longer
            matches inside ``"said"``. Defaults to False (plain substring
            matching). Note that in scripts written without spaces between
            words (e.g. Chinese) neighbouring characters count as word
            characters, so keywords in such scripts will rarely match with
            this option enabled.

    Returns:
        Mapping of SDG name to a compiled pattern whose single capturing
        group is the alternation of the escaped keywords. SDGs without any
        usable keyword are omitted.
    """
    patterns = {}
    for sdg, kws in sdg_keywords.items():
        # Strip first, then drop blanks: an empty alternative ("a||b") would
        # match at position 0 of every text and tag every article.
        stripped = (k.strip() for k in (kws or ()) if isinstance(k, str))
        kws_escaped = [re.escape(k) for k in stripped if k]
        if not kws_escaped:
            continue
        body = r'(' + r'|'.join(kws_escaped) + r')'
        if whole_words:
            # Lookarounds instead of \b so keywords that begin or end with a
            # non-word character (e.g. "#MeToo") still work.
            body = r'(?<!\w)' + body + r'(?!\w)'
        patterns[sdg] = re.compile(body, flags=re.IGNORECASE)
    return patterns

# Compile the per-SDG regexes once, up front, from the loaded keyword sets.
PATTERNS = compile_keyword_patterns(SDG_KEYWORDS)
# Number of SDGs that ended up with a compiled pattern.
len(PATTERNS)


17

In [16]:
def tag_sdgs(text: str, patterns: dict):
    """Return the SDG names whose pattern is found somewhere in ``text``.

    Args:
        text: Article text to scan. Anything that is not a non-empty string
            yields an empty result.
        patterns: Mapping of SDG name to an object with a ``search`` method
            (compiled regex).

    Returns:
        List of matching SDG names, in the iteration order of ``patterns``.
    """
    if not (isinstance(text, str) and text):
        return []
    return [name for name, regex in patterns.items() if regex.search(text)]

# Tag every article, then record how many SDGs each one matched.
df['sdg_labels'] = df['text'].apply(tag_sdgs, patterns=PATTERNS)
df['n_sdgs'] = df['sdg_labels'].apply(len)
preview_cols = ['text', 'lang', 'n_sdgs']
df[preview_cols].head()


Unnamed: 0,text,lang,n_sdgs
0,Ελληνική οικονομία 2024 : Οι ανοιχτές προκλήσε...,greek,0
1,Beacon Hill Roll Call : Senate support of Gov ...,english,0
2,Energy bill price hike takes effect as record ...,english,0
3,Κίνα : Έτοιμος να συνεργαστεί με τον Μπάιντεν ...,greek,0
4,2024 är ett supervalår : Miljarder väntas rösta.,swedish,0


In [17]:
# Retain only the articles that matched one or more SDGs.
has_sdg = df['n_sdgs'] > 0
df_tagged = df.loc[has_sdg].copy()
df_tagged.shape


(17795, 14)

In [18]:
# Long format for easy aggregation: one row per (article, SDG) pair.
long = (
    df_tagged
    .rename(columns={'sdg_labels': 'sdg'})
    .explode('sdg')
)
long.loc[:, ['id', 'lang', 'sdg']].head()


Unnamed: 0,id,lang,sdg
5,6f202b4df3957add3fa17de65672a82904d9fcb2,croatian,SDG17_Partnerships
7,479d7917a11d10a913745b60c1d43ba23a533838,indonesian,SDG17_Partnerships
11,ce53a613efdffa49adbcbff281ed77c3b2cce8d0,romanian,SDG17_Partnerships
13,034d93aab57c33f94bd5d542a225af9af9d26faf,english,SDG3_Good_Health
20,2666157994a9949e23d0adc9ee279c01ddaea724,german,SDG17_Partnerships


In [19]:
# Destination files inside the processed-data folder.
out_long = os.path.join(PROCESSED_DIR, 'articles_sdg_long.parquet')
out_wide = os.path.join(PROCESSED_DIR, 'articles_tagged.parquet')

# Persist the long table first, then the one-row-per-article table.
for frame, target in ((long, out_long), (df_tagged, out_wide)):
    write_parquet(frame, target)

(out_long, out_wide)


('/Users/sergey/code/sdg-multilingual-media-narratives/data/processed/articles_sdg_long.parquet',
 '/Users/sergey/code/sdg-multilingual-media-narratives/data/processed/articles_tagged.parquet')

## Quick sanity checks


In [20]:
# Ten most frequent SDG labels across the exploded (article, SDG) rows.
long['sdg'].value_counts().head(10)


sdg
SDG17_Partnerships          11722
SDG3_Good_Health             2200
SDG13_Climate_Action          887
SDG7_Clean_Energy             748
SDG4_Quality_Education        711
SDG1_No_Poverty               622
SDG5_Gender_Equality          566
SDG9_Industry_Innovation      555
SDG8_Decent_Work              514
SDG6_Clean_Water              461
Name: count, dtype: int64

In [21]:
# Ten most frequent language buckets; counted on the exploded table, so an
# article with several SDG labels contributes one row per label.
long['lang'].value_counts().head(10)


lang
english       13227
spanish        3175
german         1230
french          802
italian         543
indonesian      337
portuguese      306
romanian        218
chinese         203
turkish         120
Name: count, dtype: int64