In [7]:
import pandas as pd
import numpy as np

# ---------- helpers ----------

def nuts_filter(df):
    """
    Collapse the frame to one row per article ``id`` and keep German articles only.

    For every ``id`` the distinct NUTS codes that start with ``"DE"`` are
    sorted and joined with ``", "``; ``url``, ``date`` and ``cos_dist`` are
    reduced with pandas' ``"first"`` aggregation. Articles whose joined NUTS
    string is empty (no German code at all) are dropped.

    The input frame is not modified. The result has the columns
    ``id``, ``NUTS``, ``url``, ``date``, ``cos_dist``.
    """
    working = df.copy()

    # Missing codes become "" so the startswith() test below never sees NaN.
    working["NUTS"] = working["NUTS"].fillna("").astype(str)

    def _german_codes(codes):
        # A set removes duplicate codes; sorting keeps the joined string stable.
        german = {code for code in codes if code.startswith("DE")}
        return ", ".join(sorted(german))

    aggregations = {
        "NUTS": _german_codes,
        "url": "first",
        "date": "first",
        "cos_dist": "first",
    }
    per_article = working.groupby("id").agg(aggregations).reset_index()

    # An empty string means the article had no German NUTS code.
    has_german_code = per_article["NUTS"] != ""
    return per_article[has_german_code].copy()


def date_filter(df, start_year, end_year):
    """
    Return the rows whose ``date`` falls in ``start_year``..``end_year`` (inclusive).

    The ``date`` column is parsed with ``pd.to_datetime(errors="coerce")``, so
    unparseable or missing dates become ``NaT`` and are dropped by the year
    test. The input frame is not modified; the returned copy carries the
    parsed datetime column.
    """
    parsed = df.copy()
    parsed["date"] = pd.to_datetime(parsed["date"], errors="coerce")

    # NaT rows have a NaN year, which is never "between" the bounds.
    years = parsed["date"].dt.year
    in_range = years.between(start_year, end_year)
    return parsed.loc[in_range].copy()


# ---------- main script ----------

# Pipeline: load the CSV, keep German articles (one row per id), restrict to
# the study years, bin by cosine distance, draw a fixed-size random sample
# from every bin, and write the sample out for manual labeling.

START_YEAR = 2017
END_YEAR = 2023
BIN_START = 0.16      # default lowest bin edge
BIN_WIDTH = 0.02      # width of every cosine-distance bin
SAMPLES_PER_BIN = 150
RANDOM_SEED = 42      # fixed seed so the drawn sample is reproducible

file_path = "/Users/zoekirsman/Desktop/femicide_29-32/500000_32_homicide-female_DE.csv"
df = pd.read_csv(file_path)

print("Original rows:", len(df))

df = nuts_filter(df)
print("Rows after German NUTS filter:", len(df))

df = date_filter(df, START_YEAR, END_YEAR)
print(f"Rows after date filter ({START_YEAR}–{END_YEAR}):", len(df))

# ---------- diagnostics ----------

print("\nDiagnostics after filtering:")
print("  Unique IDs:", df["id"].nunique())
print("  Total rows:", len(df))
print("  Year range:", df["date"].dt.year.min(), "to", df["date"].dt.year.max())

print("\nSample NUTS values:")
print(df["NUTS"].head(10))

# ---------- cosine bins & sampling ----------

min_cos = df["cos_dist"].min()
print("min cos", min_cos)
max_cos = df["cos_dist"].max()
print("max cos", max_cos)

# With no usable cos_dist values the bin edges cannot be computed; fail with
# a readable message instead of an opaque numpy error.
if pd.isna(min_cos) or pd.isna(max_cos):
    raise ValueError(
        "No articles with a cos_dist value remain after filtering; "
        "nothing to bin or sample."
    )

# Start at BIN_START, but extend downwards (on the same BIN_WIDTH grid) when
# the data contains smaller distances. Otherwise pd.cut would assign those
# rows to no bin and they would silently be excluded from the sample.
lowest_edge = min(BIN_START, np.floor(min_cos / BIN_WIDTH) * BIN_WIDTH)
if lowest_edge > min_cos:
    # Floating-point rounding put the edge just above the minimum.
    lowest_edge -= BIN_WIDTH
bins = np.arange(lowest_edge, max_cos + BIN_WIDTH, BIN_WIDTH)

# include_lowest=True so a value exactly on the first edge is still binned.
df["cosine_bin"] = pd.cut(df["cos_dist"], bins=bins, include_lowest=True)

print("\nArticles per cosine bin:")
bin_counts = df["cosine_bin"].value_counts().sort_index()
print(bin_counts)

samples = []
per_bin_taken = {}

for b in df["cosine_bin"].cat.categories:
    bin_df = df[df["cosine_bin"] == b]
    if bin_df.empty:
        continue
    # Take the whole bin when it holds fewer than SAMPLES_PER_BIN articles.
    n = min(SAMPLES_PER_BIN, len(bin_df))
    per_bin_taken[str(b)] = n
    samples.append(bin_df.sample(n, random_state=RANDOM_SEED))

label_df = pd.concat(samples).reset_index(drop=True)

# ---------- sampling diagnostics ----------

print("\nSampling plan:")
for b in df["cosine_bin"].cat.categories:
    b_str = str(b)
    available = bin_counts.get(b, 0)
    taken = per_bin_taken.get(b_str, 0)
    print(f"  {b_str}: available={available}, taken={taken}")

print("\nTotal sampled:", len(label_df))

# ---------- output ----------

# Only these columns are written to the labeling file.
label_df = label_df[["id", "url", "NUTS", "cos_dist", "cosine_bin"]]

out_path = "/Users/zoekirsman/Desktop/samples_manual_tagging.csv"
label_df.to_csv(out_path, index=False)
print("\nSaved labeling CSV to:", out_path)

Original rows: 1143913
Rows after German NUTS filter: 375215
Rows after date filter (2017–2023): 364570

Diagnostics after filtering:
  Unique IDs: 364570
  Total rows: 364570
  Year range: 2017 to 2023

Sample NUTS values:
0            DEF08, DEF0E
1     DEA12, DEA1F, DEA34
2                   DE300
3     DE222, DE914, DEB1B
4            DEA47, DEA5B
5                   DE300
6            DE212, DEA22
8            DE261, DE264
9            DEB31, DEB3I
10    DE718, DE71E, DE721
Name: NUTS, dtype: object
min cos 0.16486917
max cos 0.2714406

Articles per cosine bin:
cosine_bin
(0.159, 0.18]        20
(0.18, 0.2]        2110
(0.2, 0.22]       19062
(0.22, 0.24]      57751
(0.24, 0.26]     135993
(0.26, 0.28]     149634
Name: count, dtype: int64

Sampling plan:
  (0.159, 0.18]: available=20, taken=20
  (0.18, 0.2]: available=2110, taken=150
  (0.2, 0.22]: available=19062, taken=150
  (0.22, 0.24]: available=57751, taken=150
  (0.24, 0.26]: available=135993, taken=150
  (0.26, 0.28]: av