In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# ---- Locations of inputs and outputs (relative to this notebook) ----
source_path = '../../data/'
csv_output_path = '../../data/processed/'
figure_output_path = '../../paper/figures/'

# ---- Load the three input tables ----
# raw query data
df_32_raw = pd.read_csv(f'{source_path}repository_queries/500000_32_homicide-female_DE.csv')
# manually tagged articles with json data
df_tag = pd.read_csv(f'{source_path}processed/manual-tag_all_parsedson.csv')
# keyword data
df_key = pd.read_csv(f'{source_path}manual_tag/femicide_keywords.csv')


def _join_de_nuts(codes):
    """Join the distinct NUTS codes starting with 'DE' into one sorted, comma-separated string."""
    de_codes = {code for code in codes if code.startswith('DE')}
    return ', '.join(sorted(de_codes))


# ---- Collapse the raw query rows to a single row per `id` ----
# Missing NUTS values become empty strings so that `startswith` can be applied to every entry.
nuts_filled = df_32_raw['NUTS'].fillna('')
df_32_raw['NUTS'] = nuts_filled.astype(str)

per_id_aggregation = {
    'NUTS': _join_de_nuts,   # all distinct 'DE...' codes seen for this id
    'url': 'first',
    'hostname': 'first',
    'date': 'first',
    'cos_dist': 'first',     # author's note: these values will all be the same within an id
}
df_32 = df_32_raw.groupby('id').agg(per_id_aggregation).reset_index()

In [1]:
# Build the subsets of sampled and manually checked articles.
# Requires `df_tag` and `df_32` from the data-loading cell, so run that cell first.

# --- Sampled articles ---
# Rows of the tag table that have a value for both 'query_32_cosine_bin'
# and 'woman_murdered' (i.e. were sampled and received a T/F outcome).
has_cosine_bin = df_tag['query_32_cosine_bin'].notna()
has_outcome = df_tag['woman_murdered'].notna()
df_sampled = df_tag[has_cosine_bin & has_outcome]
# ids of those rows, used to restrict the query-32 table
sampled_ids = df_sampled['id']
df_32_sampled = df_32.loc[df_32['id'].isin(sampled_ids)].copy()

# --- All manually checked articles ---
# Rows of the tag table that have a value for 'woman_murdered' (a T/F outcome).
df_checked = df_tag.dropna(subset=['woman_murdered'])
# ids of those rows, used to restrict the query-32 table
checked_ids = df_checked['id']
df_32_checked = df_32.loc[df_32['id'].isin(checked_ids)].copy()


# Report the size of every subset as a sanity check.
print(f"Original shape: {df_tag.shape}")
print(f"sample shape: {df_sampled.shape}")
print(f"all checked shape: {df_checked.shape}")
print(f"32 sample shape: {df_32_sampled.shape}")
print(f"32 all checked shape: {df_32_checked.shape}")
# Author's note: the last count is smaller than "all checked" because some
# checked articles were not included in the query 32 results.

NameError: name 'df_tag' is not defined

In [None]:
# Plot finish times of two groups together with their empirical CDFs and
# Dvoretzky-Kiefer-Wolfowitz (DKW) confidence bands, then save the figure as PDF.
#
# NOTE(review): this cell uses names that are not defined in any cell visible
# in this notebook (plt, rgb, df_f, df_m, u_f, u_m, n_f, n_m, format_seconds).
# Presumably it was carried over from another notebook -- confirm before running.


def _ecdf_on_grid(samples, grid):
    """Evaluate the empirical CDF of `samples` at every point of `grid`.

    For each grid value g the result is (number of samples <= g) / len(samples).
    NaN samples sort to the end, so they are never counted as <= g but still
    contribute to the denominator (same as counting `samples <= g` directly).
    """
    ordered = np.sort(np.asarray(samples))
    # side="right" counts the entries that are <= each grid value
    return np.searchsorted(ordered, grid, side="right") / len(ordered)


def _dkw_band(cdf, n, alpha):
    """Return the (lower, upper) DKW band around `cdf` for sample size `n`.

    The half-width is sqrt(log(2 / alpha) / (2 * n)); the band is clipped to [0, 1].
    """
    half_width = np.sqrt(np.log(2 / alpha) / (2 * n))  # same for every grid point
    return np.maximum(cdf - half_width, 0), np.minimum(cdf + half_width, 1)


# plt.rcParams.update(bundles.beamer_moml(rel_width=0.3))
fig, ax = plt.subplots()

# Raw finish times as markers; the two groups occupy the lower and upper half
# of the y-axis (0.5 * u_f in [0, 0.5] and 0.5 * u_m + 0.5 in [0.5, 1],
# assuming u_f / u_m lie in [0, 1] -- TODO confirm where they are defined).
ax.plot(
    df_f["finish_time"],
    0.5 * u_f,
    "o",
    label=f"{n_f} Girls",
    color=rgb.tue_red,
    alpha=0.5,
    mec="none",
    ms=4,
)
ax.plot(
    df_m["finish_time"],
    0.5 * u_m + 0.5,
    "o",
    label=f"{n_m} Boys",
    color=rgb.tue_blue,
    alpha=0.5,
    mec="none",
    ms=4,
)

# empirical CDFs, evaluated on a fixed grid of 100 points between 120 and 280
y = np.linspace(120, 280, 100)
cdf_f = _ecdf_on_grid(df_f["finish_time"], y)
cdf_m = _ecdf_on_grid(df_m["finish_time"], y)

ax.plot(y, (cdf_f), color=rgb.tue_red, alpha=0.75, label="empirical CDF of girls")
ax.plot(y, cdf_m, color=rgb.tue_blue, alpha=0.75, label="empirical CDF of boys")

# One semi-transparent band per level; smaller alpha gives a wider band,
# so the overlapping bands stack visually.
for alpha in [0.5, 0.1, 0.05, 0.01]:
    DKW_f_lower, DKW_f_upper = _dkw_band(cdf_f, n_f, alpha)
    DKW_m_lower, DKW_m_upper = _dkw_band(cdf_m, n_m, alpha)

    ax.fill_between(
        y,
        DKW_f_lower,
        DKW_f_upper,
        color=rgb.tue_red,
        alpha=0.25,
        label=fr"DKW $\alpha={alpha * 100:.0f}\%$ interval",
    )
    # No label on the second band, so each level gets only one legend entry.
    ax.fill_between(
        y,
        DKW_m_lower,
        DKW_m_upper,
        color=rgb.tue_blue,
        alpha=0.25,
        # label=fr"DKW $\alpha={alpha * 100:.0f}\%$ interval of boys",
    )

# x-axis: major ticks every 10 units, minor ticks every 1 unit, labels
# rendered by the custom function format_seconds
ax.xaxis.set_major_locator(plt.MultipleLocator(10))
ax.xaxis.set_minor_locator(plt.MultipleLocator(1))
ax.xaxis.set_major_formatter(plt.FuncFormatter(format_seconds))
ax.legend(loc="center right", framealpha=0.9, facecolor="white")
ax.grid(axis="x", which="major", alpha=0.5, color=rgb.tue_dark)
ax.grid(axis="x", which="minor", alpha=0.5, color=rgb.tue_gray)
# ax.yaxis.set_visible(False)
# y-axis: CDF values in [0, 1] with a grid line every 0.25
ax.set_ylim(0, 1)
ax.yaxis.set_major_locator(plt.MultipleLocator(0.25))
ax.grid(axis="y", which="major", alpha=0.5)
ax.set_ylabel("CDF($t$)")

fig.savefig("erbelauf_U10_DKW.pdf")


# fig.savefig("erbelauf_U10_DKW_narrow.pdf")