In [None]:
import pandas as pd
import joblib
import os
import numpy as np

In [None]:
# Load the fastText prediction table. Later cells rely on its `id` column and
# on one `__label__*` score column per language.
fasttext_pred_path = '/home/manuel/Documents/oii/moderation_workforce/data/youtube/ryan_raw_samples/processed_fasttext_pred.parquet'
df = pd.read_parquet(fasttext_pred_path)
df.head()

In [None]:
# Row/column count of the prediction table before it is restricted to the sampled ids.
df.shape

In [None]:
# Load the CSV of sampled videos. Later cells rely on its `id` and `duration` columns.
sampled_ids_path = '/home/manuel/Documents/oii/moderation_workforce/data/youtube/ryan_raw_samples/random_prefix_26000_20240617_163129_706620 trimmed.csv'
id_df = pd.read_csv(sampled_ids_path)
id_df.head()

In [None]:
# Keep one row per video id (the first occurrence) so that the later merge on
# `id` cannot multiply rows of `df`.
id_df = id_df.drop_duplicates(subset=['id'])

In [None]:
# Number of unique sampled ids (rows) after de-duplication.
id_df.shape

In [None]:
# Plain Python list of the sampled ids, used to filter `df` in the next cell.
id_list = id_df['id'].tolist()

In [None]:
# Restrict the predictions to the sampled ids.
# `.copy()` makes `df` an independent frame: the calibration cell below adds
# new columns to it, which on a boolean-filtered selection would otherwise
# trigger pandas' SettingWithCopyWarning.
df = df.loc[df['id'].isin(id_list)].copy()

In [None]:
# Labels (with the '__label__' prefix) of the languages analysed individually.
# Each label is also the name of a score column in `df`.
langs = [
    '__label__' + code
    for code in (
        'bul_Cyrl', 'hrv_Latn', 'ces_Latn', 'dan_Latn', 'est_Latn',
        'fin_Latn', 'ell_Grek', 'hun_Latn', 'lvs_Latn', 'lit_Latn',
        'ron_Latn', 'slk_Latn', 'slv_Latn', 'swe_Latn', 'nld_Latn',
        'eng_Latn', 'fra_Latn', 'deu_Latn', 'ita_Latn', 'pol_Latn',
        'por_Latn', 'spa_Latn',
    )
]

## calibration

In [None]:
# Directory holding one `isotonic_regression_model_<lang>.joblib` file per language.
model_path = '/home/manuel/Documents/oii/moderation_workforce/data/calibration_fasttext/youtube/calibration_models'

In [None]:
# Calibrate every language's raw score column with its saved model and store
# the result next to it as `calibrated_<label>`.
# NOTE(review): the models are presumably fitted isotonic regressors (per the
# file names) - confirm. joblib files are pickles: only load trusted ones.
for lang in langs:
    lang_abbr = lang.replace('__label__', '')
    model_file = os.path.join(
        model_path,
        f"isotonic_regression_model_{lang_abbr}.joblib",
    )
    model = joblib.load(model_file)
    calibrated_col = f'calibrated_{lang}'
    df[calibrated_col] = model.transform(df[lang])

## Load moderator count data

In [None]:
# Empty placeholder; no visible cell of this notebook reads or fills it.
estimate_dict = {}

In [None]:
# Load the moderator-count sheet. The next cell keeps its 'Language' and
# 'Average moderator count' columns.
mod_count_path = '/home/manuel/Documents/oii/moderation_workforce/data/youtube/moderator_workforce_data - youtube_moderator_count_new.csv'
mod_count_df = pd.read_csv(mod_count_path)
mod_count_df.head()

In [None]:
# Keep only the language name and its average moderator count, under short
# snake_case column names.
mod_count_df = mod_count_df[['Language', 'Average moderator count']].rename(
    columns={
        'Language': 'language_full',
        'Average moderator count': 'avg_mod_count',
    }
)
mod_count_df.head()

In [None]:
# Show up to 50 rows to inspect the moderator counts.
mod_count_df.head(50)

In [None]:
# Full language name (as spelled in `language_full`) -> language code without
# the '__label__' prefix. Irish and Maltese have codes here but are not in `langs`.
lang_map = {
    "Lithuanian": "lit_Latn",
    "Hungarian": "hun_Latn",
    "Polish": "pol_Latn",
    "Latvian": "lvs_Latn",
    "Czech": "ces_Latn",
    "Greek": "ell_Grek",
    "German": "deu_Latn",
    "Dutch": "nld_Latn",
    "Finnish": "fin_Latn",
    "Slovak": "slk_Latn",
    "Bulgarian": "bul_Cyrl",
    "Danish": "dan_Latn",
    "Italian": "ita_Latn",
    "Swedish": "swe_Latn",
    "French": "fra_Latn",
    "Croatian": "hrv_Latn",
    "Romanian": "ron_Latn",
    "English": "eng_Latn",
    "Spanish": "spa_Latn",
    "Portuguese": "por_Latn",
    "Estonian": "est_Latn",
    "Irish": "gle_Latn",
    "Maltese": "mlt_Latn",
    "Slovenian": "slv_Latn",
    "Agnostic": None  # no language code; skipped when the mapping is inverted later
}

# Add the language code for each row; names missing from `lang_map` become NaN.
mod_count_df["lang"] = mod_count_df["language_full"].map(lang_map)

In [None]:
# Check that the new `lang` column was added.
mod_count_df.head()

In [None]:
# Show up to 30 rows to spot language names that did not map to a code.
mod_count_df.head(30)

In [None]:
# Lookup: language code -> average moderator count. Rows whose `lang` is
# missing are included too (under a missing-value key); if a code repeats,
# the last row wins.
lang_to_avg_mod_count = dict(zip(mod_count_df["lang"], mod_count_df["avg_mod_count"]))

## estimation

In [None]:
# Bootstrap, per language, the moderator count per thousand estimated daily videos.
# `langs` holds '__label__'-prefixed labels, e.g. '__label__lit_Latn'.
results = {}

n_bootstrap = 5000
# Seeded generator: the resamples (and therefore the means and CIs) are
# identical every time this cell is re-run.
rng = np.random.default_rng(0)
# Factor that scales a sum over the sample up to a daily total.
# NOTE(review): presumably (total daily videos / number of sampled videos) - confirm.
sample_to_daily_scale = 571443/7009

for lang in langs:
    lang_abbr = lang.replace('__label__', '')
    print(lang)
    col_name = f"calibrated_{lang}"
    boot_sums = []
    mod_count = lang_to_avg_mod_count[lang_abbr]
    # Only this language's column is needed, so resample the Series instead of
    # the whole frame.
    calibrated_scores = df[col_name]
    # Bootstrap sampling
    for _ in range(n_bootstrap):
        # Same size as df, drawn with replacement.
        sample = calibrated_scores.sample(frac=1, replace=True, random_state=rng)
        count_estimate = sample.sum()
        daily_overall_count_estimate = count_estimate * sample_to_daily_scale
        # Skip resamples with no estimated videos (would divide by zero).
        if daily_overall_count_estimate>0:
            normalized_mod_count = mod_count/(daily_overall_count_estimate/1000)
            boot_sums.append(normalized_mod_count)

    boot_sums = np.array(boot_sums)

    # Compute stats; if every resample was skipped there is nothing to
    # summarise, so record NaN instead of failing on an empty array.
    if boot_sums.size:
        mean_sum = boot_sums.mean()
        lower_ci, upper_ci = np.percentile(boot_sums, [2.5, 97.5])
    else:
        mean_sum = lower_ci = upper_ci = np.nan

    results[lang] = {
        "mean": mean_sum,
        "ci_lower": lower_ci,
        "ci_upper": upper_ci
    }

# Convert to DataFrame for nice viewing: one row per label, label in column 'lang'.
bootstrap_df = pd.DataFrame.from_dict(results, orient="index")
bootstrap_df = bootstrap_df.reset_index(names="lang")


In [None]:
# Show the bootstrap mean and 95% percentile interval for every language.
bootstrap_df.head(50)

In [None]:
# Strip the '__label__' prefix to get the bare language code.
bootstrap_df['lang_abbr'] = [
    label.replace('__label__', '') for label in bootstrap_df['lang']
]

In [None]:
# Check the new `lang_abbr` column.
bootstrap_df.head()

In [None]:
# Invert lang_map: fasttext code → full language name
# Reverse `lang_map` (language code -> full name), leaving out entries that
# have no code.
abbr_to_full = {}
for full_name, code in lang_map.items():
    if code is None:
        continue
    abbr_to_full[code] = full_name

# Attach the human-readable language name used for the plot labels.
bootstrap_df['lang_full'] = bootstrap_df['lang_abbr'].map(abbr_to_full)


In [None]:
# Check the new `lang_full` column.
bootstrap_df.head()

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the bootstrap means, largest first, with the
# 2.5-97.5 percentile interval drawn as asymmetric error bars.
bootstrap_df_sorted = bootstrap_df.sort_values('mean', ascending=False)
y_pos = range(len(bootstrap_df_sorted))

means = bootstrap_df_sorted['mean']
err_below = means - bootstrap_df_sorted['ci_lower']
err_above = bootstrap_df_sorted['ci_upper'] - means

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(
    y=y_pos,
    width=means,
    xerr=[err_below, err_above],
    color='skyblue',
    ecolor='gray',
    capsize=4,
)
ax.set_yticks(y_pos)
ax.set_yticklabels(bootstrap_df_sorted['lang_full'])
ax.invert_yaxis()  # largest value at the top
ax.set_xlabel('Moderator count per thousand daily videos')
ax.set_title('YouTube')
fig.tight_layout()
plt.show()


## Estimation, weighted by video duration

In [None]:
# Shape before merging in `duration`, for comparison with the merged frame.
df.shape

In [None]:
# Attach each video's `duration` from `id_df` (inner join on `id`).
# Any `duration` column already on `df` is left out first, so re-running this
# cell does not produce `duration_x` / `duration_y` and break the
# `sample['duration']` lookup in the bootstrap below.
df = (
    df[[col for col in df.columns if col != 'duration']]
    .merge(id_df[['id', 'duration']], on=['id'])
)
df.head()

In [None]:
# Bootstrap, per language, the moderator count per estimated daily video hour.
# `langs` holds '__label__'-prefixed labels, e.g. '__label__lit_Latn'.
results = {}

n_bootstrap = 1000
# Seeded generator: the resamples (and therefore the means and CIs) are
# identical every time this cell is re-run.
rng = np.random.default_rng(0)
# Factor that scales a sum over the sample up to a daily total.
# NOTE(review): presumably (total daily videos / number of sampled videos) - confirm.
sample_to_daily_scale = 571443/7009

for lang in langs:
    lang_abbr = lang.replace('__label__', '')
    print(lang)
    col_name = f"calibrated_{lang}"
    boot_sums = []
    mod_count = lang_to_avg_mod_count[lang_abbr]
    # Only the score and duration columns are needed for the resamples.
    lang_frame = df[[col_name, 'duration']]
    # Bootstrap sampling
    for _ in range(n_bootstrap):
        # Same size as df, drawn with replacement.
        sample = lang_frame.sample(frac=1, replace=True, random_state=rng)
        # Duration-weighted score total, computed without writing a throwaway
        # column onto every resample.
        count_estimate = (sample[col_name] * sample['duration']).sum()
        # NOTE(review): the two divisions by 60 assume `duration` is in seconds - confirm.
        daily_overall_hours_estimate = (count_estimate/60)/60 * sample_to_daily_scale
        # Guard on the hours estimate computed just above. The previous code
        # tested `daily_overall_count_estimate`, a variable left over from the
        # earlier per-video bootstrap cell.
        if daily_overall_hours_estimate>0:
            normalized_mod_count = mod_count/daily_overall_hours_estimate
            boot_sums.append(normalized_mod_count)

    boot_sums = np.array(boot_sums)

    # Compute stats; if every resample was skipped there is nothing to
    # summarise, so record NaN instead of failing on an empty array.
    if boot_sums.size:
        mean_sum = boot_sums.mean()
        lower_ci, upper_ci = np.percentile(boot_sums, [2.5, 97.5])
    else:
        mean_sum = lower_ci = upper_ci = np.nan

    results[lang] = {
        "mean": mean_sum,
        "ci_lower": lower_ci,
        "ci_upper": upper_ci
    }

# Convert to DataFrame for nice viewing: one row per label, label in column 'lang'.
bootstrap_df = pd.DataFrame.from_dict(results, orient="index")
bootstrap_df = bootstrap_df.reset_index(names="lang")

In [None]:
# Show the duration-weighted bootstrap mean and interval for every language.
bootstrap_df.head(50)

In [None]:
# Strip the '__label__' prefix to get the bare language code.
bootstrap_df['lang_abbr'] = [
    label.replace('__label__', '') for label in bootstrap_df['lang']
]

In [None]:
# Check the new `lang_abbr` column.
bootstrap_df.head()

In [None]:
# Invert lang_map: fasttext code → full language name
# Reverse `lang_map` (language code -> full name), leaving out entries that
# have no code.
abbr_to_full = {}
for full_name, code in lang_map.items():
    if code is None:
        continue
    abbr_to_full[code] = full_name

# Attach the human-readable language name used for the plot labels.
bootstrap_df['lang_full'] = bootstrap_df['lang_abbr'].map(abbr_to_full)


In [None]:
# Check the new `lang_full` column.
bootstrap_df.head()

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the bootstrap means, largest first, with the
# 2.5-97.5 percentile interval drawn as asymmetric error bars.
bootstrap_df_sorted = bootstrap_df.sort_values('mean', ascending=False)
y_pos = range(len(bootstrap_df_sorted))

means = bootstrap_df_sorted['mean']
err_below = means - bootstrap_df_sorted['ci_lower']
err_above = bootstrap_df_sorted['ci_upper'] - means

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(
    y=y_pos,
    width=means,
    xerr=[err_below, err_above],
    color='skyblue',
    ecolor='gray',
    capsize=4,
)
ax.set_yticks(y_pos)
ax.set_yticklabels(bootstrap_df_sorted['lang_full'])
ax.invert_yaxis()  # largest value at the top
ax.set_xlabel('Moderator count per daily video hour')
ax.set_title('YouTube')
fig.tight_layout()
plt.show()


## Estimation, grouping the Baltic state languages

In [None]:
# # Example: langs = ["lit_Latn", "hun_Latn", "pol_Latn", ...]
# results = {}

# n_bootstrap = 5000

# for lang in langs:
#     lang_abbr = lang.replace('__label__', '')
#     print(lang)
#     col_name = f"calibrated_{lang}"
#     boot_sums = []
#     mod_count = lang_to_avg_mod_count[lang_abbr]
#     # Bootstrap sampling
#     for _ in range(n_bootstrap):
#         sample = df.sample(frac=1, replace=True)  # same size as df, with replacement
#         count_estimate =  sample[col_name].sum()
#         daily_overall_count_estimate = count_estimate * (571443/7009)
#         if daily_overall_count_estimate>0:
#             normalized_mod_count = mod_count/(daily_overall_count_estimate/1000)
#             boot_sums.append(normalized_mod_count)

#     boot_sums = np.array(boot_sums)
    
#     # Compute stats
#     mean_sum = boot_sums.mean()
#     lower_ci = np.percentile(boot_sums, 2.5)
#     upper_ci = np.percentile(boot_sums, 97.5)

#     results[lang] = {
#         "mean": mean_sum,
#         "ci_lower": lower_ci,
#         "ci_upper": upper_ci
#     }

# # Convert to DataFrame for nice viewing
# bootstrap_df = pd.DataFrame.from_dict(results, orient="index")
# bootstrap_df.reset_index(names="lang", inplace=True)

In [None]:
# Display the current label list before it is replaced by the Baltic-grouped one.
langs

In [None]:
# Label list for the grouped analysis: Estonian, Latvian and Lithuanian are
# replaced by a single '__label__baltic' pseudo-label.
langs = [
    '__label__' + code
    for code in (
        'baltic',
        'bul_Cyrl', 'hrv_Latn', 'ces_Latn', 'dan_Latn', 'fin_Latn',
        'ell_Grek', 'hun_Latn', 'ron_Latn', 'slk_Latn', 'slv_Latn',
        'swe_Latn', 'nld_Latn', 'eng_Latn', 'fra_Latn', 'deu_Latn',
        'ita_Latn', 'pol_Latn', 'por_Latn', 'spa_Latn',
    )
]

In [None]:
# Display the code -> average moderator count lookup used by the bootstrap below.
lang_to_avg_mod_count

In [None]:
# Bootstrap, per language, the moderator count per thousand estimated daily
# videos, pooling Estonian, Latvian and Lithuanian under '__label__baltic'.
# `langs` holds '__label__'-prefixed labels, e.g. '__label__hun_Latn'.
results = {}

n_bootstrap = 1000
# Seeded generator: the resamples (and therefore the means and CIs) are
# identical every time this cell is re-run.
rng = np.random.default_rng(0)
# Factor that scales a sum over the sample up to a daily total.
# NOTE(review): presumably (total daily videos / number of sampled videos) - confirm.
sample_to_daily_scale = 571443/7009

for lang in langs:
    lang_abbr = lang.replace('__label__', '')
    print(lang)
    col_name = f"calibrated_{lang}"
    boot_sums = []
    # The language branch is loop-invariant, so resolve it once per language
    # rather than inside every resample.
    if lang == '__label__baltic':
        mod_count = lang_to_avg_mod_count['est_Latn'] + lang_to_avg_mod_count['lvs_Latn'] + lang_to_avg_mod_count['lit_Latn']
        # Per-video sum of the three calibrated scores; its total equals the
        # sum of the three per-column totals.
        calibrated_scores = df[[
            'calibrated___label__est_Latn',
            'calibrated___label__lit_Latn',
            'calibrated___label__lvs_Latn',
        ]].sum(axis=1)
    else:
        mod_count = lang_to_avg_mod_count[lang_abbr]
        calibrated_scores = df[col_name]
    # Bootstrap sampling
    for _ in range(n_bootstrap):
        # Same size as df, drawn with replacement.
        sample = calibrated_scores.sample(frac=1, replace=True, random_state=rng)
        count_estimate = sample.sum()
        daily_overall_count_estimate = count_estimate * sample_to_daily_scale
        # Skip resamples with no estimated videos (would divide by zero).
        if daily_overall_count_estimate>0:
            normalized_mod_count = mod_count/(daily_overall_count_estimate/1000)
            boot_sums.append(normalized_mod_count)

    boot_sums = np.array(boot_sums)

    # Compute stats; if every resample was skipped there is nothing to
    # summarise, so record NaN instead of failing on an empty array.
    if boot_sums.size:
        mean_sum = boot_sums.mean()
        lower_ci, upper_ci = np.percentile(boot_sums, [2.5, 97.5])
    else:
        mean_sum = lower_ci = upper_ci = np.nan

    results[lang] = {
        "mean": mean_sum,
        "ci_lower": lower_ci,
        "ci_upper": upper_ci
    }

# Convert to DataFrame for nice viewing: one row per label, label in column 'lang'.
bootstrap_df = pd.DataFrame.from_dict(results, orient="index")
bootstrap_df = bootstrap_df.reset_index(names="lang")

In [None]:
# Show the Baltic-grouped bootstrap mean and interval for every label.
bootstrap_df.head(50)

In [None]:
# Strip the '__label__' prefix to get the bare language code.
bootstrap_df['lang_abbr'] = [
    label.replace('__label__', '') for label in bootstrap_df['lang']
]

In [None]:
# Check the new `lang_abbr` column.
bootstrap_df.head()

In [None]:
# Give the pooled 'baltic' code a display name so it gets a `lang_full` label
# when the mapping is inverted in the next cell.
lang_map['Baltic State Languages'] = 'baltic'

In [None]:
# Invert lang_map: fasttext code → full language name
# Reverse `lang_map` (language code -> full name), leaving out entries that
# have no code.
abbr_to_full = {}
for full_name, code in lang_map.items():
    if code is None:
        continue
    abbr_to_full[code] = full_name

# Attach the human-readable language name used for the plot labels.
bootstrap_df['lang_full'] = bootstrap_df['lang_abbr'].map(abbr_to_full)


In [None]:
# Check the new `lang_full` column.
bootstrap_df.head()

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the bootstrap means, largest first, with the
# 2.5-97.5 percentile interval drawn as asymmetric error bars.
bootstrap_df_sorted = bootstrap_df.sort_values('mean', ascending=False)
y_pos = range(len(bootstrap_df_sorted))

means = bootstrap_df_sorted['mean']
err_below = means - bootstrap_df_sorted['ci_lower']
err_above = bootstrap_df_sorted['ci_upper'] - means

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(
    y=y_pos,
    width=means,
    xerr=[err_below, err_above],
    color='skyblue',
    ecolor='gray',
    capsize=4,
)
ax.set_yticks(y_pos)
ax.set_yticklabels(bootstrap_df_sorted['lang_full'])
ax.invert_yaxis()  # largest value at the top
ax.set_xlabel('Moderator count per thousand daily videos')
ax.set_title('YouTube')
fig.tight_layout()
plt.show()


In [None]:
# Save the current `bootstrap_df` (at this point the Baltic-grouped,
# per-thousand-videos results) as the bar-plot data, without the index column.
bootstrap_df.to_csv('/home/manuel/Documents/oii/moderation_workforce/data/plots/plot_data/bar_plot_normalized_counts_youtube_latest.csv',
                 index=False)

## produce data for scatter plot

In [None]:
# Back to the per-language label list (no Baltic grouping) for the scatter-plot data.
langs = [
    '__label__' + code
    for code in (
        'bul_Cyrl', 'hrv_Latn', 'ces_Latn', 'dan_Latn', 'est_Latn',
        'fin_Latn', 'ell_Grek', 'hun_Latn', 'lvs_Latn', 'lit_Latn',
        'ron_Latn', 'slk_Latn', 'slv_Latn', 'swe_Latn', 'nld_Latn',
        'eng_Latn', 'fra_Latn', 'deu_Latn', 'ita_Latn', 'pol_Latn',
        'por_Latn', 'spa_Latn',
    )
]

In [None]:
# Reminder of the moderator-count table the scatter-plot data is built from.
mod_count_df.head()

In [None]:
# Start the column as all-None; the loop below fills it in only for languages
# that have a calibrated score column in `df`.
mod_count_df['avg_daily_video_count'] = [None]*mod_count_df.shape[0]

In [None]:
# Confirm the new, still empty, `avg_daily_video_count` column.
mod_count_df.head(50)

In [None]:
# Fill `avg_daily_video_count` for every language that has a calibrated score
# column in `df`; other rows keep their existing value.
# Rows are addressed by index label through `.loc`. The previous chained form
# `mod_count_df[col][i] = value` can write to a temporary copy and is a silent
# no-op under pandas Copy-on-Write.
for i in mod_count_df.index:
    lang = mod_count_df.loc[i, 'lang']
    col_name = f"calibrated___label__{lang}"
    if col_name in df:
        video_count_sample = df[col_name].sum()
        # NOTE(review): 571443/7009 presumably scales the sample up to a
        # daily total - confirm.
        daily_video_count_avg_total = video_count_sample * (571443/7009)
        mod_count_df.loc[i, 'avg_daily_video_count'] = daily_video_count_avg_total

In [None]:
# Keep only the languages for which a daily video count could be estimated.
has_video_estimate = mod_count_df['avg_daily_video_count'].notna()
mod_count_df = mod_count_df.loc[has_video_estimate]

In [None]:
# Display the final scatter-plot table.
mod_count_df

In [None]:
import matplotlib.pyplot as plt

# Log-log scatter of average moderator count against estimated daily video
# count, one point per language, labelled with its language code.
fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(mod_count_df["avg_daily_video_count"], mod_count_df["avg_mod_count"], s=60)

# Write each language code just above its point.
for _, row in mod_count_df.iterrows():
    ax.text(
        row["avg_daily_video_count"],
        row["avg_mod_count"],
        row["lang"],
        fontsize=9,
        ha='center',
        va='bottom',
    )

# Both axes are logarithmic.
ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xlabel("Average Daily Video Count (log scale)")
ax.set_ylabel("Average Moderator Count (log scale)")
ax.set_title("Moderators vs Daily Videos by Language (Log-Log Plot)")

fig.tight_layout()
plt.show()


In [None]:
# Save the scatter-plot table (languages with a video-count estimate only),
# without the index column.
mod_count_df.to_csv('/home/manuel/Documents/oii/moderation_workforce/data/plots/plot_data/scatter_plot_data_youtube.csv',
                   index=False)