In [None]:
import pandas as pd

## Twitter/X

In [None]:
# Read the Twitter/X table from its normalized CSV and preview the first rows.
twitter_csv_path = '/home/manuel/Documents/oii/moderation_workforce/data/twitter/normalized_data.csv'
twitter_df = pd.read_csv(twitter_csv_path)
twitter_df.head()

In [None]:
# Show the column labels of the loaded table.
twitter_df.columns

In [None]:
# Keep only the language label and the four reporting-period columns.
kept_columns = [
    'Primary Language',
    'aug23_oct23',
    'oct23_march24',
    'apr24_sep24_primary',
    'apr24_sep24_secondary',
]
twitter_df = twitter_df[kept_columns]

In [None]:
# Preview the first rows of the reduced frame.
twitter_df.head()

In [None]:
from pathlib import Path
# Directory holding the fastText language-count parquet file(s).
path_data = '/home/manuel/Documents/oii/just_another_day/data/lang_detect_fasttext_count'

# Path.glob yields files in arbitrary order, so sort them to get a reproducible
# row order. ignore_index=True gives the combined frame a unique 0..n-1 index
# instead of repeating every file's own index labels.
fasttext_df = pd.concat(
    [pd.read_parquet(path) for path in sorted(Path(path_data).glob('*.parquet'))],
    ignore_index=True,
)
fasttext_df.head()

In [None]:
# Positional rename: the frame must hold exactly three columns, taken in this
# order (pandas raises on a length mismatch).
# NOTE(review): assumes the parquet column order is language, count, share — confirm.
fasttext_columns = ['tweet_lang_fasttext', 'post_count', 'share']
fasttext_df.columns = fasttext_columns

In [None]:
# fastText language codes (`<language>_<Script>`) -> English language names,
# spelled the way they are matched against the "Primary Language" column.
lang_map = {
    "arb_Arab": "Arabic",
    "bul_Cyrl": "Bulgarian",
    "hrv_Latn": "Croatian",
    "ces_Latn": "Czech",
    "dan_Latn": "Danish",
    "nld_Latn": "Dutch",
    "eng_Latn": "English",
    "est_Latn": "Estonian",
    "fin_Latn": "Finnish",
    "fra_Latn": "French",
    "deu_Latn": "German",
    "ell_Grek": "Greek",
    "heb_Hebr": "Hebrew",
    "hun_Latn": "Hungarian",
    "gle_Latn": "Irish",
    "ita_Latn": "Italian",
    "lav_Latn": "Latvian",
    "lit_Latn": "Lithuanian",
    "mlt_Latn": "Maltese",
    "pol_Latn": "Polish",
    "por_Latn": "Portuguese",
    "ron_Latn": "Romanian",
    "slk_Latn": "Slovak",
    "slv_Latn": "Slovenian",
    "spa_Latn": "Spanish",
    "swe_Latn": "Swedish",
}


In [None]:
# Translate each fastText code into its language name; codes that have no
# entry in lang_map come out as NaN.
primary_language = fasttext_df["tweet_lang_fasttext"].map(lang_map)
fasttext_df["Primary Language"] = primary_language
fasttext_df.head()

In [None]:
# Inner-join the moderator table with the fastText post counts on the language
# name, keeping only languages present in both frames.
#
# pandas matches null join keys against each other, so fastText rows whose code
# has no language name (NaN "Primary Language") are dropped first; otherwise
# they would all attach to any null-keyed row of twitter_df.
#
# NOTE: this cell rebinds twitter_df. Re-running it without re-running the
# cells above merges the already merged frame again and produces _x/_y columns.
twitter_df = twitter_df.merge(
    fasttext_df.dropna(subset=['Primary Language']),
    on=['Primary Language'],
)

In [None]:
# Inspect up to 50 rows of the merged frame.
twitter_df.head(50)

In [None]:
# Row-wise mean over three reporting-period columns, skipping missing values.
# 'apr24_sep24_secondary' is not part of the average.
moderator_period_columns = ["aug23_oct23", "oct23_march24", "apr24_sep24_primary"]
period_counts = twitter_df[moderator_period_columns]
twitter_df["avg_moderator_count"] = period_counts.mean(axis="columns", skipna=True)

In [None]:
# Preview the frame, now including the avg_moderator_count column.
twitter_df.head()

In [None]:
# fastText code -> average moderator count, as a plain dict.
avg_count_by_code = twitter_df.set_index("tweet_lang_fasttext")["avg_moderator_count"]
lang_mod_count_dict = avg_count_by_code.to_dict()
lang_mod_count_dict

In [None]:
import os
import pickle

def load_list(
    lang,
    base_dir='/home/manuel/Documents/oii/moderation_workforce/data/calibration_fasttext/twitter/bootstrap_samples',
):
    """Concatenate every pickled list stored for one language.

    Parameters
    ----------
    lang : str
        Name of the per-language sub-directory inside ``base_dir``.
    base_dir : str or os.PathLike, optional
        Directory that holds one sub-directory per language. Defaults to the
        location that used to be hard-coded in this function.

    Returns
    -------
    list
        The elements of all ``.pkl`` files in ``base_dir/lang`` that contain a
        list, concatenated in sorted filename order. Files holding anything
        else are reported with a warning and skipped.

    Raises
    ------
    FileNotFoundError
        If ``base_dir/lang`` does not exist (raised by ``os.listdir``).
    """
    directory = os.path.join(os.fspath(base_dir), lang)

    combined_list = []

    # os.listdir returns entries in arbitrary order; sorting keeps the combined
    # list identical from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.pkl'):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only point this
            # function at directories whose contents you produced yourself.
            data = pickle.load(f)
        if isinstance(data, list):
            combined_list.extend(data)
        else:
            print(f"Warning: {filename} does not contain a list.")

    print(f"Combined list contains {len(combined_list)} elements.")
    return combined_list

In [None]:
# fastText code -> list of values loaded from that language's pickle files.
lang_post_count_dict = {}

In [None]:
# Load the pickled samples for every language whose average moderator count is
# positive. Zero and NaN averages fail the `> 0` comparison and are skipped.
for fast_text_lang in lang_mod_count_dict:
    print(fast_text_lang)
    if not lang_mod_count_dict[fast_text_lang] > 0:
        continue
    code_prefix = fast_text_lang.split('_')[0]
    # The sample directory is named after the code prefix, with two exceptions:
    # English lives under 'eng_new' and Arabic under its full fastText code.
    directory_overrides = {'eng': 'eng_new', 'arb': fast_text_lang}
    sample_dir = directory_overrides.get(code_prefix, code_prefix)
    lang_post_count_dict[fast_text_lang] = load_list(sample_dir)
            

In [None]:
# fastText code -> list of moderator counts per million posts.
final_dict = {}

In [None]:
# For every loaded post count, express the language's average moderator count
# per one million posts.
for fast_text_lang, post_counts in lang_post_count_dict.items():
    moderator_count = lang_mod_count_dict[fast_text_lang]
    final_dict[fast_text_lang] = [
        moderator_count / (post_count / 1000000) for post_count in post_counts
    ]

In [None]:
# Display the per-language lists of normalised values.
final_dict

In [None]:
# NOTE(review): ad-hoc arithmetic with hard-coded numbers — presumably a manual
# sanity check of one language's moderators-per-million value; confirm the
# source of 32.5 and 1.6, or remove this cell.
32.5/1.6

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# One summary row per language: the mean of its values plus the 2.5th and
# 97.5th percentiles as the bounds of a 95% interval.
summary_data = [
    {
        "Language": code,
        "Mean": samples.mean(),
        "CI_lower": np.percentile(samples, 2.5),
        "CI_upper": np.percentile(samples, 97.5),
    }
    for code, samples in (
        (lang, np.array(values)) for lang, values in final_dict.items()
    )
]

# Sort by mean in ascending order (smallest mean in row 0) and renumber the rows.
summary_df = (
    pd.DataFrame(summary_data)
    .sort_values(by="Mean", ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Show up to 100 rows of the summary table.
summary_df.head(100)

In [None]:
# NOTE(review): same expression as the preceding cell — looks redundant;
# confirm and drop one of the two.
summary_df.head(100)

In [None]:
# Mapping from fastText codes to full names.
# NOTE(review): these 12 entries repeat values already present in lang_map;
# consider reusing lang_map so the two tables cannot drift apart.
lang_code_to_name = {
    "pol_Latn": "Polish",
    "nld_Latn": "Dutch",
    "ita_Latn": "Italian",
    "spa_Latn": "Spanish",
    "arb_Arab": "Arabic",
    "hrv_Latn": "Croatian",
    "por_Latn": "Portuguese",
    "heb_Hebr": "Hebrew",
    "fra_Latn": "French",
    "eng_Latn": "English",
    "deu_Latn": "German",
    "bul_Cyrl": "Bulgarian"
}

# Replace language codes with full names. A value without an entry in the
# mapping keeps its current value instead of becoming NaN: languages missing
# from the table stay identifiable by their code, and re-running this cell
# leaves already translated names untouched.
language_names = summary_df["Language"].map(lang_code_to_name)
summary_df["Language"] = language_names.fillna(summary_df["Language"])

In [None]:


# Horizontal bar chart of the per-language means, with error bars reaching
# from CI_lower to CI_upper.
fontsize = 20

# Bar positions, bar lengths and the error-bar extents
y_pos = np.arange(len(summary_df))
means = summary_df["Mean"]
# xerr expects distances from the bar end: [below the mean, above the mean]
errors = [
    means.sub(summary_df["CI_lower"]),
    summary_df["CI_upper"].sub(means),
]

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(
    y_pos,
    means,
    xerr=errors,
    align='center',
    color='skyblue',
    ecolor='black',
    capsize=4,
)

# One tick per bar, labelled with the language
ax.set_yticks(y_pos)
ax.set_yticklabels(summary_df["Language"], fontsize=fontsize)

# Title and axis labels
ax.set_title("Twitter/X", fontweight='bold', fontsize=fontsize + 2)
ax.set_xlabel("Moderator count per million daily posts", fontsize=fontsize)
ax.set_ylabel("Language", fontsize=fontsize)

# Same tick-label size on both axes
ax.tick_params(axis='both', labelsize=fontsize)
fig.tight_layout()
plt.show()


In [None]:
# Write the plotted summary table to CSV, without the row index.
plot_data_csv = '/home/manuel/Documents/oii/moderation_workforce/data/plots/plot_data/bar_plot_normalized_counts_twitter_latest.csv'
summary_df.to_csv(plot_data_csv, index=False)