In [None]:
import os, re, json
import pandas as pd
import spacy
from spacy.cli import download as spacy_download
from tqdm.notebook import tqdm
import plotly.graph_objects as go
from transformers import AutoTokenizer
import pickle
import metrics

# Put all JSON files inside the languages_json folder and run the notebook to
# generate the pickle object, then paste it into plot_charts.py.
# The default is the original hard-coded location; set the LANG_FOLDER
# environment variable to use a different directory without editing this cell.
LANG_FOLDER = os.environ.get("LANG_FOLDER", "/content/languages_json")

In [2]:
# Language code (as it appears in the "_<code>.json" filename suffix)
# -> (display name used as the result key, spaCy model to load for that language).
LANG_CONFIG = {
    "en": ("English", "en_core_web_sm"),
    "de": ("German", "de_core_news_sm"),
    "es": ("Spanish", "es_core_news_sm"),
    "fr": ("French", "fr_core_news_sm"),
    "it": ("Italian", "it_core_news_sm"),
    "nl": ("Dutch", "nl_core_news_sm"),
    "pt": ("Portuguese", "pt_core_news_sm"),
    "ro": ("Romanian", "ro_core_news_sm"),
    "ja": ("Japanese", "ja_core_news_sm"),
    "zh": ("Chinese", "zh_core_web_sm"),
}

In [6]:
def ensure_spacy(model_name):
    """Load the spaCy pipeline `model_name`, downloading it on demand.

    The pipeline is loaded with `spacy.load`. If that raises `OSError`, the
    model is fetched with `spacy.cli.download` and the load is attempted once
    more; a failure of that second attempt propagates to the caller.

    Args:
        model_name: Name of the spaCy model package, e.g. "en_core_web_sm".

    Returns:
        Whatever `spacy.load(model_name)` returns.
    """
    try:
        pipeline = spacy.load(model_name)
    except OSError:
        # First load failed: download the model, then retry exactly once.
        spacy_download(model_name)
        pipeline = spacy.load(model_name)
    return pipeline

def load_languages(folder_path, lang_config):
    """Load each recognised language JSON file in `folder_path` plus its spaCy pipeline.

    A file is used when its name ends in ``.json``, matches ``_<code>.json``
    (letters only; the code is lower-cased) and ``<code>`` is a key of
    `lang_config`. Every other directory entry is skipped silently.

    Files are visited in sorted filename order so the key order of the
    returned dicts is reproducible; when several files map to the same
    language, the one that sorts last provides the data.

    Args:
        folder_path: Directory to scan (not recursive).
        lang_config: Mapping of language code -> (language name, spaCy model name).

    Returns:
        A ``(data_by_lang, nlp_by_lang)`` tuple of dicts keyed by language
        name: the parsed JSON content, and the pipeline from `ensure_spacy`.

    Raises:
        OSError: If `folder_path` cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If a selected file does not contain valid JSON.
    """
    data_by_lang = {}
    nlp_by_lang = {}
    # Pipelines already loaded in this call, keyed by model name, so a model
    # is not loaded again when more than one file needs it.
    loaded_models = {}

    # os.listdir() yields entries in arbitrary order; sorting makes both the
    # resulting dict order and the duplicate-file winner deterministic.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        match = re.search(r"_([A-Za-z]+)\.json$", filename)
        if not match:
            continue

        code = match.group(1).lower()
        if code not in lang_config:
            continue

        lang_name, model_name = lang_config[code]

        filepath = os.path.join(folder_path, filename)
        with open(filepath, "r", encoding="utf-8") as json_file:
            data_by_lang[lang_name] = json.load(json_file)

        if model_name not in loaded_models:
            loaded_models[model_name] = ensure_spacy(model_name)
        nlp_by_lang[lang_name] = loaded_models[model_name]

    return data_by_lang, nlp_by_lang

def collect_pronouns_all(lang_data, nlp_by_lang, verbose=False):
    """Run `metrics.pronouns` at every level for each language.

    For each language in `lang_data` (with a tqdm progress bar),
    `metrics.pronouns` is called four times, with `level` set to "lang",
    "doc", "page" and "sentence" in that order.

    Args:
        lang_data: Mapping of language name -> data passed to `metrics.pronouns`.
        nlp_by_lang: Mapping of language name -> pipeline passed to
            `metrics.pronouns`; must contain every key of `lang_data`.
        verbose: When true, print the language name, the "lang" result and
            `.head()` of the other three results (presumably DataFrames;
            confirm against `metrics.pronouns`).

    Returns:
        Dict of language name -> {"lang", "doc", "page", "sentence"} results.
    """
    levels = ("lang", "doc", "page", "sentence")
    results = {}

    for lang_name, data in tqdm(lang_data.items(), desc="Processing languages"):
        nlp = nlp_by_lang[lang_name]

        # Dict comprehension keeps the original call order and key order.
        per_level = {
            level: metrics.pronouns(data, nlp, level=level)
            for level in levels
        }

        if verbose:
            print(f"\n{lang_name}")
            print(per_level["lang"], "\n")
            # Only the finer-grained results are previewed through .head().
            for level in levels[1:]:
                print(per_level[level].head(), "\n")

        results[lang_name] = per_level

    return results

In [5]:
# Read every configured language file from LANG_FOLDER and load its spaCy pipeline.
lang_data, nlp_by_lang = load_languages(
    folder_path=LANG_FOLDER,
    lang_config=LANG_CONFIG,
)

In [None]:
# Compute the pronoun metrics at all four levels for every loaded language.
pronouns_by_lang = collect_pronouns_all(
    lang_data=lang_data,
    nlp_by_lang=nlp_by_lang,
    verbose=False,
)

In [9]:
# Persist the per-language results to pronouns_by_lang.pkl in the working directory.
with open("pronouns_by_lang.pkl", mode="wb") as f:
    pickle.dump(pronouns_by_lang, file=f)