In [1]:
import spacy
import json
from spacy.lang.en import STOP_WORDS as en_stop_words
from spacy.lang.fr import STOP_WORDS as fr_stop_words
from nltk.corpus import stopwords

# English: merge spaCy's stop words with NLTK's list into one lookup set.
en_stop_nltk = set(stopwords.words('english'))
en_stops = set(en_stop_words) | en_stop_nltk

# French: same merge of the spaCy and NLTK lists.
fr_stops_nltk = set(stopwords.words('french'))
fr_stops = set(fr_stop_words) | fr_stops_nltk

In [12]:
# Extra English tokens to ignore: the word "chapter" plus the lowercase Roman
# numerals i..xxxviii (1-38) -- presumably chapter numbering in the corpus.
roman_units = ("", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix")
roman_chapter_numbers = [
    # Below 40 a numeral is just one "x" per ten followed by the units part.
    "x" * (number // 10) + roman_units[number % 10]
    for number in range(1, 39)
]
en_stops.update(["chapter", *roman_chapter_numbers])

In [6]:
# Extra French stop words.
#
# Elided forms are written ONCE, with the ASCII apostrophe ('), and the
# typographic-apostrophe (U+2019) variant is generated from them below.
# Maintaining the two spellings by hand had left many forms registered with
# only one apostrophe style (e.g. "j'en", "d'accord", "qu'ils" were missing
# their ASCII variant).
fr_elided_forms = [
    # Contractions with j'
    "j'ai", "j'avais", "j'aurai", "j'aurais", "j'étais", "j'aie", "j'en", "j'y",

    # Contractions with c'
    "c'est", "c'était", "c'étaient", "c'est-à-dire",

    # Contractions with d'
    "d'être", "d'avoir", "d'un", "d'une", "d'autres", "d'ici", "d'accord",

    # Contractions with l'
    "l'ai", "l'avais", "l'on", "l'un", "l'une", "l'autre", "l'était", "l'avait",

    # Contractions with m'
    "m'a", "m'avait", "m'as", "m'en", "m'y",

    # Contractions with n'
    "n'ai", "n'a", "n'avait", "n'est", "n'était", "n'en", "n'y",

    # Contractions with s'
    "s'est", "s'était", "s'en", "s'y",

    # Contractions with t'
    "t'ai", "t'as", "t'a", "t'en", "t'y",

    # Contractions with qu'
    "qu'il", "qu'elle", "qu'on", "qu'un", "qu'une", "qu'ils", "qu'elles",
    "qu'y", "qu'en", "qu'est-ce",

    # Indefinite pronouns with an elision
    "quelqu'un", "quelqu'une",
]

# Register both apostrophe spellings of every elided form.
fr_stops.update(fr_elided_forms)
fr_stops.update(form.replace("'", "\u2019") for form in fr_elided_forms)

fr_stops.update([
    # Misc compound or pronoun forms
    "peut-être", "vis-à-vis", "au-dessus", "au-dessous",
    "chacun", "chacune", "aucun", "aucune", "nul", "nulle",

    # Verb forms (être, avoir)
    "suis", "es", "est", "sommes", "êtes", "sont",
    "étais", "était", "étions", "étiez", "étaient",
    "ai", "as", "a", "avons", "avez", "ont",

    # Pronouns and determiners
    "celui-ci", "celui-là", "celle-ci", "celle-là",
    "ceux-ci", "ceux-là", "celles-ci", "celles-là",

    # Special tokens. The filtering cell looks words up with
    # `fr_word.lower()`, so the lower-cased spellings are required for these
    # to ever match; the upper-case ones are kept for exact-match lookups.
    "-NULL-", "-UNK-",
    "-null-", "-unk-",
])

In [13]:
# Load the word-pair table. The encoding is given explicitly because the file
# holds accented French text and the platform default is not always UTF-8.
with open('word_pairs/IBM2/word_pairs_epoch_15.json', 'r', encoding='utf-8') as f:
    word_pairs = json.load(f)

# Keep only informative pairs:
#   * drop English entries whose (lower-cased) key is a stop word;
#   * within an entry, drop French translations that are stop words or whose
#     "total_count" is not greater than 1 (a missing count is treated as 0);
#   * drop English entries left with no translations at all.
filtered_pairs = {}

for en_word, stats in word_pairs.items():

    if en_word.lower() in en_stops:
        continue

    filtered_translations = {}
    for fr_word, fr_stats in stats['french_translations'].items():
        if (fr_word.lower() not in fr_stops) and (fr_stats.get("total_count", 0) > 1):
            filtered_translations[fr_word] = fr_stats
    if filtered_translations:
        filtered_pairs[en_word] = {
            # Number of distinct French translations that survived filtering.
            'total_translation_count': len(filtered_translations),
            'french_translations': filtered_translations
            }

total_en_after = len(filtered_pairs)
print(f"Total English words after filtering: {total_en_after}")

# ensure_ascii=False writes accented characters verbatim, so the output file
# must be opened as UTF-8 or the write can fail on non-UTF-8 locales.
with open('word_pairs_filtered.json', 'w', encoding='utf-8') as f:
    json.dump(filtered_pairs, f, indent=2, ensure_ascii=False)


Total English words after filtering: 7262
