# Lemmatize Text

In [1]:
import spacy
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
from spacy import displacy

l2 = FrenchLefffLemmatizer()

In [5]:
french_text_origin_path = "./french_text.txt"
french_lemmatized_path = "./french_lem.txt"

In [112]:
file = open(french_text_origin_path, mode='r', encoding="cp1252")
original = file.readlines()
file.close()

original_corrected = []
for para in original:
    original_corrected.append(para.replace('\"', ' \" '))

In [111]:
nlp = spacy.load('fr', n_threads=4)
#nlp.remove_pipe('parser')
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.EntityRecognizer at 0x7f6394147728>)

In [182]:
pos_mapping = {
    "SCONJ": 'csu',
    "PRON": 'pro',
    "VERB": 'v',
    "NUM": "n",
    "NOUN": "n",
    "AUX": "auxAvoir",
    "PUNCT": "ponctw",
    "DET": "det",
    "ADJ": "adj",
    "ADP": "prep",
    "PROPN": "np",
    "ADV": "adv",
    'SPACE': ' ',
    'CCONJ': 'cc',
    '': '',
    'INTJ': 'intj',
    'PART': '',
    'X': ''
}

def lem_selector(text, pos_, dep_, doc):
    lem = l2.lemmatize(text, pos_mapping[pos_])
    if pos_ == 'PUNCT':
        res = '__punctuation__'
    elif isinstance(lem, str):
        res = lem + '_' + pos_mapping[token.pos_]
    elif isinstance(lem, tuple):
        res = lem[0] + '_' + lem[1]
    elif len(lem) == 0:
        return '\n'
    elif len(lem) == 1:
        res = lem[0][0] + '_' + lem[0][1]
    elif pos_ == 'PRON':
        res = '__PRONOUN__'
    elif pos_ == 'AUX' or dep_ == 'aux':
        res = '__AUXILARY__'
    elif pos_ == 'CCONJ':
        res = '__CCONJ__'
    elif pos_ == 'ADV':
        return lem_selector(text, 'VERB', dep_, doc)
    else:
        res = '__UNRESOLVED__' 
        #print(text, pos_, dep_, lem, doc)
    
    return res + '__origin' + text

In [184]:
output_file = open(french_lemmatized_path, mode='w+')
for para in original_corrected:
    doc = nlp(para)
    res = []
    for token in doc:
        res.append(lem_selector(token.text, token.pos_, token.dep_, doc.text))
    #print(' '.join(res))    
    output_file.write(' '.join(res))
    
output_file.close()

# Obtain Vocabulary - LPP

In [6]:
file = open(french_lemmatized_path, mode='r')
original = file.readlines()
file.close() 

In [7]:
vocabulary = {}

def vocabulary_updater(word_pos, vocabulary=vocabulary):
    if word_pos in vocabulary:
        vocabulary[word_pos] += 1
    else:
        vocabulary[word_pos] = 1

In [8]:
for para in original:
    data = para.strip().split(' ')
    for token in data:
        word_parse = token.split('__origin')
        vocabulary_updater(word_parse[0])

In [9]:
word_list = [k for k in sorted(vocabulary, key=vocabulary.get, reverse=True)]

In [202]:
overwrite_check = False

if overwrite_check:
    output_file = open('vocabulary_pos_lpp.txt', mode='w+')

    for word in word_list:
        output_file.write(word + ';' + str(vocabulary[word]) + '\n');

    output_file.close()

# Vocabulary Intersection with WOLF

In [10]:
file = open('./vocabulary_pos_wolf.txt')
vocabulary_wolf = file.read()
vocabulary_wolf = set(vocabulary_wolf.split('\n'))

In [11]:
vocabulary_lpp = set(vocabulary.keys())

In [12]:
freq_accounted = 0
for word in vocabulary_lpp.intersection(vocabulary_wolf):
    freq_accounted += vocabulary[word]

In [13]:
freq_unaccounted = 0
for word in vocabulary_lpp.difference(vocabulary_wolf):
    freq_unaccounted += vocabulary[word]

In [14]:
freq_accounted, freq_unaccounted

(4408, 14869)

## LPP POS Tag Analyse

In [15]:
lpp_pos_dict = dict()

for entry in vocabulary_lpp:
    try:
        word, pos = entry.rsplit('_', 1)
    except:
        print('Error', entry)    
        continue
    if pos not in lpp_pos_dict:
        lpp_pos_dict[pos] = set()
        lpp_pos_dict[pos].add(word)
    else:
        lpp_pos_dict[pos].add(word)

Error 
Error États-Unis


In [16]:
lpp_pos_dict.keys()

dict_keys(['v', 'nc', 'n', 'adj', 'np', 'adv', 'clneg', 'prep', 'pro', 'coo', 'det', 'csu', 'auxAvoir', '', 'advneg', 'prel', 'cln'])

In [17]:
pos_wolf_map = {
    "adv": "b",
    "adj": "a",
    "np": "n",
    "nc": "n",
    "n": "n",
    "v": 'v',
}

functional_pos_set = {'prep', 'coo', 'auxAvoir', 'det', 'clneg', 
                      'csu', 'pro', 'advneg', 'cln', 'prel'}

def fuzzy_match(word_included, vocabulary_including):
    if word_included[:2] == "__":
        return '__GIVEN_FUNCTIONAL__'
    try:
        word, pos = word_included.rsplit('_', 1)
    except:
        print('Error splitting', word_included)
        return False
    if pos in functional_pos_set:
        return '__MATCHED_FUNCTIONAL__'
    else:
        try:
            pos = pos_wolf_map[pos]
        except:
            print('Unknown pos tag', pos, word)
        if word in vocabulary_including:
            return word
        else:
            word = word.lower() + '_' + pos
            if word in vocabulary_including:
                return word
            else:
                print('Unable to match', word_included)
                return False

In [18]:
to_match = [k for k in sorted(vocabulary_lpp.difference(vocabulary_wolf), key=vocabulary.get, reverse=True)]
unmatched_1 = set()
match_1 = dict()
for word in to_match:
    #print(word, fuzzy_match(word, vocabulary_wolf))
    res = fuzzy_match(word, vocabulary_wolf)
    if not res:
        unmatched_1.add(word)
    else:
        match_1[word] = res

Error splitting 
Unable to match falloir_v
Unable to match allumeur_n
Unable to match toi_n
Unable to match Ah_n
Unable to match bonhomme_n
Unable to match Pourquoi_adv
Unable to match t_n
Unable to match yeux_n
Unable to match attendre_v
Unable to match personne_v
Unable to match voilà_v
Unable to match pourquoi_adv
Unable to match vaniteux_n
Unable to match Celui-là_n
Unable to match moutons_n
Unable to match aurais_n
Unable to match ramoner_v
Unable to match géographe_v
Unable to match suffit_n
Unable to match conter_adv
Unable to match Parce_np
Unable to match seras_n
Unable to match muselière_n
Unable to match hem_n
Unable to match Lui_np
Unable to match qu'_adv
Unable to match plaire_v
Unable to match comprendre_adj
Unable to match 612_n
Unable to match Comment_adv
Unable to match voici_v
Unable to match drôle_v
Unable to match oeil_n
Unable to match grelot_n
Unable to match éteint_n
Unable to match tas_n
Unable to match demanda_n
Unable to match mienne_n
Unable to match stupéfai

In [19]:
for word in match_1:
    freq_accounted += vocabulary[word]
    freq_unaccounted -= vocabulary[word]

In [20]:
freq_accounted, freq_unaccounted

(17687, 1590)

In [21]:
word = ''
unmatched_1.discard(word)
freq_accounted += vocabulary[word]
freq_unaccounted -= vocabulary[word]

numbers_list = ['11_n', '15_n', '16_n', '17_n', '18_n', '1909_n',
 '1920_n', '20_n', '22_n', '24_n', '25_n', '26_n', '27_n', '3251_n',
 '325_n', '326_n', '327_n', '328_n', '329_n', '330_n', '612_n',
 '6_n', '7_n', '9_n']
for word in numbers_list:
    unmatched_1.discard(word)
    freq_accounted += vocabulary[word]
    freq_unaccounted -= vocabulary[word]

In [22]:
freq_accounted, freq_unaccounted

(18517, 760)

In [37]:
built_vocabulary_lpp = vocabulary_lpp.intersection(vocabulary_wolf).union(match_1.keys())
built_vocabulary = vocabulary_lpp.intersection(vocabulary_wolf).union(match_1.values())

In [38]:
len(unmatched_1), len(built_vocabulary), len(built_vocabulary_lpp)

(438, 1283, 1395)

In [27]:
manual_match = {
    'pleines_n': 'plein_a', 'possèdes_n': 'posséder_v', 'pourquoi_adv': "pourquoi_n",
    'proie_v': 'proie_n', 'promène_n': 'promener_v', 'puissant_v': 'puissant_a', 'pâle_v': "pâle_a",
    "qu'_adv": "__MANUAL_REJ__", 'quarante-trois_n': "__MANUAL_REJ__", 'ralluma_n': 'rallumer_v', 'rallumer_n': 'rallumer_v', 'ramena_n': 'ramener_v', 'rapporte_n': 'rapporter_v', 'rare_v': 'rare_a', 'rassura_n': 'rassurer_v', 'ravissante_n': "ravissant_a", 'rayonnement_v': 'rayonnement_n', 'recommença_n': 'recommencer_v', 'regarda_n': 'regarder_v', 'regarde_n': 'regarder_v', 'regarder_adv': 'regarder_v', 'regarderas_n': 'regarder_v', 'regrettes_n': 'regretter_v', 'rejoindre_n': 'rejoindre_v',
    'remord_n': 'remords_n', 'rends_n': 'rendre_v', 'respectueusement_v': 'respectueusement_b', 'retournais_n': 'retourner_v',
    'reviendras_n': 'revenir_v', 'revolver_v': 'revolver_n', 'riche_v': 'riche_a', 'ronds_v': 'rond_a', 'roser_v': 'rose_n',
    'réjouir_n': 'réjouir_v', 'répondis_n': 'répondre_v', 'répondu_adj': 'répondre_v', 'réussir_adv': 'réussir_v', 'réussir_n': 'réussir_v',
    'réussis_n': 'réussir_v', 'réveille_n': 'se_réveiller_v', 'révélé_n': 'révéler_v',
    'salée_n': 'salé_a', 'sept._n': "__MANUAL_REJ__", 'seras_n': 'être_v', 'serrais_n': 'serrer_v', 'servir_n': "servir_v", 'si_adv': 'tellement_b',
    'sire_n': 'sir_n', 'soixante-douze_n': "__MANUAL_REJ__", 'soi£_n': 'soif_n',
    'sorti_adj': 'sortir_v', 'stupéfait_v': 'stupéfait_a', 'su_n': 'savoir_v',
    'suffit_n': 'suffire_v', "t'_adv": "__MANUAL_REJ__", "t'_v": "__MANUAL_REJ__",
    't_n': "__MANUAL_REJ__", 'tas_n': 'ta_n', 'tellement_n': "tellement_b", 'tellement_v': 'tellement_b', 'terribles_n': 'terrible_a', 'terribles_v': "terrible_a", 'tien_n': 'tenir_v', 'tienne_n': "__MANUAL_REJ__", 'tiennes_n': "__MANUAL_REJ__", 'timidement_n': 'timidement_b', 'toi_n': "__MANUAL_REJ__", 'tombé_adj': 'tomber_v', 'ton_adv': "__MANUAL_REJ__", 'tranquille_n': 'tranquille_a', 'travaux_n': 'travail_n', 'trouva_n': 'trouver_v', 'trouves_n': 'trouver_v', 'tu_adj': "__MANUAL_REJ__",
    'tut_n': 'se_taire_v', 'vaniteux_n': 'vaniteux_a', 'vends_n': 'vendre_v', 'vent_v': 'vent_n', 'verras_n': 'voir_v', 'vilain_adj': 'vilain_n', 'vint_n': 'venir_v', 'voici_v': 'voici_b', 'voilà_v': 'voici_b', 'vole_n': 'voler_v', 'vu_adj': 'voir_v', 'vît_n': 'voir_v', 'yeux_n': 'œil_n', 'États-Unis': "États-Unis_d'Amérique_n",
    'énormes_n': 'énorme_a', 'éphémères_v': 'éphémère_a', 'éteins_n': 'éteindre_v', 'éteint_n': 'éteindre_v',
    'éteints_n': 'éteindre_v', 'étire_n': 'étirer_v', 'étrange_n': 'étrange_a', 'éveillé_n': "éveiller_v", 'êtes_n': 'être_v',    "Acheva_np": "achever_v",    "Admire_n": "admirer_v",    "Admirer_np": "admirer_v",
    "Afrique_np": "Afrique_n",    "Allons_n": "aller_v",    "Arizona_np": "Arizona_n",
    "Australie_np": "Australie_n",    "Ah_n": "__MANUAL_REJ__",    "Ah_adv": "__MANUAL_REJ__",    "Ben_v": "__MANUAL_REJ__",    'Bonsoir_np': "bonjour_n",    'Celui-là_n': "__MANUAL_REJ__", 'Combien_adv': "__MANUAL_REJ__", 'Comment_adv': "__MANUAL_REJ__", 'Crois_n': "croire_v", 'Demandez_np': "demander_v", 'Dessine_np': "dessiner_v", 'Dis_np': "dire_v",
    'Europe_np': "Europe_n", 'Exact_n': 'exact_a', 'Fais_n': 'faire_v', 'Faites_n': 'faire_v', 'France_np': 'France_n', 'Hein_np': "__MANUAL_REJ__", 'Hem_n': "__MANUAL_REJ__",
    'Interrogea_n': "interroger_v", 'Jupiter_np': "Jupiter_n", 'Justement_np': 'justement_b', 'Lui_np': "__MANUAL_REJ__",
    "N'_adv": "__MANUAL_REJ__", 'Nouvelle-Zélande_np': "Nouvelle-Zélande_n",
    'Noël_np': 'Noël_n', 'Ordonnez_np': "ordonner_v", 'Ouf_np': "__MANUAL_REJ__", 'Parce_np': "__MANUAL_REJ__",
    'Pardonnez_np': "pardonner_v", 'Pourquoi_adv': "__MANUAL_REJ__",
    'Puis-je_np': "pouvoir_v", 'Quant_np': "__MANUAL_REJ__", 'Rappela_np': 'rappeler_v',
    'Regardez_np': 'regarder_v', 'Reviens_n': 'revenir_v', 'Riposta_np': "riposter_v", 'Russie_np': "Russie_n",
    'Répéta_np': 'répéter_v', 'Sibérie_np': 'Sibérie_n', 'Soyez_np': 'être_v', 'Sûrement_v': 'sûrement_b',
    'Tes_n': "__MANUAL_REJ__", 'Toi_np': "__MANUAL_REJ__", "Va-t'en_np": "aller_v", 'Viens_n': 'venir_v',
    'Voulais_n': 'vouloir_v', 'Voyons_n': 'voir_v', 'Vécues_np': 'vivre_v', 'Vénus_np': 'Vénus_n',
    'absurde_v': 'absurde_a', 'acclame_n': 'acclamer_v', 'admire_n': 'admirer_v', 'agiterai_n': 'agiter_v', 'ah_n': "__MANUAL_REJ__",
    'ailler_v': 'aller_v', 'aimes_n': 'aimer_v', 'aisément_v': 'aisément_b', 'ajouta_n': 'ajouter_v',
    'ajouté_adj': 'ajouter_v', 'allaient_n': 'aller_v', 'apercevais_n': 'apercevoir_v', 'aperçut_n': 'apercevoir_v',
    'apprivoise_n': 'apprivoiser_v', 'arrivé_n': 'arrivée_n', 'assoiras_n': 'asseoir_v',
    'attacher_n': 'attacher_v', 'attendre_v': 'attendre_avec_impatience_v',
    'au_n': "__MANUAL_REJ__", 'au_v': "__MANUAL_REJ__", 'auprès_adv': "__MANUAL_REJ__",
    'aurais_n': 'avoir_v', 'autrefois_n': 'autrefois_b', 'avais_n': 'avoir_v', 'aval_adj': "aval_n", 'avoir_adv': 'avoir_v', 'belles_v': 'beau_a',
    'bizarre_n': 'bizarre_a', 'bois-tu_n': 'boire_v', 'bonsoir_n': 'bon_après-midi_n',
    'brillent_n': 'briller_v', 'bruire_v': 'bruit_n', 'brusquement_v': 'brusquement_b',
    'bâilla_n': 'bâiller_v', 'bâillant_n': 'bâiller_v', 'bâille_n': 'bâiller_v',
    'bénie_n': 'bénir_v', "c'est-à-dire_n": "c'est-à-dire_b", 'calcul_v': 'calcul_n',
    'champs_n': 'champ_n', 'chapitrer_adv': 'chapitre_b', 'chauffer_n': 'chauffer_v',
    "chef-d'oeuvre_n": "chef-d'œuvre_n", 'cherche_n': "chercher_v", 'chercher_n': "chercher_v", 'cherchons_n': "chercher_v", 'choux_n': 'chou_n', 'cinquante-quatre_n': "__MANUAL_REJ__",
    'clore_adj': "clos_a", 'combien_adv': "__MANUAL_REJ__", 'comment_adv': "__MANUAL_REJ__", 'commença_n': 'commencer_v',
    'comprendras_n': 'comprendre_v', 'comprendre_adj': 'comprendre_v', 'condamneras_n': 'condamner_v',
    'confidence_n': 'confiance_n', 'confus_n': 'confus_a', 'connaissent_n': 'connaître_v', 'connaître_n': 'connaître_v',
    'consolé_adj': 'consoler_v', 'consolé_n': 'consoler_v', 'conséquent_n': 'conséquemment_b', 'contente_n': 'content_a',
    'conter_adv': 'content_a', 'contradictoires_n': 'contradictoire_a',
    'coqueter_v': 'coquet_a', 'crayon_v': 'crayon_n', 'croître_n': 'croître_b', "d'Amérique_np": "États-Unis_d'Amérique_n",
    'danger_v': 'danger_n', 'debout_n': 'debout_a', 'demanda_n': 'demander_v',
    'demandai_n': 'demander_v', 'dessin_v': 'dessin_n', 'dessine_n': 'dessiner_v', 'deviens_n': "devenir_v",
    'devinai_n': "deviner_v", 'différent_adv': 'différent_a', 'disais_n': 'dire_n',
    'distingue_n': 'distinguer_v', 'dit_adj': 'dire_v', 'dort_n': 'dormir_v',
    'doré_n': 'doré_a', 'drôle_v': 'drôle_a', 'décidément_v': 'décidément_b',
    'décrire_n': 'décrire_v', 'dérangea_n': "déranger_v", 'désirais_n': "désirer_v",
    'désobéir_n': "désobéir_v", 'eft_v': 'être_v', 'enjamber_v': 'enjambée_n',
    'ensoleillée_n': 'ensoleillé_a', 'endort_n': "s'endormir_v", 'endormir_v': "s'endormir_v", 'entendis_n': 'entendre_v', 'entends_n': 'entendre_v',
    'enfoui_n':    "s'enfuir_v",    'enfuir_v': "s'enfuir_v", "entr'ouvertes_v": 'entrouvert_a', 'exigerai_n': 'exiger_v', 'explorateur_adj': 'explorateur_n',
    'extraordinaire_n': 'extraordinaire_a', 'e§t_v': 'être_v', 'faibles_v': 'faible_a', 'fais_n': 'faire_v',
    'faites_n': 'faire_v', 'feux_n': 'feu_n', 'fidèle_v': 'fidèle_a',
    'fier_n': 'fier_a', 'fis_n': "répondre_v", 'front_v': 'front_n', 'gagnes_n': 'gagner_v',
    'gentil_v': 'gentil_a', 'glacer_v': 'glacé_a', 'grand-chose_n': 'chose_n',
    'gère_n': 'gérer_v', 'géographe_v': 'géographe_n', 'gêne_n': 'gêner_v',
    'habiterai_n': 'habiter_v', 'halte_v': 'halte_n', 'hasard_v': 'hasard_n', 'haute_n': 'haut_a',
    'hem_n': "__MANUAL_REJ__", 'honnête_n': 'honnête_a', 'horreur_v': 'horreur_n',
    'humilier_n': 'humilier_v', 'hâter_v': 'se_hâter_v', 'hésitant_n': 'hésiter_v',
    'i._v': "__MANUAL_REJ__", 'idée_v': 'idée_n', 'imposa_n': 'imposer_v', 'impressionner_adv': 'impressionnant_a', 'inde_n': 'Inde_n',
    'intelligent_n': 'intelligent_a', 'interdis_n': 'interdire_v', 'interroge_n': 'interroger_v',
    'interrogea_n': 'interroger_v', 'intimide_n': 'intimider_v', 'intéressants_n': 'intéressant_a', 'isolé_n': 'isolé_a',
    'joli_adj': 'beau_a', 'joli_n': 'beau_a', 'joli_v': 'beau_a', 'jolie_v': 'beau_a', 'jolis_v': 'beau_a',
    'laid_v': 'laid_a', 'lança_n': 'lancer_v', 'lever_adv': 'lever_v',
    'long_n': 'long_a', 'là-bas_n': 'là-bas_b', 'là-dessus_n': 'là-dessus_b',
    'là-haut_v': 'là_b', "m'as_n": "__MANUAL_REJ__", 'magnifique_n': 'magnifique_a',
    'majestueusement_v': 'majestueusement_b', 'majestueux_adj': 'fier_a',
    'mange_n': 'manger_v', 'marcheras_n': 'marcher_v', 'mer_v': 'mer_n', 'meurt_n': 'mourir_v',
    'mienne_n': "__MANUAL_REJ__", 'miennes_n': "__MANUAL_REJ__", 'mille_v': 'mille_a',
    'mince_n': "mince_a", 'miser_v': 'mettre_v', 'modestement_v': 'modestement_b',
    'morde_n': 'mordre_v', 'moutons_n': 'mouton_n', 'mâcher_n': 'mâcher_v',
    'médité_n': 'méditer_v', 'naïf_n': 'naïf_a', 'naïve_v': 'naïf_a', 'naïves_v': 'naïf_a',
    'numéro_v': "numéro_n", 'occupe_n': "s'occuper_de_v", 'oeil_n': 'œil_n',
    'orgueilleuse_v': 'orgueilleuse_a', 'oublier_n': 'oublier_v', 'pacifique_n': 'Pacifique_n', 'pardon_v': 'pardon_n', 'parfait_v': 'parfait_a', 'parlais_n': 'parler_v', 'parles_n': 'parler_v', 'perdu_n': 'perdre_v',
    'perfore_n': 'perforer_v', 'personne_v': 'personne_n', 'petite_v': 'petit_a',
}


In [134]:
"terrible_a" in vocabulary_wolf

True

In [1]:
manual_residual = {
 'aiguillage_n', 'aiguiser_v', 'allumeur_n', 'ascension_n', 
    'aplomb_n', 'attendrir_v', 'autrui_n', 'avouer_v', 'baliverne_n', 
    'bonhomme_n', 'bredouiller_v', 'berçai_n', 'brusquerie_n', 
    'cache-nez_v', 'cambouis_n', 'compréhensif_adj', 'breveter_v', 
    'centaine_n', 'contemplation_n', 'crayonner_v', 'distraction_n', 
    'digérer_v', 'démoder_v', 'démontage_n', 'dévisser_v', 'déconcerté_n',
    'décoiffer_v', 'embellir_v', 'enrhumer_v', 'enquérir_v', 
    'enfourner_v',  'escamoter_v', 'faufiler_v', 'fripée_n',
    "fripée_n", 'falloir_v', 'frôler_v', 'gracieras_n', 'entrevoir_v', 
    'granit_n', 'grelot_n', 'ignorer_v', 'hocher_v', 'indiscipline_n', 
    'illuminé_adj', 'instruire_v',  'insuccès_n', 'indulgence_n', 
    'irrité_n', 'intrigué_adj', 'lampion_n', 'irréparable_n', 
    'lucide_adj',  'meeting_n', 'légère_n', 'margelle_n', 'moi-même_n', 
    'moraliste_n',  'muselière_n', 'méditatif_adj', 'objecter_v', 
    'ombrageux_adj','paravent_n', 'pendant_adj', 'pensum_n', 
    'paravent_nc',  'piquet_n', 'perfectionné_adj', 'évadé_adj', 
    'plaire_v', 'poliment_adv', 'provision_n', 'raisonner_v', 
    'raller_v',  'ramoner_v', 'recompte_n', 'redescendre_v', 
    'repartir_v', 'redevenir_v',  'ressemblants_n', 'rougissant_n', 
    'répliquer_v', 'simplifié_adj', 'sienne_n', 'sinon_adv', 'soi-même_n',
    'stylographe_n', 'tracer_v', 'tâtonner_v', 'valoir_v', 'vexer_adv', 
    'vexé_adj', 'vieillir_v', 'écria_n', 'écrier_v', 'éloignée_n', 
    'émerveiller_v', 'éponger_v', 'étonné_adj'
}


In [2]:
len(manual_residual)

98

In [34]:
freq_residual = 0
for word in manual_residual:
    freq_residual += vocabulary[word]

In [35]:
freq_residual

181

In [39]:
built_vocabulary_lpp

{'éclater_v',
 'monarque_nc',
 'sauter_v',
 'étonner_v',
 'mauvais_n',
 'forêt_n',
 'entendre_v',
 'certain_adj',
 'trône_n',
 'acte_n',
 'ébaucher_v',
 'orgueil_n',
 'fleurir_v',
 'pommier_n',
 'étrange_adj',
 'tirer_v',
 'caisse_n',
 'faible_adj',
 'fou_n',
 'baobab_n',
 'miel_n',
 'régulièrement_adv',
 'chance_n',
 'pâle_adj',
 'brûler_v',
 'cou_n',
 'patience_n',
 'ne_clneg',
 'régler_v',
 'croire_v',
 'enrouler_v',
 'sujet_n',
 'fidèle_n',
 'jeu_n',
 'sorte_n',
 'nord_n',
 'chasseur_n',
 'verser_v',
 'modeste_adj',
 'irriter_v',
 'tourner_v',
 'courage_nc',
 'dangereux_adj',
 'résoudre_v',
 'débarrasser_v',
 'efforcer_v',
 'signifier_v',
 'lumière_n',
 'livre_n',
 'mystère_n',
 'frotter_v',
 'décourager_v',
 'ballet_n',
 'consoler_v',
 'milieu_n',
 'nez_n',
 'oreille_n',
 'servir_v',
 'année_n',
 'rhume_n',
 'évasion_n',
 'outil_n',
 'appeler_v',
 'rapide_adj',
 'admirateur_n',
 'mètre_n',
 'rayonner_v',
 'urgence_n',
 'rêver_v',
 'aimer_v',
 'confondre_v',
 'vacance_n',
 'souffri

In [54]:
lpp_wolf_mapping = dict()

for word in vocabulary_lpp:
    if word in vocabulary_wolf:
        lpp_wolf_mapping[word] = word
    elif word in match_1:
        lpp_wolf_mapping[word] = match_1[word]
    elif word in manual_match:
        lpp_wolf_mapping[word] = manual_match[word]
    elif word in manual_residual:
        lpp_wolf_mapping[word] = '__UNMATECHED__'
    else:
        lpp_wolf_mapping[word] = '__MANUAL_REJ__'

## Mapped frequency

In [55]:
wolf_freq = dict()
for origin in lpp_wolf_mapping:
    target = lpp_wolf_mapping[origin]
    if target not in wolf_freq:
        wolf_freq[target] = vocabulary[origin]
    else:
        wolf_freq[target] += vocabulary[origin]

In [56]:
[(k, wolf_freq[k]) for k in sorted(wolf_freq, key=wolf_freq.get, reverse=True)]

[('__GIVEN_FUNCTIONAL__', 6107),
 ('__MATCHED_FUNCTIONAL__', 5176),
 ('__MANUAL_REJ__', 953),
 ('petit_a', 217),
 ('dire_v', 194),
 ('__UNMATECHED__', 181),
 ('pas_b', 179),
 ('prince_n', 171),
 ('avoir_v', 119),
 ('faire_v', 107),
 ('bien_b', 98),
 ('tu_n', 82),
 ('planète_n', 66),
 ('fleur_n', 66),
 ('plus_b', 62),
 ('répondre_v', 52),
 ('très_b', 49),
 ('étoile_n', 48),
 ('alors_b', 47),
 ('savoir_v', 46),
 ('tout_a', 44),
 ('voir_v', 43),
 ('pouvoir_v', 43),
 ('jamais_b', 42),
 ('être_v', 41),
 ('peu_b', 41),
 ('personne_n', 40),
 ('encore_b', 39),
 ('mouton_n', 39),
 ('là_b', 36),
 ('roi_n', 36),
 ('renard_n', 36),
 ('grand_a', 35),
 ('jour_n', 35),
 ('vouloir_v', 34),
 ('aussi_b', 33),
 ('seul_a', 30),
 ('tout_b', 30),
 ('homme_n', 28),
 ('trop_b', 28),
 ('regarder_v', 27),
 ('fois_n', 27),
 ('venir_v', 27),
 ('autre_a', 27),
 ('comprendre_v', 27),
 ('cent_n', 26),
 ('chose_n', 26),
 ('donc_b', 26),
 ('ami_n', 24),
 ('aller_v', 24),
 ('toujours_b', 23),
 ('dessin_n', 23),
 ('chap

In [57]:
special_set = set()
for word in wolf_freq:
    if word[:2] == '__':
        special_set.add(word)

In [58]:
special_set

{'__GIVEN_FUNCTIONAL__',
 '__MANUAL_REJ__',
 '__MATCHED_FUNCTIONAL__',
 '__UNMATECHED__'}

In [59]:
freq_valid = 0
freq_rejected = 0
freq_unaccounted = wolf_freq['__UNMATECHED__']
for word in wolf_freq:
    if word[:2] != '__':
        freq_valid += wolf_freq[word]
    else:
        freq_rejected += wolf_freq[word]

In [60]:
freq_valid, freq_rejected, freq_unaccounted

(6860, 12417, 181)

In [61]:
overwrite_check = True

if overwrite_check:
    output_file = open('lpp_tuned_wolf_vocabulary.txt', mode='w+')
    output_file.write('\n'.join(list(set(wolf_freq).difference(special_set))))
    output_file.close()