# Lemmatizing files using spaCy

In this notebook, we will lemmatize our corpus. This needs to be done for each language separately. Lemmatizing is not obligatory for Topic Modeling, but if your lemmatization model works well with your corpus, we recommend it, since this can improve the quality of the topics.<br>  
<i>spaCy</i> is a python library for natural language processing. See more: https://spacy.io/. 

In [63]:
import warnings
warnings.filterwarnings('ignore')
from cophi_toolbox import preprocessing
import metadata_toolbox.utils as metadata
import pandas as pd
from pathlib import Path
import spacy

In [44]:
data = 'Y:/data/projekte/dispecs/topicModeling'
language = 'de' # language 2 letter abbreviation
path_to_corpus = Path(data, 'dispecs_'+language+'_lemmatized') # Careful! The files will be overwritten, so make a backup :)

In [46]:
pattern = '{year}_{journal}_{author}_{volume}_{issue}_{id}'#1716_Le-Spectateur-ou-le-Socrate-moderne_Anonym_Table-des-Matieres_119-1257
meta = pd.concat([metadata.fname2metadata(str(path), pattern=pattern) for path in path_to_corpus.glob('*.txt')])

In [47]:
meta[:5]

Unnamed: 0,year,journal,author,volume,issue,id
Y:\data\projekte\dispecs\topicModeling\dispecs_de_lemmatized\1723_Der-Leipziger-Spectateur_Anonymus_Vol-1_Nr-000_4287.txt,1723,Der-Leipziger-Spectateur,Anonymus,Vol-1,Nr-000,4287
Y:\data\projekte\dispecs\topicModeling\dispecs_de_lemmatized\1723_Der-Leipziger-Spectateur_Anonymus_Vol-1_Nr-001_4281.txt,1723,Der-Leipziger-Spectateur,Anonymus,Vol-1,Nr-001,4281
Y:\data\projekte\dispecs\topicModeling\dispecs_de_lemmatized\1723_Der-Leipziger-Spectateur_Anonymus_Vol-1_Nr-002_4282.txt,1723,Der-Leipziger-Spectateur,Anonymus,Vol-1,Nr-002,4282
Y:\data\projekte\dispecs\topicModeling\dispecs_de_lemmatized\1723_Der-Leipziger-Spectateur_Anonymus_Vol-1_Nr-003_4283.txt,1723,Der-Leipziger-Spectateur,Anonymus,Vol-1,Nr-003,4283
Y:\data\projekte\dispecs\topicModeling\dispecs_de_lemmatized\1723_Der-Leipziger-Spectateur_Anonymus_Vol-1_Nr-004_4284.txt,1723,Der-Leipziger-Spectateur,Anonymus,Vol-1,Nr-004,4284


In [33]:
len(meta)

690

In [48]:
"""
-----> Language packages (have to be installed first, see here: https://spacy.io/usage/models):
French: fr_core_news_lg
Spanish: es_core_news_lg
Italian: it_core_news_lg
English: en_core_web_lg
Portuguese: pt_core_news_lg
German: de_core_news_lg
"""
if language == 'fr':
    nlp = spacy.load('fr_core_news_lg')
    print('French package loaded')
if language == 'it':
    nlp = spacy.load('it_core_news_lg')
    print('Italian package loaded')
if language == 'es':
    nlp = spacy.load('es_core_news_lg')
    print('Spanish package loaded')
if language == 'de':
    nlp = spacy.load('de_core_news_lg')
    print('German package loaded')
if language == 'en':
    nlp = spacy.load('en_core_web_lg')
    print('English package loaded')
if language == 'pt':
    nlp = spacy.load('pt_core_news_lg')
    print('Portuguese package loaded')

German package loaded


In [41]:
""" 
Run this part only if there are corrections to be made! 
Make sure to set the right dictionary name in the variable "correction_dictionary"!
Correct the lemmatization errors by defining your dictionaries and replacing the wrong lemma in the lookup table.
The usage of upper and lowercase letters in values is relevant, so be sure to correct both versions, if needed.
"""

corr_fr = {
    "avoir" : ["avois", "avoit", "Avois", "Avoit"], 
    "dire" : ["disois", "disoit", "Disois", "Disoit"],
    "manière" : ["maniere", "Maniere"],
    "pièce" : ["piéce", "Piéce"],
    "poète" : ["poëte", "Poëte"],
    "poème" : ["poëme", "Poëme"],
    "poésie" : ["poësie", "Poësie"],
    "sexe" : ["séxe", "Séxe"],
    "moyen" : ["moïen", "Moïen"],
    "thèatre":["théâtre", "Théâtre"]
        
}

corr_es = {
    "decir":["dixo", "decia", "Dixo", "Decia"],
    "ir":["iba", "Iba"],
    "pacerer":["parecia", "Parecia"],
    "poder":["podia", "Podia"],
    "ser":["fuesse", "Fuesse"],
    "haber":["habia", "havia", "Habia", "Havia"],
    "ahora" : ["aora", "Aora"],
    "estar" : ["estàn", "Estàn"],
    "lujo" : ["luxo","luxar", "Luxo","Luxar"],
    "razón" : ["razon", "razòn", "Razon", "Razòn"],
    "caballero" : ["cavallero", "Cavallero"],
    "mujer" : ["muger", "mugeres", "Muger", "Mugeres"],
    "vez" : ["vèz", "Vèz"],
    "jamás" : ["jamas", "Jamas"],
    "demás" : ["demas", "demàs", "Demas", "Demàs"],
    "cuidar" : ["cuydado", "Cuydado"],
    "posible" : ["possible", "Possible"],
    "comedia":["comediar", "Comedias"],
    "poeta":["poetas", "Poetas"],
    "mano":["manir", "Manir"],
    "barba":["barbar", "Barbar"],
    "idea":["ideo", "Ideo"]
}



# Choose the dictionary by setting the right value in the variable "correction_dictionary"
correction_dictionary = corr_es
for key, value in correction_dictionary.items():
    for token in value:
        correct = key
        wrong = token
        nlp.vocab.lookups.get_table("lemma_lookup")[token] = key
        print("Lemma", token, "corrected with", key)


Lemma dixo corrected with decir
Lemma decia corrected with decir
Lemma Dixo corrected with decir
Lemma Decia corrected with decir
Lemma iba corrected with ir
Lemma Iba corrected with ir
Lemma parecia corrected with pacerer
Lemma Parecia corrected with pacerer
Lemma podia corrected with poder
Lemma Podia corrected with poder
Lemma fuesse corrected with ser
Lemma Fuesse corrected with ser
Lemma habia corrected with haber
Lemma havia corrected with haber
Lemma Habia corrected with haber
Lemma Havia corrected with haber
Lemma aora corrected with ahora
Lemma Aora corrected with ahora
Lemma estàn corrected with estar
Lemma Estàn corrected with estar
Lemma luxo corrected with lujo
Lemma luxar corrected with lujo
Lemma Luxo corrected with lujo
Lemma Luxar corrected with lujo
Lemma razon corrected with razón
Lemma razòn corrected with razón
Lemma Razon corrected with razón
Lemma Razòn corrected with razón
Lemma cavallero corrected with caballero
Lemma Cavallero corrected with caballero
Lemma mu

In [69]:
# Lemmatization of all text files in the corpus. The files will be overwritten. 

for file in path_to_corpus.glob('*.txt'): 
    with open(file, encoding='utf-8') as f:
        original = f.read()
        lemmatized_object = nlp(original)        
        lemma_list = []
        for lemma in lemmatized_object:
            lemma_list.append(lemma.lemma_)
        lemma_doc = ' '.join(lemma_list)
    with open(file, 'w', encoding='utf-8') as f:
        f.write(lemma_doc)
