# Lemmatisation et racinisation de texte en français
( _Lemmatization and stemming_ )

## Préparation

In [43]:
# Candidate sample inputs. Each assignment overwrites the previous one, so only
# the last sentence is bound to `text` once this cell has run; reorder the
# lines to try another sample.
text = "avions voudrais non animaux yeux dors couvre"
text = "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard"
text = "J'utilise mes connaissances. Et nous les appliquons."

## nltk snowball stemmer

In [57]:
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
# First variant: tokenise by splitting `text` on single space characters.
words_to_stem = text.split(' ')
stems = [stemmer.stem(w) for w in words_to_stem]
print (stems)
# Second variant: stem the tokens produced by the spaCy pipeline instead.
# `nlp` is not defined in this cell; a spaCy pipeline is loaded under that
# name further down in this file, so that cell must have been run first.
print ([stemmer.stem(t.text) for t in nlp(text)])

["j'utilis", 'me', 'connaiss', '.', 'et', 'nous', 'le', 'appliquon', '.']
["j'", 'utilis', 'me', 'connaiss', '.', 'et', 'nous', 'le', 'appliquon', '.']


## spacy

In [None]:
# Installation:
#   pip3 install --user spacy
#   python3 -m spacy download fr_core_news_md
import spacy
# Load the 'fr_core_news_md' pipeline; other cells call `nlp(text)` to
# tokenise and lemmatise the sample sentence.
nlp = spacy.load('fr_core_news_md')

In [58]:
# Print the spaCy lemma of every token of `text`.
print ([token.lemma_ for token in nlp(text)])
# Disabled debug loop. It reads `doc` and the `._.melt_tagger` /
# `._.lefff_lemma` attributes, which only appear in the spacy-lefff cell of
# this file — presumably kept here for when that cell works; confirm.
#for d in doc:
#    print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)

['je', 'utilise', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquer', '.']


## FrenchLefffLemmatizer
* Source https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer
* Licence Apache


In [59]:
# pip3 install --user git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

french_lefff_lemmatizer = FrenchLefffLemmatizer()
# Probe the same ambiguous form with no POS argument, then with 'n', 'v' and
# an unknown tag ('unk'). The output recorded below the cell shows 'avion',
# 'avion', 'avoir' and an empty list respectively.
print(french_lefff_lemmatizer.lemmatize('avions'))
print(french_lefff_lemmatizer.lemmatize('avions','n'))
print(french_lefff_lemmatizer.lemmatize('avions','v'))
print(french_lefff_lemmatizer.lemmatize('avions','unk'))

def french_lefff_lemmatizer_context_free (french_lefff_lemmatizer, tokenized_text):
    """Lemmatise each token on its own, without passing any POS argument.

    Args:
        french_lefff_lemmatizer: object exposing a ``lemmatize(token)`` method.
        tokenized_text: iterable of token strings.

    Returns:
        A list holding, in input order, the value returned by ``lemmatize``
        for each token.
    """
    lemmas = []
    for token in tokenized_text:
        lemmas.append(french_lefff_lemmatizer.lemmatize(token))
    return lemmas

# Lemmatise the spaCy tokens of `text` one by one, with no POS information.
print(french_lefff_lemmatizer_context_free(french_lefff_lemmatizer, [t.text for t in nlp(text)]))

avion
avion
avoir
[]
["J'", 'utilise', 'mes', 'connaissance', '.', 'Et', 'nous', 'les', 'appliquons', '.']


### Using spacy tags to disambiguate

In [54]:
# Mapping from spaCy coarse-grained POS tags (`token.pos_`) to the Lefff
# category passed as the `pos` argument of FrenchLefffLemmatizer.lemmatize().
# spaCy tags missing from this table have no Lefff equivalent recorded here.
spacy_to_lefff_pos = {
    "ADJ": "adj",
    "ADP": "prep",   # adpositions are prepositions, not determiners
    "ADV": "adv",
    "CCONJ": "coo",  # coordinating conjunctions
    "DET": "det",
    "PRON": "cln",
    "PROPN": "np",
    "NOUN": "nc",
    "VERB": "v",
    "PUNCT": "poncts",
}


def french_lefff_lemmatizer_wi_spacy_pos (french_lefff_lemmatizer, spacy_doc):
    """Lemmatise a spaCy document, preferring Lefff for verbs.

    Rationale recorded by the original author: Lefff returns poor lemmas for
    DET and PRON tokens, while spaCy returns poor lemmas for VERB tokens. The
    spaCy lemma is therefore the default, and Lefff is only consulted for
    tokens tagged VERB.

    Args:
        french_lefff_lemmatizer: object exposing ``lemmatize(word, pos)``.
        spacy_doc: iterable of tokens exposing ``text``, ``pos_`` and
            ``lemma_``.

    Returns:
        A list with one lemma per token, in document order.
    """
    lemmas = []
    for t in spacy_doc:
        lemma = t.lemma_
        if t.pos_ == 'VERB':
            candidates = french_lefff_lemmatizer.lemmatize(t.text, spacy_to_lefff_pos[t.pos_])
            # A usable answer is a non-empty, non-string sequence whose first
            # item starts with the lemma. Anything else (plain string, empty
            # result, None) falls back to the spaCy lemma.
            if candidates and not isinstance(candidates, str):
                lemma = candidates[0][0]
        lemmas.append(lemma)

    return lemmas
# Run the combined spaCy + Lefff lemmatisation on the sample sentence.
print (french_lefff_lemmatizer_wi_spacy_pos(french_lefff_lemmatizer, nlp(text)))

['je', 'utilise', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquer', '.']


## spacy-lefff

**Custom French POS and lemmatizer based on Lefff for spacy**

* Source https://pypi.org/project/spacy-lefff/
* License MIT


In [9]:
# pip3 install --user spacy-lefff
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language

# Register a pipeline-component factory named 'french_lemmatizer'.
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: type object 'Language' has no attribute 'factory'";
# presumably the installed spaCy predates the `Language.factory` API —
# confirm the installed spaCy version before debugging anything else.
@Language.factory('french_lemmatizer')
def create_french_lemmatizer(nlp, name):
    """Return a LefffLemmatizer built with after_melt=True and default=True."""
    return LefffLemmatizer(after_melt=True, default=True)

# Register a pipeline-component factory named 'melt_tagger'.
@Language.factory('melt_tagger')  
def create_melt_tagger(nlp, name):
    """Return a new POSTagger instance."""
    return POSTagger()
 
# Rebinds the module-level `nlp` to a freshly loaded pipeline.
nlp = spacy.load('fr_core_news_md')
#nlp = spacy.load('fr_core_news_sm')

# Order matters: the tagger is inserted after the parser and the lemmatizer
# right after the tagger.
nlp.add_pipe('melt_tagger', after='parser')
nlp.add_pipe('french_lemmatizer', after='melt_tagger')
doc = nlp(u"Apple cherche a acheter une startup anglaise pour 1 milliard de dollard")
# One line per token: spaCy POS, the two spacy-lefff extension attributes,
# then spaCy's fine-grained tag and lemma for comparison.
for d in doc:
    print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)

AttributeError: type object 'Language' has no attribute 'factory'

## treetagger-python

**A Python module for interfacing with the Treetagger by Helmut Schmid**

* Wrapper source https://github.com/miotto/treetagger-python (alternative exists)
* License GPL-v3
* TreeTagger source https://www.cis.lmu.de/~schmid/tools/TreeTagger/

## CLTK, The Classical Language Toolkit 

**The Classical Language Toolkit (CLTK) is a Python library offering natural language processing (NLP) for pre-modern languages.**

* Home https://github.com/cltk/cltk 
* Licence MIT
* Doc https://docs.cltk.org/en/latest/quickstart.html

In [1]:
# pip3 install --user cltk
# NOTE(review): the recorded run failed on this import with
# "ImportError: cannot import name 'NLP'"; presumably the installed cltk
# release does not expose the top-level `NLP` class — confirm the version.
from cltk import NLP

# Middle French: the ISO 639-3 code is "frm" ("mfr" designates another language).
cltk_nlp = NLP(language="frm")
cltk_doc = cltk_nlp.analyze(text=text)
print(cltk_doc.lemmata)

ImportError: cannot import name 'NLP'

In [2]:
from cltk.lemmatize.french.lemma import LemmaReplace

ImportError: cannot import name 'LemmaReplace'

## Conclusions

* spacy-lefff : je n'ai pas pu le tester, probablement à cause de la configuration de l'environnement Jupyter (l'erreur `Language.factory` ci-dessus suggère une version de spaCy trop ancienne, à vérifier)
* treetagger-python qui requiert de pré-installer treetagger
* cltk : malgré une installation pip sans erreur, problème à l'exécution ; dans tous les cas, cette bibliothèque ne vise pas le français contemporain
* spacy pour la robustesse, spacy+lefff pour la précision  