# Lemmatisation et racinisation de texte en français
( _Lemmatization and stemming_ )

## Préparation

In [43]:
text = "avions voudrais non animaux yeux dors couvre"
text = "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard"
text = "J'utilise mes connaissances. Et nous les appliquons."

## nltk snowball stemmer

In [57]:
from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()
words_to_stem = text.split(' ')
stems = [stemmer.stem(w) for w in words_to_stem]
print (stems)
print ([stemmer.stem(t.text) for t in nlp(text)])

["j'utilis", 'me', 'connaiss', '.', 'et', 'nous', 'le', 'appliquon', '.']
["j'", 'utilis', 'me', 'connaiss', '.', 'et', 'nous', 'le', 'appliquon', '.']


## spacy

In [None]:
# pip3 install --user spacy
#"python3 -m spacy download fr_core_news_md
import spacy
nlp = spacy.load('fr_core_news_md')

In [58]:
print ([token.lemma_ for token in nlp(text)])
#for d in doc:
#    print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)

['je', 'utilise', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquer', '.']


## FrenchLefffLemmatizer
* Source https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer
* Licence Apache


In [59]:
# pip3 install --user git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

french_lefff_lemmatizer = FrenchLefffLemmatizer()
print(french_lefff_lemmatizer.lemmatize('avions'))
print(french_lefff_lemmatizer.lemmatize('avions','n'))
print(french_lefff_lemmatizer.lemmatize('avions','v'))
print(french_lefff_lemmatizer.lemmatize('avions','unk'))

def french_lefff_lemmatizer_context_free (french_lefff_lemmatizer, tokenized_text):
    return [french_lefff_lemmatizer.lemmatize(t) for t in tokenized_text]

print(french_lefff_lemmatizer_context_free(french_lefff_lemmatizer, [t.text for t in nlp(text)]))

avion
avion
avoir
[]
["J'", 'utilise', 'mes', 'connaissance', '.', 'Et', 'nous', 'les', 'appliquons', '.']


### Using spacy tags to desambiguate

In [54]:
#numbers_powers = list(map(pow, base_numbers, powers))
#mapped_numbers = list(map(lambda x: x , numbers))
spacy_to_lefff_pos = {
    "ADJ": "adj",
    "ADP": "det",
    "ADV": "adv",
    "DET": "det",
    "PRON": "cln",
    "PROPN": "np",
    "NOUN": "nc",
    "VERB": "v",
    "PUNCT": "poncts"
} # CCONJ ?


def french_lefff_lemmatizer_wi_spacy_pos (french_lefff_lemmatizer, spacy_doc):
    # lefff retourne de mauvais lemmes pour les DET et les PRON
    # spacy retourne de mauvais lemmes pour les VERB
    # retourne le lemme de spacy par défaut excepté pour les verbes
    lemmas = []
    for t in spacy_doc:
        if t.pos_ in ['VERB']:
            lefff_lemma = french_lefff_lemmatizer.lemmatize(t.text, spacy_to_lefff_pos[t.pos_])
            if type(lefff_lemma) != type ("") and len(lefff_lemma) !=0:
                lefff_lemma = lefff_lemma[0][0]
            else: lefff_lemma = t.lemma_
            lemmas.append(lefff_lemma)
        else:
            lemmas.append(t.lemma_)

    return lemmas
print (french_lefff_lemmatizer_wi_spacy_pos(french_lefff_lemmatizer, nlp(text)))

['je', 'utilise', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquer', '.']


## spacy-lefff

**Custom French POS and lemmatizer based on Lefff for spacy**

* Source https://pypi.org/project/spacy-lefff/
* License MIT


In [9]:
# pip3 install --user spacy-lefff
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language

@Language.factory('french_lemmatizer')
def create_french_lemmatizer(nlp, name):
    return LefffLemmatizer(after_melt=True, default=True)

@Language.factory('melt_tagger')  
def create_melt_tagger(nlp, name):
    return POSTagger()
 
nlp = spacy.load('fr_core_news_md')
#nlp = spacy.load('fr_core_news_sm')

nlp.add_pipe('melt_tagger', after='parser')
nlp.add_pipe('french_lemmatizer', after='melt_tagger')
doc = nlp(u"Apple cherche a acheter une startup anglaise pour 1 milliard de dollard")
for d in doc:
    print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)

AttributeError: type object 'Language' has no attribute 'factory'

## stanza aka Neural Stanford corenlp

Aside from the neural pipeline, this package also includes an official wrapper for accessing the Java Stanford CoreNLP software with Python code https://github.com/stanfordnlp/CoreNLP.

* 800 Mo of parameters + 600 for the French model (stored in ~/stanza_resources/fr/default.zip)

* https://github.com/stanfordnlp/stanza

In [62]:
# pip3 install --user stanza 
import stanza
stanza.download('fr')       # This downloads the English models for the neural pipeline
nlp = stanza.Pipeline('fr') # This sets up a default neural pipeline in English
doc = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
doc.sentences[0].print_dependencies()

for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.lemma, word.pos)




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s][A
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 15.7MB/s]                    [A2021-04-09 01:03:28 INFO: Downloading default packages for language: fr (French)...
2021-04-09 01:03:29 INFO: File exists: /home/hernandez-n/stanza_resources/fr/default.zip.
2021-04-09 01:03:35 INFO: Finished downloading models and saved to /home/hernandez-n/stanza_resources.
2021-04-09 01:03:35 INFO: Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |
| ner       | wikiner |

2021-04-09 01:03:35 INFO: Use device: cpu
2021-04-09 01:03:35 INFO: Loading: tokenize
2021-04-09 01:03:35 INFO: Loading: mwt
2021-04-09 01:03:35 INFO: Loa

('Barack', 0, 'root')
('Obama', 1, 'flat:foreign')
('was', 2, 'flat:foreign')
('born', 3, 'flat:foreign')
('in', 4, 'flat:foreign')
('Hawaii', 5, 'flat:foreign')
('.', 1, 'punct')
Barack Barack X
Obama Obama X
was was X
born born X
in in X
Hawaii Hawaii X
. . PUNCT
He He X
was was X
elected elected X
president president X
in in X
2008 2008 NUM
. . PUNCT


## treetagger-python

**A Python module for interfacing with the Treetagger by Helmut Schmid**

* Wrapper source https://github.com/miotto/treetagger-python (alternative exists)
* License GPL-v3
* TreeTagger source https://www.cis.lmu.de/~schmid/tools/TreeTagger/
* A neural version exists too. It lemmatizes all tokens. Lemmas of unknown tokens are guessed and are therefore not guaranteed to be always correct. Slower, requires PyTorch, requires a GPU for improved speed, larger parameter files https://www.cis.lmu.de/~schmid/tools/RNNTagger/  (3.1 GB)

## CLTK, The Classical Language Toolkit 

**The Classical Language Toolkit (CLTK) is a Python library offering natural language processing (NLP) for pre-modern languages.**

* Home https://github.com/cltk/cltk 
* Licence MIT
* Doc https://docs.cltk.org/en/latest/quickstart.html

In [1]:
# pip3 install --user cltk
from cltk import NLP

#  Middle French language
cltk_nlp = NLP(language="mfr")
cltk_doc = cltk_nlp.analyze(text=text)
print(cltk_doc.lemmata)

ImportError: cannot import name 'NLP'

In [2]:
from cltk.lemmatize.french.lemma import LemmaReplace

ImportError: cannot import name 'LemmaReplace'

## textblob-fr

does not offer Words Inflection and Lemmatization for French

https://pypi.org/project/textblob-fr/

'''
>>> from textblob import Word
>>> w = Word("octopi")
>>> w.lemmatize()
'octopus'
>>> w = Word("went")
>>> w.lemmatize("v")  # Pass in WordNet part of speech (verb)

from textblob import TextBlob
>>> from textblob_fr import PatternTagger, PatternAnalyzer
>>> text = u"Quelle belle matinée"
>>> blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
>>> blob.tags
[(u'Quelle', u'DT'), (u'belle', u'JJ'), (u'matin\xe9e', u'NN')]
>>> blob.sentiment
'''

## pattern
2.6 
https://github.com/clips/pattern

## Conclusions

En conclusion du tour d'horizon des solutions de lemmatisation en python 3 pour le traitement du français contemporain

* stanza, et textblob-fr ne fournissent pas de lemmatization. 
* pattern 2.6 n'est pas encore compatible python 3
* je n'ai pu tester spacy-lefff, probablement pour des questions de configurations d'environnements de jupyter
* je n'ai pas pu tester non plus cltk. Malgré une installation pip sans erreur, problème à l'exécution ; dans tous les cas n'était pas pour du français contemporain 
* treetagger-python mis de côté car requiert de pré-installer treetagger ; 
* n'ai pas testé rnntagger le nouvel outil en python de l'auteur de treetagger car celui-ci fait plus de 3 Go
" N'ai pas testé le stanford corenlp (java) pour lequel stanza offre un wrapper et qui https://github.com/stanfordnlp/CoreNLP

* spacy pour la robustesse, spacy+lefff pour la précision  