# Lemmatisation et racinisation de texte en français
( _Lemmatization and stemming_ )

## Préparation

In [2]:
# Candidate inputs for the experiments below. Only the last entry is the one
# actually analysed; the others are kept as alternative test sentences.
SAMPLE_TEXTS = (
    "avions voudrais non animaux yeux dors couvre",
    "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
    "J'utilise mes connaissances. Et nous les appliquons.",
)
text = SAMPLE_TEXTS[-1]

## nltk snowball stemmer

In [16]:
# sudo pip3 install nltk
import nltk  # required: nltk.download() below is called on the package itself
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize

# word_tokenize needs the 'punkt' tokenizer models; fetch them if missing.
nltk.download('punkt')

# Build the stemmer once and reuse it for every token.
stemmer = FrenchStemmer()
print([stemmer.stem(w) for w in word_tokenize(text, language='french')])

["j'utilis", 'me', 'connaiss', '.', 'et', 'nous', 'le', 'appliquon', '.']


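* tokenization issues: the elided form `j'utilis` is kept as a single token
* stemming only truncates words (`connaiss`, `appliquon`): the result is a stem, not a dictionary form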

## spacy
* https://spacy.io/models (two models for French, one more efficient and the other more accurate)

In [43]:
# One-time setup:
#   sudo pip3 install spacy
#   sudo python3 -m spacy download fr_core_news_sm
#   sudo python3 -m spacy download fr_dep_news_trf
import spacy

# Efficiency-oriented French model.
nlp = spacy.load("fr_core_news_sm")

# Every spaCy token also carries text, pos_ and tag_ next to lemma_;
# only the lemma of each token is printed here.
print([tok.lemma_ for tok in nlp(text)])

['je', 'utilise', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquon', '.']


In [44]:
# Same lemma listing as before, this time with the accuracy-oriented model.
nlp = spacy.load("fr_dep_news_trf")
print([tok.lemma_ for tok in nlp(text)])

['je', 'utilise', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquer', '.']


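* lemmatization issues, mainly for verbs (`utilise` is left unchanged by both models; `fr_core_news_sm` also returns `appliquon`)
* `fr_dep_news_trf` is more accurate than `fr_core_news_sm` (it returns `appliquer`), at the cost of efficiency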

## FrenchLefffLemmatizer
* Source https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer
* Licence Apache


In [45]:
# sudo pip3 install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

french_lefff_lemmatizer = FrenchLefffLemmatizer()

# Probe a few word forms, first without a POS hint and then with one.
# 'pipo' is presumably a deliberately bogus tag (the recorded output for it is []).
for lemmatize_args in (
    ('avions',),
    ('avions', 'n'),
    ('avions', 'v'),
    ('avions', 'pipo'),
    ('utilise',),
    ('utilise', 'v'),
    ('appliquons',),
    ('appliquons', 'v'),
):
    print(french_lefff_lemmatizer.lemmatize(*lemmatize_args))

def french_lefff_lemmatizer_context_free(french_lefff_lemmatizer, tokenized_text):
    """Lemmatize every token independently, without any POS hint.

    Args:
        french_lefff_lemmatizer: object exposing ``lemmatize(token)``.
        tokenized_text: iterable of token strings.

    Returns:
        A list holding the result of ``lemmatize`` for each token, in input order.
    """
    lemmas = []
    for token in tokenized_text:
        lemmas.append(french_lefff_lemmatizer.lemmatize(token))
    return lemmas

# NOTE: this cell reads spacy_to_lefff_pos, which is defined in the
# "Using spacy tags" cell further down; that cell has to be run first.

# Parse the text once and reuse the result (it used to be re-parsed four times).
spacy_doc = nlp(text)

# Side-by-side table: spaCy's analysis next to the Lefff lemma obtained
# without and with the POS tag derived from spaCy.
print('\t'.join(['spacy_text', 'spacy_pos', 'spacy_lemma', 'lefff_pos', 'lefff_lemma', 'lefff+pos_lemma']))
for t in spacy_doc:
    print('\t\t'.join([
        t.text,
        t.pos_,
        t.lemma_,
        # Displayed tag: the Lefff tag when one is mapped, else spaCy's own tag.
        spacy_to_lefff_pos.get(t.pos_, t.pos_),
        french_lefff_lemmatizer.lemmatize(t.text.lower()),
        # Unmapped tags are passed to Lefff as an empty POS string.
        # str() because lemmatize may return a list instead of a string.
        str(french_lefff_lemmatizer.lemmatize(t.text, spacy_to_lefff_pos.get(t.pos_, ''))),
    ]))

# Context-free Lefff lemmatization of, in turn: the raw tokens, the
# lower-cased tokens, and the lemmas already produced by spaCy.
print(french_lefff_lemmatizer_context_free(french_lefff_lemmatizer, [t.text for t in spacy_doc]))
print(french_lefff_lemmatizer_context_free(french_lefff_lemmatizer, [t.text.lower() for t in spacy_doc]))
print(french_lefff_lemmatizer_context_free(french_lefff_lemmatizer, [t.lemma_ for t in spacy_doc]))

avion
avion
avoir
[]
utilise
utiliser
appliquons
appliquer
spacy_text	spacy_pos	spacy_lemma	lefff_pos	lefff_lemma	lefff+pos_lemma
J'		PRON		je		cln		j'		[('il', 'cln')]
utilise		VERB		utilise		v		utilise		utiliser
mes		DET		mon		det		mes		[('son', 'det')]
connaissances		NOUN		connaissance		nc		connaissance		[('connaissance', 'nc')]
.		PUNCT		.		poncts		.		[('.', 'poncts')]
Et		CCONJ		et		CCONJ		et		[]
nous		PRON		nous		cln		nous		[('il', 'cln')]
les		PRON		le		cln		les		[]
appliquons		VERB		appliquer		v		appliquons		appliquer
.		PUNCT		.		poncts		.		[('.', 'poncts')]
["J'", 'utilise', 'mes', 'connaissance', '.', 'Et', 'nous', 'les', 'appliquons', '.']
["j'", 'utilise', 'mes', 'connaissance', '.', 'et', 'nous', 'les', 'appliquons', '.']
['je', 'utilise', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquer', '.']


* FrenchLefffLemmatizer can use POS tag information to disambiguate
* With a POS tag, FrenchLefffLemmatizer can perform better than spacy on verb lemmatization, but it makes some errors on pronouns and determiners
* FrenchLefffLemmatizer returns either a string or a (possibly empty) list of `(lemma, pos)` tuples

### Using spacy tags to disambiguate - home-made solution

In [46]:
# Maps a spaCy `pos_` value to the POS tag handed to
# FrenchLefffLemmatizer.lemmatize(word, pos).
# Tags absent from this table have no mapping; CCONJ is still an open question.
# NOTE(review): ADP is mapped to "det" — confirm this is intended.
spacy_to_lefff_pos = dict(
    ADJ="adj",
    ADP="det",
    ADV="adv",
    DET="det",
    PRON="cln",
    PROPN="np",
    NOUN="nc",
    VERB="v",
    PUNCT="poncts",
)


def french_lefff_lemmatizer_context_sensitive_wi_spacy_pos(french_lefff_lemmatizer, spacy_doc):
    """Lemmatize a spaCy document, using Lefff for verbs and spaCy elsewhere.

    Rationale, from the experiments in this notebook: Lefff returns wrong
    lemmas for DET and PRON, while spaCy returns wrong lemmas for VERB.
    The spaCy lemma is therefore kept for every token except verbs.

    Args:
        french_lefff_lemmatizer: object exposing ``lemmatize(word, pos)``,
            which returns either a string or a list of ``(lemma, pos)`` tuples.
        spacy_doc: iterable of spaCy tokens (``text``, ``pos_`` and ``lemma_``
            are read).

    Returns:
        A list with one lemma string per token, in document order.
    """
    lemmas = []
    for t in spacy_doc:
        if t.pos_ != 'VERB':
            lemmas.append(t.lemma_)
            continue

        lefff_lemma = french_lefff_lemmatizer.lemmatize(t.text, spacy_to_lefff_pos[t.pos_])
        if isinstance(lefff_lemma, str):
            lemmas.append(lefff_lemma)
        elif lefff_lemma:
            # List of (lemma, pos) tuples: keep the lemma of the first candidate.
            lemmas.append(lefff_lemma[0][0])
        else:
            # Lefff has no entry for this form (empty result): fall back to the
            # spaCy lemma rather than putting an empty list among the strings.
            lemmas.append(t.lemma_)

    return lemmas
# Run the hybrid (spaCy + Lefff) lemmatizer on the sample text and show the lemmas.
print(
    french_lefff_lemmatizer_context_sensitive_wi_spacy_pos(
        french_lefff_lemmatizer,
        nlp(text),
    )
)

["J'", 'PRON', 'je', 'cln', "j'", [('il', 'cln')]]
['utilise', 'VERB', 'utilise', 'v', 'utilise', 'utiliser']
to fix
is a string so lemma is : utiliser
['mes', 'DET', 'mon', 'det', 'mes', [('son', 'det')]]
['connaissances', 'NOUN', 'connaissance', 'nc', 'connaissance', [('connaissance', 'nc')]]
['.', 'PUNCT', '.', 'poncts', '.', [('.', 'poncts')]]
['Et', 'CCONJ', 'et', 'CCONJ', 'et', []]
['nous', 'PRON', 'nous', 'cln', 'nous', [('il', 'cln')]]
['les', 'PRON', 'le', 'cln', 'les', []]
['appliquons', 'VERB', 'appliquer', 'v', 'appliquons', 'appliquer']
to fix
is a string so lemma is : appliquer
['.', 'PUNCT', '.', 'poncts', '.', [('.', 'poncts')]]
['je', 'utiliser', 'mon', 'connaissance', '.', 'et', 'nous', 'le', 'appliquer', '.']


* on our example, the lemmatization performs well

## spacy-lefff

**Custom French POS and lemmatizer based on Lefff for spacy**

* Source https://pypi.org/project/spacy-lefff/
* License MIT


In [1]:
# sudo pip3 install spacy-lefff
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
from spacy.language import Language

@Language.factory('french_lemmatizer')
def create_french_lemmatizer(nlp, name):
    """Factory registered with spaCy under the name 'french_lemmatizer'.

    ``nlp`` and ``name`` belong to the factory signature and are not used here.
    Returns a LefffLemmatizer built with ``after_melt=True, default=True``.
    """
    lemmatizer = LefffLemmatizer(after_melt=True, default=True)
    return lemmatizer

@Language.factory('melt_tagger')
def create_melt_tagger(nlp, name):
    """Factory registered with spaCy under the name 'melt_tagger'.

    ``nlp`` and ``name`` belong to the factory signature and are not used here.
    Returns a new POSTagger instance.
    """
    tagger = POSTagger()
    return tagger
 
# Start from the efficiency-oriented French model.
nlp = spacy.load("fr_core_news_sm")

# Plug the registered components in: the tagger right after the parser,
# then the Lefff lemmatizer right after the tagger.
nlp.add_pipe("melt_tagger", after="parser")
nlp.add_pipe("french_lemmatizer", after="melt_tagger")

doc = nlp("Apple cherche a acheter une startup anglaise pour 1 milliard de dollard")

# For each token: spaCy's POS, the melt_tagger tag, the Lefff lemma,
# then spaCy's fine-grained tag and lemma.
for tok in doc:
    print(tok.text, tok.pos_, tok._.melt_tagger, tok._.lefff_lemma, tok.tag_, tok.lemma_)

PermissionError: [Errno 13] Permission denied: '/usr/local/lib/python3.6/dist-packages/spacy_lefff/data/tagger'

* `PermissionError: [Errno 13] Permission denied: '/usr/local/lib/python3.6/dist-packages/spacy_lefff/data/tagger'`; the path `/usr/local/lib/python3.6/dist-packages/spacy_lefff/data/` exists, but there is no `tagger` in it.

## stanza aka Neural Stanford corenlp

Aside from the neural pipeline, this package also includes an official wrapper for accessing the Java Stanford CoreNLP software with Python code https://github.com/stanfordnlp/CoreNLP.

* 800 MB of parameters + 600 MB for the French model (stored in ~/stanza_resources/fr/default.zip)

* https://github.com/stanfordnlp/stanza

In [4]:
# sudo pip3 install stanza
import stanza

stanza.download("fr")        # fetch the French models for the neural pipeline
nlp = stanza.Pipeline("fr")  # build the default neural pipeline for French
doc = nlp(text)

# Dependency relations of the first sentence only.
doc.sentences[0].print_dependencies()

# One "text lemma pos" line per word, across all sentences.
for sent in doc.sentences:
    for w in sent.words:
        print(w.text, w.lemma, w.pos)



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 20.3MB/s]                    
2021-04-14 11:59:15 INFO: Downloading default packages for language: fr (French)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/fr/default.zip: 100%|██████████| 574M/574M [02:05<00:00, 4.59MB/s] 
2021-04-14 12:01:26 INFO: Finished downloading models and saved to /home/hernandez-n/stanza_resources.
2021-04-14 12:01:26 INFO: Loading these models for language: fr (French):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |
| ner       | wikiner |

2021-04-14 12:01:26 INFO: Use device: cpu
2021-04-14 12:01:26 INFO: Loading: tokenize
2021-04-14 12:01:26 INFO: Loading: mwt
2021-04-14 12:01:26 INFO: Loading: pos
2021-04-14 12:01:27 INFO: Loading: lemma
2021-04-14 12:01:27 INFO: Loading: depparse
2021-04-14 12:01:27 INF

("J'", 2, 'nsubj')
('utilise', 0, 'root')
('mes', 4, 'det')
('connaissances', 2, 'obj')
('.', 2, 'punct')
J' il PRON
utilise utiliser VERB
mes son DET
connaissances connaissance NOUN
. . PUNCT
Et et CCONJ
nous il PRON
les le PRON
appliquons appliquer VERB
. . PUNCT


* correct lemmatization on VERBs ; incorrect lemmatization for PRONouns


## treetagger-python

**A Python module for interfacing with the Treetagger by Helmut Schmid**

* Wrapper source https://github.com/miotto/treetagger-python (alternative exists)
* License GPL-v3
* TreeTagger source https://www.cis.lmu.de/~schmid/tools/TreeTagger/
* A neural version exists too. It lemmatizes all tokens. Lemmas of unknown tokens are guessed and are therefore not guaranteed to be always correct. Slower, requires PyTorch, requires a GPU for improved speed, larger parameter files https://www.cis.lmu.de/~schmid/tools/RNNTagger/  (3.1 GB)

## CLTK, The Classical Language Toolkit 

**The Classical Language Toolkit (CLTK) is a Python library offering natural language processing (NLP) for pre-modern languages.**

* Home https://github.com/cltk/cltk 
* Licence MIT
* Doc https://docs.cltk.org/en/latest/quickstart.html

In [1]:
# pip3 install --user cltk
from cltk import NLP

# Middle French pipeline. The ISO 639-3 code for Middle French is "frm"
# ("mfr" designates a different language), and CLTK is expected to select
# its pipeline from that code.
cltk_nlp = NLP(language="frm")
cltk_doc = cltk_nlp.analyze(text=text)
print(cltk_doc.lemmata)

ImportError: cannot import name 'NLP'

In [2]:
from cltk.lemmatize.french.lemma import LemmaReplace

ImportError: cannot import name 'LemmaReplace'

## textblob-fr

does not offer Words Inflection and Lemmatization for French

https://pypi.org/project/textblob-fr/

'''
>>> from textblob import Word
>>> w = Word("octopi")
>>> w.lemmatize()
'octopus'
>>> w = Word("went")
>>> w.lemmatize("v")  # Pass in WordNet part of speech (verb)

>>> from textblob import TextBlob
>>> from textblob_fr import PatternTagger, PatternAnalyzer
>>> text = u"Quelle belle matinée"
>>> blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
>>> blob.tags
[(u'Quelle', u'DT'), (u'belle', u'JJ'), (u'matin\xe9e', u'NN')]
>>> blob.sentiment
'''

## pattern
2.6 
https://github.com/clips/pattern

## Conclusions

En conclusion du tour d'horizon des solutions de lemmatisation en python 3 pour le traitement du français contemporain

* textblob-fr ne fournit pas de lemmatisation pour le français ; stanza en fournit une (correcte pour les verbes, incorrecte pour les pronoms).
* pattern 2.6 n'est pas encore compatible python 3
* je n'ai pu tester spacy-lefff, probablement pour des questions de configurations d'environnements de jupyter
* je n'ai pas pu tester non plus cltk. Malgré une installation pip sans erreur, problème à l'exécution ; dans tous les cas n'était pas pour du français contemporain 
* treetagger-python mis de côté car requiert de pré-installer treetagger ; 
* n'ai pas testé rnntagger le nouvel outil en python de l'auteur de treetagger car celui-ci fait plus de 3 Go
* N'ai pas testé le stanford corenlp (java) pour lequel stanza offre un wrapper et qui fait de la lemmatization https://github.com/stanfordnlp/CoreNLP

* spacy pour la robustesse, spacy+lefff pour la précision  