In [1]:
import os
import pprint
from collections import defaultdict

import pandas as pd
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
from googletrans import Translator
from langdetect import detect
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words


#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')


In [2]:
# Corpus: tab-separated article export loaded into a DataFrame.
df = pd.read_csv('export_articles_EGC_2004_2018.csv', sep='\t')

# French language resources shared by the cells below.
stop_words = get_stop_words('fr')
lemmatizer = FrenchLefffLemmatizer()

# Client used to translate non-French text into French.
translator = Translator()

In [3]:
# Smoke test of the translation client: translate an English word into French
# and display the translated text as the cell output.
translator.translate('Hello', dest='fr').text

'Bonjour'

In [4]:
%%time

def languagesDetectionDF(column, frame=None):
    """Count the detected language of every text cell in one DataFrame column.

    Parameters
    ----------
    column : str
        Name of the column to scan.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which is the original behaviour.

    Returns
    -------
    collections.defaultdict
        Maps each language code returned by ``detect`` to the number of
        cells detected as that language.
    """
    source = df if frame is None else frame
    languageCounts = defaultdict(int)
    for cell in source[column]:
        # Non-string cells (e.g. missing values) are ignored. Blank strings
        # are skipped too: langdetect raises when the text has no usable
        # features, which would abort the whole scan.
        if isinstance(cell, str) and cell.strip():
            languageCounts[detect(cell)] += 1
    return languageCounts

# Language distribution of the titles, then of the abstracts.
languageTitle, languageAbstract = (
    languagesDetectionDF(name) for name in ("title", "abstract")
)

CPU times: user 9.04 s, sys: 129 ms, total: 9.17 s
Wall time: 9.16 s


In [5]:
# Report the per-language counts computed for the titles, then for the abstracts.
print("Langage des titres :", languageTitle)

print("Langage des abstract :", languageAbstract)

Langage des titres : defaultdict(<class 'int'>, {'fr': 1132, 'en': 124, 'tl': 1, 'it': 2, 'ca': 5, 'de': 2, 'fi': 1, 'da': 1, 'es': 1})
Langage des abstract : defaultdict(<class 'int'>, {'fr': 991, 'en': 105})


In [6]:
# Number of non-null values of every column, per publication year.
invYr = df.groupby(['year']).count()

# Mean, standard deviation, minimum and maximum of the yearly abstract count,
# printed one value per line.
print(
    *(getattr(invYr['abstract'], stat)() for stat in ('mean', 'std', 'min', 'max')),
    sep='\n',
)

73.06666666666666
13.884558054938314
56
98


In [7]:
# French POS tagger: Stanford tagger jar and its French model.
jar = 'stanford-postagger-full-2018-10-16/stanford-postagger-3.9.2.jar'
model = 'stanford-postagger-full-2018-10-16/models/french.tagger'
# Expose the Java executable through JAVAHOME (presumably read by NLTK to
# launch the Java-based tagger; adjust the path for your machine).
java_path = "/usr/bin/java"
os.environ['JAVAHOME'] = java_path

# This call is very slow.
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

In [8]:
%%time
def wordNetTag(tag):
    """Map a POS tag to the matching WordNet POS constant.

    Penn Treebank style prefixes (``J``, ``V``, ``N``, ``R``) are still
    recognised. Tags starting with ``ADJ`` / ``ADV`` are now mapped to
    adjective / adverb as well; previously they fell through to ``None``.

    NOTE(review): the French Stanford model loaded in this notebook is
    expected to emit ``ADJ``/``ADV``-style tags rather than Penn ``JJ``/``RB``
    -- confirm against the model's tagset.

    Parameters
    ----------
    tag : str or None
        POS tag produced by the tagger.

    Returns
    -------
    str or None
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``; ``None`` when the tag is empty or has no WordNet
        counterpart.
    """
    if not tag:
        return None
    if tag.startswith(('J', 'ADJ')):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith(('R', 'ADV')):
        return wordnet.ADV
    return None
    
def lemma(tokenizedText):
    """Lemmatize a list of tokens using their part-of-speech tags.

    The tokens are tagged with ``pos_tagger``; each tag is converted with
    ``wordNetTag`` and, when a WordNet POS is available, passed to
    ``lemmatizer.lemmatize`` together with the word. Otherwise the
    lemmatizer is called with the word only.

    Parameters
    ----------
    tokenizedText : list of str
        Tokens of a single text.

    Returns
    -------
    list
        One lemmatizer result per input token, in input order. An empty
        input yields an empty list without invoking the tagger.
    """
    if not tokenizedText:
        # Nothing to tag: avoid a tagger call that cannot produce anything.
        return []
    lemmas = []
    for word, posTag in pos_tagger.tag(tokenizedText):
        wordnetPos = wordNetTag(posTag)
        if wordnetPos:
            lemmas.append(lemmatizer.lemmatize(word, wordnetPos))
        else:
            lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


In [35]:
%%time

# Regex tokenizer built on the pattern \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

def preprocess(txt, min_chars=5):
    """Turn one raw text into a list of French lemmas.

    Steps: translate the text to French when ``detect`` reports another
    language, tokenize it, keep purely alphabetic tokens of at least
    ``min_chars`` characters, lowercase them, drop stop words, then
    lemmatize what remains with ``lemma``.

    Parameters
    ----------
    txt : object
        Cell value to process. Anything that is not a string is treated as
        missing.
    min_chars : int, optional
        Minimum token length to keep. The default of 5 matches the original
        ``len(token) > 4`` filter.

    Returns
    -------
    list or str
        The lemmas of the text. ``''`` is returned for non-string input
        (kept for backward compatibility); an empty list is returned when the
        text is blank or no token survives the filters.
    """
    if not isinstance(txt, str):
        return ''
    if not txt.strip():
        # Language detection raises on text without usable features.
        return []
    if detect(txt) != 'fr':
        txt = translator.translate(txt, dest='fr').text

    tokens = []
    for token in tokenizer.tokenize(txt):
        # Drop short tokens and tokens containing digits or underscores.
        if len(token) < min_chars or not token.isalpha():
            continue
        lowered = token.lower()
        # Stop words are matched on the lowercased form.
        if lowered not in stop_words:
            tokens.append(lowered)

    if not tokens:
        # Nothing left to lemmatize: skip the tagger call.
        return []
    return lemma(tokens)

def lemmatizerDF(dfColumn):
    """Lemmatize the French cells of a column, leaving other cells untouched.

    Parameters
    ----------
    dfColumn : iterable
        Cell values of one column.

    Returns
    -------
    list of str
        One entry per input cell: the space-joined lemmas for French text,
        the original string for blank or non-French text, and ``''`` for
        non-string cells.
    """
    columnLemma = []
    for cell in dfColumn:
        if not isinstance(cell, str):
            columnLemma.append('')
        elif not cell.strip() or detect(cell) != 'fr':
            # Blank text cannot be language-detected; like non-French text it
            # is kept as-is.
            columnLemma.append(cell)
        else:
            # Lowercase BEFORE the stop-word test: the membership check is
            # case-sensitive, so capitalised stop words used to slip through.
            words = [w.lower() for w in word_tokenize(cell)]
            kept = [w for w in words if w not in stop_words]
            columnLemma.append(' '.join(lemma(kept)))
    return columnLemma


CPU times: user 14 µs, sys: 6 µs, total: 20 µs
Wall time: 25.5 µs


In [36]:
def lemmatizerDF1(dfColumn):
    """Apply ``preprocess`` to every cell and join each result into a string.

    This used to duplicate the ``preprocess`` pipeline line for line; it now
    delegates to it so the two cannot drift apart.

    Parameters
    ----------
    dfColumn : iterable
        Cell values of one column.

    Returns
    -------
    list of str
        One space-joined lemma string per input cell (``''`` for cells that
        ``preprocess`` treats as missing or empty).
    """
    # ' '.join('') == '' so the non-text fallback of preprocess maps to ''.
    return [' '.join(preprocess(cell)) for cell in dfColumn]

In [29]:
%%time

# Very slow: each tagger call takes roughly half a second and the tagger is
# invoked once per row.
titleLemma = df["title"].apply(preprocess)

CPU times: user 7.23 s, sys: 8.78 s, total: 16 s
Wall time: 9min 52s


In [37]:
%%time

# Very slow: each tagger call takes roughly half a second and the tagger is
# invoked once per row.
abstractLemma = df["abstract"].apply(preprocess)

CPU times: user 7.88 s, sys: 7.48 s, total: 15.4 s
Wall time: 10min 22s


In [38]:
# preprocess returns a list of lemmas per row (or '' for non-text cells).
# Store BOTH columns as space-separated strings: the abstract column used to be
# stored as raw Python lists, unlike the title column.
df["titleLemma"] = [" ".join(cell) for cell in titleLemma]
df["abstractLemma"] = [" ".join(cell) for cell in abstractLemma]

In [39]:
# Write the frame to CSV. No `index` argument is passed, so pandas' default
# applies and the row index is written as well.
df.to_csv('dataLemmatized.csv')