# Phase de pretraitement des données textuelles

In [3]:
import pandas as pd
from clean import clean_claimKG

import nltk
nltk.download('all')
!pip3 install inflect
import inflect 
import contractions as c
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import sys

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/massy/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/massy/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/massy/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /home/massy/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /home/massy/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /home/massy/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to
[nltk_data]   



In [6]:
file_name = "../data/claimKG.csv"

# Lecture du fichier
kg_origin = pd.read_csv(file_name)
kg= kg_origin.copy()
kg = clean_claimKG(kg)

In [7]:
claims = kg['claimReview_claimReviewed']
claims_text = []
for k,v in claims.sample(10).items():
    claims_text.append(v)

for claim in claims_text:
    print("->",claim)

-> In 2005 and 2007, "" Joe Straus received a 100 percent rating by NARAL (the National Abortion and Reproductive Rights Action League).""
-> Says that except for Donald Trump, ""every other major party nominee"" for the past 40 years has released their tax returns.
-> Says that 500,000 federal workers -- one-fourth of the federal workforce -- make more than $100,000 a year.
-> A Clinton Foundation cargo ship arriving from Africa was raided and found to contain ""illegal contraband"" in the form of foreign refugees, narcotics, weapons, and illegal fruits.
-> ""North Carolina last year was second in the nation in overdose deaths""
-> A photograph shows a female wolf protecting a male's throat during a fight.
-> Harry Reid Was Injured in a Fight With His Brother
-> ""While (Barack) Obama preaches ‘we are our brother’s keeper,’ his brother and aunt live in real poverty in Kenya.""
-> An inattentive janitor caused several deaths in a hospital when he disconnected patients' life support sys

## Tokenization

Découpage de l'assertion en Token (en mots).

In [8]:
def tokenize(text):
    return nltk.word_tokenize(text)

tokenize(claims_text[8])

['An',
 'inattentive',
 'janitor',
 'caused',
 'several',
 'deaths',
 'in',
 'a',
 'hospital',
 'when',
 'he',
 'disconnected',
 'patients',
 "'",
 'life',
 'support',
 'systems',
 'to',
 'plug',
 'in',
 'a',
 'floor',
 'polisher',
 '.']

## Mise en miniscule

La mise en miniscule peut-être util dans certain cas.

In [9]:
def lowercase(text):
    return text.lower()

lowercase(claims_text[2])

'says that 500,000 federal workers -- one-fourth of the federal workforce -- make more than $100,000 a year.'

## numbers to words

Transformer les nombres en mots.

In [22]:
def number_to_words(text):
    return inflect.engine().number_to_words(text)

print(number_to_words("15.2"))

print(claims_text[0])

def number2words(text):
    tokens = tokenize(text)
    for i,m in enumerate(tokens):
        try:
            float(m)
        except ValueError:
            continue
        else:
            tokens[i] = number_to_words(m)
    return ' '.join(tokens)

print(number2words(claims_text[0]))

fifteen point two
In 2005 and 2007, "" Joe Straus received a 100 percent rating by NARAL (the National Abortion and Reproductive Rights Action League).""
In two thousand and five and two thousand and seven , `` '' Joe Straus received a one hundred percent rating by NARAL ( the National Abortion and Reproductive Rights Action League ) . '' ''


## Traitement des contraction et ponctuation

La suppression des ponctuations peut avoir des conséquences sur le qualité du modèle, par exemple dans la détection des opinions. Il est préferable de traiter d'abord les contracions dans les phrases avant de supprimer les ponctuations.

In [11]:
import contractions as c
def contractions(text):
    return c.fix(text)

print(contractions("couldn't"))

def ponctuations(text):
    return re.sub(r'[^\w\s]', ' ', text)

print(ponctuations(claims_text[5]))

could not
A photograph shows a female wolf protecting a male s throat during a fight 


## Stopwords

Supprimer les mots les plus fréquent de la langue. Dans notre cas:`the`, `a`, `an`, `in` ...

NB: Dans les stopwords fournit par défault par NLTK contient les formes de négation.

In [12]:
stopwords.words('english')
stopwords.words('french')

def remove_stopwords(text_tokenized,language='english'):
    stop_words = set(stopwords.words(language))
  
    return [w for w in text_tokenized if not w in stop_words]

print(claims_text[8])
claim = ponctuations(claims_text[8])
claim = tokenize(claim)
claim = remove_stopwords(claim)
print(' '.join(claim))

An inattentive janitor caused several deaths in a hospital when he disconnected patients' life support systems to plug in a floor polisher.
An inattentive janitor caused several deaths hospital disconnected patients life support systems plug floor polisher


## Stemmatisation

Le stemmatisation (racinisation en français) vise à garder la racine du mot. La racine d’un mot correspond à la partie du mot restante une fois que l’on a supprimé son (ses) préfixe(s) et suffixe(s), à savoir son radical. 
Plusieurs variantes d'un terme peuvent ainsi être groupées dans une seule forme représentative.

Il existe plusieurs algorithmes de stemmatisation, celui utiliser ici est `SnowBall Stemmer`. Mais il existe aussi `Lancaster Stemmer` qui est considérer comme plus agresif.

In [19]:
def stem(text_tokenized, language='english',stemmer_name='snowball',verbose=False):
    if stemmer_name == 'snowball':
        if verbose:
            print('Snowball stemmer used!')
        stemmer = SnowballStemmer(language=language) 
    elif stemmer_name == 'lancaster':
        if language != 'english':
            print("LancasterStemmer do not suport "+language, file=sys.stderr)
            raise ValueError()
        stemmer = LancasterStemmer()
    return [stemmer.stem(term) for term in text_tokenized]

claim = claims_text[0]
print(claim)
claim = ponctuations(claim)
claim = tokenize(claim)
claim = stem(claim,stemmer_name='snowball',verbose=True)
print(' '.join(claim))

In 2005 and 2007, "" Joe Straus received a 100 percent rating by NARAL (the National Abortion and Reproductive Rights Action League).""
Snowball stemmer used!
in 2005 and 2007 joe straus receiv a 100 percent rate by naral the nation abort and reproduct right action leagu


## Lemmatisation

La stemmatisation et la lemmatisation sont deux notions proches, mais il y a des différences fondamentales.
La lemmatisation a pour objectif de retrouver le lemme d'un mot, par exemple l'infinitif pour les verbes. La racinisation consiste à supprimer la fin des mots, ce qui peut résulter en un mot qui n'existe pas dans la langue.

NB: La lemmatisation foncionnent beaucoup mieux si chaque mot vient avec son tag parts-of-speech (POS).

In [34]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lem(words, pos_tag=[]):
    lemmatizer = WordNetLemmatizer()
    if len(words) == len(pos_tag):
        lem = []
        for w, pos in zip(words,pos_tag):
            if pos is None:
               lem.append(lemmatizer.lemmatize(w))
            else:
               lem.append(lemmatizer.lemmatize(w, pos=pos))
        return lem
    else:
        return [lemmatizer.lemmatize(w) for w in words]
  
claim = claims_text[8]
print(claim)
claim = ponctuations(claim)
claim = tokenize(claim)

pos = pos_tag(claim)

claim = lem(claim,[get_wordnet_pos(p[1]) for p in pos])
print(' '.join(claim))

An inattentive janitor caused several deaths in a hospital when he disconnected patients' life support systems to plug in a floor polisher.
An inattentive janitor cause several death in a hospital when he disconnect patient life support system to plug in a floor polisher


## Pos-tagging

L'étiquetage morpho-syntaxique est le processus qui consiste à associer aux mots d'un texte les informations grammaticales correspondantes comme la partie du discours, le genre, le nombre, etc.

In [35]:
def pos_tag(words):
    return nltk.pos_tag(words)

claim = claims_text[8]
print(claim)
claim = ponctuations(claim)
claim = tokenize(claim)
pos = pos_tag(claim)
pos

An inattentive janitor caused several deaths in a hospital when he disconnected patients' life support systems to plug in a floor polisher.


[('An', 'DT'),
 ('inattentive', 'JJ'),
 ('janitor', 'NN'),
 ('caused', 'VBD'),
 ('several', 'JJ'),
 ('deaths', 'NNS'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('hospital', 'NN'),
 ('when', 'WRB'),
 ('he', 'PRP'),
 ('disconnected', 'VBD'),
 ('patients', 'NNS'),
 ('life', 'NN'),
 ('support', 'NN'),
 ('systems', 'NNS'),
 ('to', 'TO'),
 ('plug', 'VB'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('floor', 'NN'),
 ('polisher', 'NN')]

## Fonction de prétraitement

La fonction de prétraitement va utiliser une combinaison des fonctions citées plus haut.
L'idée est donc d'entraîner le modèle avec quelques combinaisons pour voir leurs effets sur le processus d'apprentissage. Cette fonction va prendre en paramètre un texte brut et donner en résultat une liste de token, potentiellement accompagné avec leur tag POS.

In [49]:
def pretraitement(text_brut):
    text = ponctuations(text_brut)
    
    text_tokenized = tokenize(text)
    
    text_tagged = pos_tag(text_tokenized)

    text_lematized = lem(text_tokenized,[get_wordnet_pos(p[1]) for p in text_tagged])
    
    return [(text_lematized[i],p) for i,(w,p) in enumerate(text_tagged)]

for claim in claims_text:
    print("Claim: ",claim)
    pre = pretraitement(claim)
    for p in pre:
        print(p)
    print("\n")

Claim:  In 2005 and 2007, "" Joe Straus received a 100 percent rating by NARAL (the National Abortion and Reproductive Rights Action League).""
('In', 'IN')
('2005', 'CD')
('and', 'CC')
('2007', 'CD')
('Joe', 'NNP')
('Straus', 'NNP')
('receive', 'VBD')
('a', 'DT')
('100', 'CD')
('percent', 'NN')
('rating', 'NN')
('by', 'IN')
('NARAL', 'NNP')
('the', 'DT')
('National', 'NNP')
('Abortion', 'NNP')
('and', 'CC')
('Reproductive', 'NNP')
('Rights', 'NNP')
('Action', 'NNP')
('League', 'NNP')


Claim:  Says that except for Donald Trump, ""every other major party nominee"" for the past 40 years has released their tax returns.
('Says', 'VBZ')
('that', 'WDT')
('except', 'IN')
('for', 'IN')
('Donald', 'NNP')
('Trump', 'NNP')
('every', 'DT')
('other', 'JJ')
('major', 'JJ')
('party', 'NN')
('nominee', 'NN')
('for', 'IN')
('the', 'DT')
('past', 'JJ')
('40', 'CD')
('year', 'NNS')
('have', 'VBZ')
('release', 'VBN')
('their', 'PRP$')
('tax', 'NN')
('return', 'NNS')


Claim:  Says that 500,000 federal wo