In [4]:
import nltk
nltk.download('all')
import spacy
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

# Téléchargement des ressources nécessaires
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('tagsets')

# Chargement du modèle spaCy
nlp = spacy.load('en_core_web_sm')

# Création du DataFrame
data = {
    'Review': [
        'At McDonald\'s the food was ok and the service was bad.',
        'I would not recommend this Japanese restaurant to anyone.',
        'I loved this restaurant when I traveled to Thailand last summer.',
        'The menu of Loving has a wide variety of options.',
        'The staff was friendly and helpful at Google\'s employees restaurant.',
        'The ambiance at Bella Italia is amazing, and the pasta dishes are delicious.',
        'I had a terrible experience at Pizza Hut. The pizza was burnt, and the service was slow.',
        'The sushi at Sushi Express is always fresh and flavorful.',
        'The steakhouse on Main Street has a cozy atmosphere and excellent steaks.',
        'The dessert selection at Sweet Treats is to die for!'
    ]
}

df = pd.DataFrame(data)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/ludovicveltz/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/ludovicveltz/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/ludovicveltz/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/ludovicveltz/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/ludovicveltz/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_

In [2]:
# 1. Fonction de prétraitement
def preprocess_text(text):
    # Conversion en minuscules
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Suppression de la ponctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Reconstitution du texte
    return ' '.join(tokens)

In [5]:
# 2. Création du nouveau dataset avec le texte prétraité
df['Preprocessed_Review'] = df['Review'].apply(preprocess_text)


In [6]:
# 3. Fonction pour NER
def perform_ner(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [7]:
# 4. Fonction pour POS tagging
def perform_pos_tagging(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    return pos_tags

In [None]:
# 5. Application des fonctions et analyse

print("Exemple de prétraitement :")
print("Original :", df['Review'][0])
print("Prétraité :", df['Preprocessed_Review'][0])
print("\n")

print("Exemple de NER sur texte original :")
print("Texte :", df['Review'][0])
print("Entités :", perform_ner(df['Review'][0]))
print("\n")

print("Exemple de NER sur texte prétraité :")
print("Texte :", df['Preprocessed_Review'][0])
print("Entités :", perform_ner(df['Preprocessed_Review'][0]))
print("\n")

print("Exemple de POS tagging sur texte original :")
print("Texte :", df['Review'][0])
print("Tags :", perform_pos_tagging(df['Review'][0]))
print("\n")

print("Exemple de POS tagging sur texte prétraité :")
print("Texte :", df['Preprocessed_Review'][0])
print("Tags :", perform_pos_tagging(df['Preprocessed_Review'][0]))

# Affichage de la signification des tags POS
print("\nSignification des tags POS courants :")
nltk.help.upenn_tagset('NN')

Exemple de prétraitement :
Original : At McDonald's the food was ok and the service was bad.
Prétraité : at mcdonald 's the food wa ok and the service wa bad


Exemple de NER sur texte original :
Texte : At McDonald's the food was ok and the service was bad.
Entités : [("McDonald's", 'ORG')]


Exemple de NER sur texte prétraité :
Texte : at mcdonald 's the food wa ok and the service wa bad
Entités : [("mcdonald 's", 'ORG')]


Exemple de POS tagging sur texte original :
Texte : At McDonald's the food was ok and the service was bad.
Tags : [('At', 'IN'), ('McDonald', 'NNP'), ("'s", 'POS'), ('the', 'DT'), ('food', 'NN'), ('was', 'VBD'), ('ok', 'JJ'), ('and', 'CC'), ('the', 'DT'), ('service', 'NN'), ('was', 'VBD'), ('bad', 'JJ'), ('.', '.')]


Exemple de POS tagging sur texte prétraité :
Texte : at mcdonald 's the food wa ok and the service wa bad
Tags : [('at', 'IN'), ('mcdonald', 'NN'), ("'s", 'POS'), ('the', 'DT'), ('food', 'NN'), ('wa', 'NN'), ('ok', 'NN'), ('and', 'CC'), ('the', 'DT')

: 

Analyse des résultats :

Prétraitement :
- Convertit tout en minuscules
- Supprime la ponctuation
- Applique la lemmatisation pour obtenir la forme de base des mots
- Le texte est plus propre et standardisé

NER (Named Entity Recognition) :
- Sur le texte original : détecte mieux les entités nommées (McDonald's, Japanese, Thailand, etc.)
- Sur le texte prétraité : moins efficace car la casse et la ponctuation sont des indices importants

POS (Part of Speech) Tagging :
- Sur le texte original : donne une meilleure analyse grammaticale
- Sur le texte prétraité : moins précis mais toujours utile pour l'analyse basique