# **TP 3 : NLP Classification de texte**

1. **Import Libraries :**

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

2. **Import data file :**

In [2]:
# Location of the movie-review CSV under the Kaggle input directory.
file_path = '/kaggle/input/movie-review/movie_review.csv'

# Read only the two columns the rest of the notebook uses: the review text
# and the `tag` column that later serves as the classification target.
df = pd.read_csv(
    file_path,
    usecols=['text', 'tag'],
)

3. **Pre-processing des données textuelles :**

In [3]:
# Make sure the NLTK resources used below are available. Look for a local copy
# first and only go to the network when one is missing: in an offline kernel an
# unconditional nltk.download() just logs name-resolution errors on every run.
for resource_path, package in (('tokenizers/punkt', 'punkt'),
                               ('corpora/stopwords', 'stopwords')):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

# English stop words ("is", "the", ...) held in a set for fast membership
# tests while filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn a raw string into a list of cleaned tokens.

    The text is lower-cased, stripped of every character found in
    ``string.punctuation``, tokenized with ``word_tokenize`` and finally
    filtered against the module-level ``stop_words`` set.

    Args:
        text: The raw string to clean.

    Returns:
        list: The remaining tokens, in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


4. **Entraînement du modèle Word2Vec :**

In [4]:
# Prétraitez les données avant d'entraîner Word2Vec
# Clean and tokenize every review up front; the resulting list of token lists
# is what Word2Vec is trained on.
preprocessed_data = list(map(preprocess_text, df['text']))

# Fit the Word2Vec embeddings on the tokenized reviews.
word2vec_model = Word2Vec(
    sentences=preprocessed_data,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

5. **Vectorisation des critiques de films :**

In [5]:
def vectorize_text(text, model):
    """Return the average Word2Vec embedding of a document.

    Args:
        text: Either a raw string, which is first run through
            ``preprocess_text``, or an iterable of already-preprocessed
            tokens, which is used as-is. Passing tokens lets callers reuse
            a previous preprocessing pass instead of tokenizing the same
            document a second time.
        model: A trained model exposing ``vector_size`` and a ``wv``
            keyed-vector store (``wv.key_to_index`` and ``wv[word]``).

    Returns:
        numpy.ndarray: A vector of shape ``(model.vector_size,)`` holding
        the mean of the vectors of the tokens found in the vocabulary, or
        all zeros when no token is known.
    """
    # Only raw strings need cleaning/tokenizing; token iterables are trusted.
    tokens = preprocess_text(text) if isinstance(text, str) else list(text)

    total = np.zeros((model.vector_size,))
    known = 0
    for token in tokens:
        # Skip out-of-vocabulary tokens so they do not dilute the average.
        if token in model.wv.key_to_index:
            total += model.wv[token]
            known += 1

    # Guard against division by zero when nothing was in the vocabulary.
    if known:
        total /= known
    return total
# Build one averaged embedding per review, in the same order as df['text'].
vectors = []
for review in df['text']:
    vectors.append(vectorize_text(review, word2vec_model))

6. **Division des données :**

In [6]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    df['tag'],
    test_size=0.2,
    random_state=42,
)

7. **Construction d'un classificateur :**

In [7]:
# Create the logistic-regression classifier and fit it on the training
# vectors in one step (fit() returns the estimator itself).
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
classifier

8. **Évaluation du modèle :**

In [8]:
# Predict labels for the held-out reviews.
y_pred = classifier.predict(X_test)

# Accuracy takes no averaging option; the other three scores are
# weighted-averaged across classes.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report each metric on its own line.
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)

Accuracy: 0.5738566131025958
Precision: 0.5758785763735338
Recall: 0.5738566131025958
F1 Score: 0.5682661470086734
