In [16]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download the NLTK resources this notebook depends on.
# The Punkt models back nltk.word_tokenize(), which preprocess_text() calls;
# without them the tokenizer raises LookupError on a machine that has never
# fetched them. Newer NLTK releases look for 'punkt_tab', older ones for
# 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read by stopwords.words('english').
nltk.download('stopwords')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oumaima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
# Load the dataset
# NOTE(review): later cells read the 'text' and 'tag' columns, so the CSV must provide both.
df = pd.read_csv('movie_review.csv')
# NOTE(review): looks like a leftover debug print — consider replacing it with df.head().
print("hello")

hello


In [34]:
# Pre-processing of the text data
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Turn a raw review into a list of lowercase tokens without stop words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example None, or the float
        NaN that pandas stores for an empty CSV cell) is treated as an empty
        document instead of raising AttributeError on ``.lower()``.

    Returns
    -------
    list of str
        Tokens in their original order, lowercased, with punctuation stripped
        and English stop words removed. Empty when there is nothing to keep.
    """
    # Guard first: missing values must not crash the whole DataFrame.apply().
    if not isinstance(text, str):
        return []
    # Lowercase so the stop-word comparison below is case-insensitive.
    text = text.lower()
    # Drop every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.word_tokenize(text)
    # The stop-word set is cached so it is not rebuilt for every review.
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# Tokenise every review: each cell of 'processed_text' holds the value
# returned by preprocess_text for the matching 'text' cell.
df['processed_text'] = df['text'].apply(preprocess_text)

In [33]:
# Train the Word2Vec model on the tokenised reviews.
# vector_size=100 must stay in step with the vector length used by vectorize_text.
# NOTE(review): workers=4 presumably makes training not bit-for-bit repeatable
# between runs — confirm against the gensim documentation.
model = Word2Vec(sentences=df['processed_text'], vector_size=100, window=5, min_count=1, workers=4)


In [32]:
# Vectorisation of the movie reviews
def vectorize_text(text):
    """Embed a tokenised review as the mean of its Word2Vec word vectors.

    Parameters
    ----------
    text : iterable of str
        Tokens of one review. Tokens absent from the trained model's
        vocabulary are ignored.

    Returns
    -------
    numpy.ndarray
        1-D vector: the element-wise mean of the vectors of the known tokens,
        or all zeros when no token is known (including an empty review).
    """
    known_vectors = [model.wv[word] for word in text if word in model.wv]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Take the fallback length from the trained model rather than repeating the
    # literal 100, so it cannot drift from the vector_size used for training.
    return np.zeros(model.wv.vector_size)

# Embed every tokenised review: each cell of 'vectorized_text' holds the value
# returned by vectorize_text for the matching 'processed_text' cell.
df['vectorized_text'] = df['processed_text'].apply(vectorize_text)


In [31]:
# Assemble the feature matrix (one row per review) and the label vector
X = np.array(list(df['vectorized_text']))
y = np.array(df['tag'])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from one run to the next
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [30]:
# Build the classifier: logistic regression with the solver capped at 1000
# iterations, fitted on the training split only.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)


In [29]:
# Predictions: labels the fitted classifier assigns to the held-out test features
y_pred = classifier.predict(X_test)


In [28]:
# Model evaluation on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 all use the same 'weighted' averaging, so they are
# computed in one pass over the three scorer functions.
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)


In [27]:
# Display: one header line, then one line per metric.
# These values were computed from y_test / y_pred, i.e. on the test split.
print(
    "Résultats de l'entraînement :",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 score: {f1}",
    sep="\n",
)


Résultats de l'entraînement :
Accuracy: 0.5718479604449939
Precision: 0.5739487323439882
Recall: 0.5718479604449939
F1 score: 0.5658843573479401
