In [None]:
import nltk
import numpy as np
import operator as op
import pandas as pd
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import io
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
# Data loading: locations of the test / dev / train XML files on the mounted Drive
file_test  = '/content/drive/My Drive/Analyse_des_sentiment/data/test.xml'
file_dev   = '/content/drive/My Drive/Analyse_des_sentiment/data/dev.xml'
file_train = '/content/drive/My Drive/Analyse_des_sentiment/data/train.xml'

# Empty accumulators for messages, labels and ids
liste_message = []
liste_label = []
liste_id = []

In [None]:
# Mount Google Drive at /content/drive; the data file paths used in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def read_xml_to_dataframe(file_path):
    """Parse a labelled review XML file into a DataFrame.

    Every ``<comment>`` child of the root element becomes one row with the
    columns ``movie``, ``review_id``, ``name``, ``user_id``, ``note`` and
    ``commentaire``. Values are the raw element texts; in ``note`` the decimal
    comma is replaced by a dot so the column can be parsed as a number later.

    A child element that is missing or has no text yields ``None`` for that
    field instead of raising ``AttributeError``. The six columns are always
    present, even when the file contains no ``<comment>`` at all.

    Args:
        file_path: path (or file object) accepted by ``ET.parse``.

    Returns:
        pandas.DataFrame with one row per comment.
    """
    fields = ['movie', 'review_id', 'name', 'user_id', 'note', 'commentaire']
    root = ET.parse(file_path).getroot()

    rows = []
    for comment in root.findall('comment'):
        row = {}
        for field in fields:
            node = comment.find(field)
            # node.text is None for an empty element such as <note/>
            row[field] = None if node is None else node.text
        if row['note'] is not None:
            row['note'] = row['note'].replace(',', '.')
        rows.append(row)

    # Passing columns keeps the schema stable for an empty input file
    return pd.DataFrame(rows, columns=fields)

In [None]:
#fonction pour le fichier de test
def read_xml_to_dataframeTest(file_path):
    """Parse a review XML file whose comments may lack some fields (test set).

    Every ``<comment>`` child of the root element becomes one row. A field
    (``movie``, ``review_id``, ``name``, ``user_id``, ``note``,
    ``commentaire``) is only added to the row when the corresponding child
    element exists, so a column that never occurs in the file is absent from
    the result. In ``note`` the decimal comma is replaced by a dot; an empty
    ``<note/>`` element yields ``None`` instead of raising ``AttributeError``.

    Args:
        file_path: path (or file object) accepted by ``ET.parse``.

    Returns:
        pandas.DataFrame with one row per comment.
    """
    fields = ('movie', 'review_id', 'name', 'user_id', 'note', 'commentaire')
    root = ET.parse(file_path).getroot()

    rows = []
    for comment in root.findall('comment'):
        row = {}
        for field in fields:
            node = comment.find(field)
            if node is None:
                # Field not present for this comment: leave the key out
                continue
            value = node.text
            if field == 'note' and value is not None:
                value = value.replace(',', '.')
            row[field] = value
        rows.append(row)

    return pd.DataFrame(rows)

# Chargement des données dans des DataFrames

In [None]:
# Build one DataFrame per split
data_dev   = read_xml_to_dataframe(file_dev)
data_train = read_xml_to_dataframe(file_train)
data_test  = read_xml_to_dataframeTest(file_test)

# Comments without text come back as missing values; use an empty string instead
for frame in (data_train, data_dev, data_test):
    frame['commentaire'] = frame['commentaire'].fillna('')

In [None]:
# Parse the rating strings as numbers (unparseable values become NaN) and
# double them. NOTE: re-running this cell doubles the ratings again.
for frame in (data_train, data_dev):
    frame['note'] = pd.to_numeric(frame['note'], errors='coerce') * 2

In [None]:
# Re-read the test split and blank out missing comments
# (repeats what the DataFrame-creation cell already did for data_test).
data_test = read_xml_to_dataframeTest(file_test).assign(
    commentaire=lambda frame: frame['commentaire'].fillna('')
)

In [None]:
# Training targets: the doubled ratings, shifted down by one so they can be
# used as class indices (presumably 0..9, matching num_classes = 10 below —
# TODO confirm the rating range in the data).
Y_train  = data_train['note']
y_train_adjusted = Y_train - 1

In [None]:
# Dev targets, built the same way as the training targets (doubled rating - 1)
Y_dev  = data_dev['note']
y_dev_adjusted = Y_dev - 1
# Display the number of dev labels
y_dev_adjusted.shape

(100400,)

In [None]:
# Display the number of training labels
y_train_adjusted.shape

(665962,)

### **Prétraitement des Commentaires Textuels avec Keras : Tokenisation et Remplissage**

In [None]:
#
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Length every comment is padded / truncated to
MAX_LENGTH = 500

# Fit the Keras tokenizer on the training comments only (vocabulary capped with num_words=5000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data_train['commentaire'])

# Turn each training comment into a list of integer word ids
X_train = tokenizer.texts_to_sequences(data_train['commentaire'])

# Pad at the end / cut the tail so that every row has exactly MAX_LENGTH ids
X_train_padded = pad_sequences(
    X_train,
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
)

# Show the result
X_train_padded

In [None]:
# Convert the dev comments to integer sequences with the tokenizer fitted on
# the training comments
X_dev = tokenizer.texts_to_sequences(data_dev['commentaire'])

# Pad / truncate to MAX_LENGTH at the end of each sequence, as for the training data
X_dev_padded = pad_sequences(X_dev, maxlen=MAX_LENGTH, padding='post', truncating='post')


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Classe du modèle LSTM
class CommentClassifierLSTM(nn.Module):
    """LSTM classifier over padded sequences of integer token ids.

    Token id 0 is treated as padding. The classification head reads the LSTM
    output at the last non-padding position of each sequence. The previous
    version always read the final time step, which is padding for every
    right-padded sequence shorter than the maximum length.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: LSTM hidden size.
        num_classes: number of output logits.
        embed_size: embedding dimension.
        num_layers: number of stacked LSTM layers (dropout 0.5 between layers).
    """

    def __init__(self, input_size, hidden_size, num_classes, embed_size, num_layers):
        super(CommentClassifierLSTM, self).__init__()
        # padding_idx=0 keeps the padding vector at zero and out of the gradient
        self.embedding = nn.Embedding(input_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ids ``x`` of shape (batch, seq_len)."""
        # Index of the last non-zero id in each row (0 when the row is all padding)
        steps = torch.arange(x.size(1), device=x.device)
        last_step = ((x != 0).long() * steps).max(dim=1).values

        x = self.embedding(x)
        x, _ = self.lstm(x)

        # Pick, for every sequence, the LSTM output at its own last real token
        rows = torch.arange(x.size(0), device=x.device)
        x = x[rows, last_step, :]
        x = self.fc(x)
        return x

# Model hyper-parameters
input_size = 10000  # number of embedding rows (vocabulary size)
embed_size = 100  # embedding dimension
hidden_size = 128  # LSTM hidden size
num_classes = 10  # number of output classes
num_layers = 2  # number of stacked LSTM layers
num_epochs = 10  # defined here so the cell runs on a fresh kernel (it was only set in a later cell)

# Build the model and move it to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CommentClassifierLSTM(input_size, hidden_size, num_classes, embed_size, num_layers).to(device)

# Loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
from sklearn.model_selection import train_test_split


# Convert features and labels to tensors. The label Series are converted
# through NumPy first so that torch does not have to iterate over a pandas object.
X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long).to(device)
y_train_tensor = torch.tensor(y_train_adjusted.to_numpy(), dtype=torch.long).to(device)
X_val_tensor = torch.tensor(X_dev_padded, dtype=torch.long).to(device)
y_val_tensor = torch.tensor(y_dev_adjusted.to_numpy(), dtype=torch.long).to(device)

# DataLoaders for training (shuffled) and validation
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Training loop with a validation pass after every epoch
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for inputs, labels in train_loader:
        # One optimisation step per batch
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation: no gradients, dropout disabled
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {correct / total:.4f}')


Epoch [1/10], Training Loss: 1.8027, Validation Loss: 2.4517, Accuracy: 0.1711
Epoch [2/10], Training Loss: 1.4614, Validation Loss: 2.5105, Accuracy: 0.1735
Epoch [3/10], Training Loss: 1.4105, Validation Loss: 2.7127, Accuracy: 0.1479
Epoch [4/10], Training Loss: 1.3797, Validation Loss: 2.5755, Accuracy: 0.1623
Epoch [5/10], Training Loss: 1.3562, Validation Loss: 2.6092, Accuracy: 0.1620
Epoch [6/10], Training Loss: 1.3360, Validation Loss: 2.7017, Accuracy: 0.1598
Epoch [7/10], Training Loss: 1.3175, Validation Loss: 2.7745, Accuracy: 0.1586
Epoch [8/10], Training Loss: 1.3007, Validation Loss: 2.7472, Accuracy: 0.1573
Epoch [9/10], Training Loss: 1.2865, Validation Loss: 2.7999, Accuracy: 0.1577
Epoch [10/10], Training Loss: 1.2741, Validation Loss: 2.9094, Accuracy: 0.1529


### **TF-IDF**

In [None]:
import pickle

# Load the pre-computed TF-IDF objects for the train / dev / test splits.
# pickle.load executes arbitrary code from the file: only use files you produced yourself.
with open('/content/drive/My Drive/Analyse_des_sentiment/data/dataTF/tfidf_dev.pkl', 'rb') as f:
    X_dev_padded = pickle.load(f)

with open('/content/drive/My Drive/Analyse_des_sentiment/data/dataTF/tfidf_train.pkl', 'rb') as f:
    X_train_padded = pickle.load(f)

with open('/content/drive/My Drive/Analyse_des_sentiment/data/dataTF/tfidf_test.pkl', 'rb') as f:
    X_test_padded = pickle.load(f)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Classe du modèle LSTM
class CommentClassifierLSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> linear head.

    The logits are computed from the LSTM output at the final time step of
    every sequence.
    """

    def __init__(self, input_size, hidden_size, num_classes, embed_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.5,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map integer ids of shape (batch, seq_len) to logits of shape (batch, num_classes)."""
        embedded = self.embedding(x)
        sequence_out, _state = self.lstm(embedded)
        final_step = sequence_out[:, -1, :]  # output at the last position of each sequence
        return self.fc(final_step)

# Model hyper-parameters
input_size = 10000  # number of embedding rows (vocabulary size)
embed_size = 100  # embedding dimension
hidden_size = 128  # LSTM hidden size
num_classes = 10  # number of output classes
num_layers = 2  # number of stacked LSTM layers

# Build the model and move it to the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CommentClassifierLSTM(input_size, hidden_size, num_classes, embed_size, num_layers).to(device)

# Loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Densify the features loaded from tfidf_train.pkl.
# NOTE(review): presumably a sparse TF-IDF matrix (the file name says tfidf) —
# densifying every training row at once may need a very large amount of memory; confirm.
X_train_dense = X_train_padded.toarray()

# Convert y_train_adjusted to a NumPy array
y_train_numpy = y_train_adjusted.values

# Convert the dense NumPy arrays to PyTorch tensors.
# NOTE(review): the features are cast to integers (torch.long) and then used as
# embedding indices by the model. If they are TF-IDF weights (floats), the cast
# truncates them towards zero, so almost every index would be 0 — verify that
# this is really the intended input for an Embedding + LSTM model.
X_train_tensor = torch.tensor(X_train_dense, dtype=torch.long).to(device)
y_train_tensor = torch.tensor(y_train_numpy, dtype=torch.long).to(device)

# Build the DataLoader (shuffled mini-batches of 32)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


# Training loop: one pass over train_loader per epoch, printing the mean batch loss
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')


# **BERT-base-uncased**

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT tokenizer and encoder, and place the encoder on `device`.
# Note: this rebinds `tokenizer` and `model`, replacing the Keras tokenizer and
# the LSTM model created in earlier cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

In [None]:
def get_embeddings(texts, max_length=500):
    """Encode a list of texts with the global BERT `tokenizer` / `model`.

    Each text is first cut to `max_length` characters, then tokenised with
    padding and truncation to `max_length` tokens. The model runs without
    gradient tracking on `device`.

    Returns:
        NumPy array holding, for every text, the last-layer hidden state at
        sequence position 0 (presumably the [CLS] token for a BERT tokenizer).
    """
    # Character-level cut before tokenisation
    clipped = [text[:max_length] for text in texts]

    # Tokenise and move the resulting tensors to the model's device
    encoded = tokenizer(
        clipped,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_length,
    ).to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        result = model(**encoded)

    # Keep the vector at position 0 of every sequence, back on the CPU as NumPy
    first_token_states = result.last_hidden_state[:, 0, :]
    return first_token_states.cpu().numpy()


In [None]:
# Apply get_embeddings to every training comment, one comment per call
# (each cell of the new column holds the array returned for that single text).
# NOTE(review): this is one model forward pass per row; batching several
# comments per call would likely be much faster.
data_train['embeddings'] = data_train['commentaire'].apply(lambda x: get_embeddings([x]))
data_train['embeddings'].shape

(665962,)

In [None]:
# Same per-comment embedding extraction for the test split
data_test['embeddings'] = data_test['commentaire'].apply(lambda x: get_embeddings([x]))
data_test['embeddings'].shape

(85847,)

In [None]:
# Reuse the name X_train_padded for the BERT embeddings column (a Series of arrays, not a padded matrix)
X_train_padded = data_train['embeddings']

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Stack the per-comment embedding arrays into a single NumPy array before
# building the tensor: handing a pandas Series of arrays straight to
# torch.tensor relies on slow element-by-element conversion and on the Series
# having a default 0..n-1 index.
X_train_array = np.stack(X_train_padded.tolist())
X_train_tensor = torch.tensor(X_train_array, dtype=torch.float).to(device)
y_train_tensor = torch.tensor(y_train_adjusted.to_numpy(), dtype=torch.long).to(device)

# DataLoader for shuffled mini-batch training
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# **LSTM**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class CommentClassifierLSTM(nn.Module):
    """Multi-layer LSTM followed by a linear head, fed with pre-computed feature vectors.

    The logits are computed from the LSTM output at the final time step of
    every sequence.
    """

    def __init__(self, hidden_size, num_classes, num_layers, input_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.5,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map features of shape (batch, seq_len, input_dim) to logits of shape (batch, num_classes)."""
        sequence_out, _state = self.lstm(x)
        final_step = sequence_out[:, -1, :]  # output at the last position of each sequence
        return self.fc(final_step)

In [None]:
# Hyper-parameters of the LSTM trained on the BERT embeddings
num_layers = 2  # stacked LSTM layers
num_classes = 10  # output classes
hidden_size = 128  # LSTM hidden size
input_dim = 768  # size of one embedding vector

# Build the classifier on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CommentClassifierLSTM(
    hidden_size=hidden_size,
    num_classes=num_classes,
    num_layers=num_layers,
    input_dim=input_dim,
).to(device)

In [None]:
# Optimiser and loss for the classifier
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Train for num_epochs passes over train_loader, reporting the mean batch loss per epoch
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0
    model.train()

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}')

Epoch [1/10], Loss: 2.0454
Epoch [2/10], Loss: 2.0045
Epoch [3/10], Loss: 1.9907
Epoch [4/10], Loss: 1.9827
Epoch [5/10], Loss: 1.9764
Epoch [6/10], Loss: 1.9719
Epoch [7/10], Loss: 1.9677
Epoch [8/10], Loss: 1.9640
Epoch [9/10], Loss: 1.9615
Epoch [10/10], Loss: 1.9585


In [None]:
# Reuse the name X_test_padded for the test BERT embeddings column (replaces the TF-IDF object loaded earlier)
X_test_padded = data_test['embeddings']

In [None]:
# Stack the per-comment embedding arrays into one NumPy array first;
# torch.tensor on a pandas Series of arrays is slow and depends on the
# Series having a default 0..n-1 index.
X_test_array = np.stack(X_test_padded.tolist())
X_test_tensor = torch.tensor(X_test_array, dtype=torch.float).to(device)

# Keep the original row order (shuffle=False) so predictions line up with data_test
test_dataset = TensorDataset(X_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
def predict(model, test_loader):
    """Return the predicted class index for every sample yielded by `test_loader`.

    The model is switched to evaluation mode and run without gradient
    tracking. Each batch produced by the loader is expected to be a sequence
    whose first item is the input tensor.

    Returns:
        A flat list with one predicted class index per sample, in loader order.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            features = batch[0]
            logits = model(features)
            # Index of the largest logit along the class dimension
            class_ids = torch.max(logits, 1).indices
            predictions.extend(class_ids.cpu().numpy())

    return predictions

# Predicted class index for every test comment, in the order of test_loader
predictions = predict(model, test_loader)


In [None]:
# Map the predicted class indices back to the rating scale — the inverse of
# the label encoding (rating * 2 - 1) — and store them in data_test['note'].
predictions_series = pd.Series(predictions)
adjusted_predictions = predictions_series.add(1).div(2)
data_test['note'] = adjusted_predictions
dataTest = data_test
import pandas as pd

# Format a float with one decimal and a comma as decimal separator
def float_to_comma_string(x):
    """Return `x` as a string with one decimal and a decimal comma.

    Values that are not floats are returned unchanged.
    """
    if not isinstance(x, float):
        return x
    one_decimal = '{:.1f}'.format(x)
    return one_decimal.replace('.', ',')

# Turn the predicted ratings into strings with a decimal comma
data_test['note'] = data_test['note'].apply(float_to_comma_string)

# Keep only the review id and its predicted rating
df_test = data_test[['review_id', 'note']]

# Write one "review_id note" pair per line, space-separated, without header or index
df_test.to_csv(
    '/content/drive/My Drive/Analyse_des_sentiment/data/test_output_LSTM_Embeding.txt',
    sep=' ',
    index=False,
    header=False,
)