In [1]:
import pandas as pd
# Charger les trois fichiers CSV
df1 = pd.read_csv('/kaggle/input/analyse-sentiement-kafka-dl-js/train_data_single/part-00000-abd2d631-00fd-4975-a058-ffbd706f7bcb-c000.csv')
df2 = pd.read_csv('/kaggle/input/analyse-sentiement-kafka-dl-js/val_data_single/part-00000-8cc101df-e287-47d4-aa3d-6fbe733328ad-c000.csv')
df3 = pd.read_csv('/kaggle/input/analyse-sentiement-kafka-dl-js/test_data_single/part-00000-1f0de82b-a6f9-478a-863f-705b0fef0a61-c000.csv')



In [2]:
# importing required libraries 

import pandas as pd

# for pytorch imports
import torch

# for functional dependencies like activation function 
import torch.nn.functional as F

# nn is basic module in Torch which provide different neural network architecture
import torch.nn as nn
import torch.optim as optim

# CountVectorizer for Bagof words model
from sklearn.feature_extraction.text import CountVectorizer

# for padding .. since the LSTM takes input as sequence so it is said that 
#if we have fixed input string computation will be faster and it will improve performance 
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
class Sequences(Dataset):
    def __init__(self, df, max_seq_len):
        self.max_seq_len = max_seq_len
        
        # Remplacer les NaN par une chaîne vide
        df['text'].fillna('', inplace=True)
        
        # Vérifier si des NaN subsistent (facultatif)
        print("Nombre de NaN dans text:", df['text'].isna().sum())  # Doit afficher 0
        
        # Bag of Words (BOW)
        vectorizer = CountVectorizer(stop_words='english', min_df=0.015)
        vectorizer.fit(df['text'].tolist())  # Utilisation sécurisée de processed_text
        
        # Créer le vocabulaire
        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        tokenizer = vectorizer.build_analyzer()
        self.encode = lambda x: [self.token2idx[token] for token in tokenizer(x)
                                 if token in self.token2idx]
        self.pad = lambda x: x + (max_seq_len - len(x)) * [self.token2idx['<PAD>']]
        
        # Encoder les séquences
        sequences = [self.encode(sequence)[:max_seq_len] for sequence in df['text'].tolist()]
        
        # Filtrer les séquences vides
        sequences, self.labels = zip(*[(sequence, label) for sequence, label
                                       in zip(sequences, df['label'].tolist()) if sequence])
        self.sequences = [self.pad(sequence) for sequence in sequences]

    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

    def __len__(self):
        return len(self.sequences)


In [5]:
# Combiner les DataFrames
from statistics import mode


df = pd.concat([df1, df2, df3], ignore_index=True)
df['label'] = df['sentiment']
del df['sentiment']
del df['processed_words']
del df['processed_text']
del df['words']
df




Unnamed: 0,text,label
0,Go get `em! lol,0
1,"I like fridays generally, but class is extende...",0
2,Mc John sim posit 3 yr passed no softees ...,0
3,What To Say?,0
4,from g`s to gents season 1,0
...,...,...
26381,y do i only have 2 people following me people...,1
26382,yay it`s friday... hold on I have to work tomo...,1
26383,"yep, good morning to you all or night or even...",1
26384,yo yo yo! i like ice cream,1


In [6]:
#Supprimer les nulls
df.dropna(inplace=True)



In [7]:
import re
def text_cleaner(tx):
    
    text = re.sub(r"won\'t", "would not", tx)
    text = re.sub(r"im", "i am", tx)
    text = re.sub(r"Im", "I am", tx)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"don\'t", "do not", text)
    text = re.sub(r"shouldn\'t", "should not", text)
    text = re.sub(r"needn\'t", "need not", text)
    text = re.sub(r"hasn\'t", "has not", text)
    text = re.sub(r"haven\'t", "have not", text)
    text = re.sub(r"weren\'t", "were not", text)
    text = re.sub(r"mightn\'t", "might not", text)
    text = re.sub(r"didn\'t", "did not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\!\?\.\@]',' ' , text)
    text = re.sub(r'[!]+' , '!' , text)
    text = re.sub(r'[?]+' , '?' , text)
    text = re.sub(r'[.]+' , '.' , text)
    text = re.sub(r'[@]+' , '@' , text)
    text = re.sub(r'unk' , ' ' , text)
    text = re.sub('\n', '', text)
    text = text.lower()
    text = re.sub(r'[ ]+' , ' ' , text)
    
    return text

In [8]:
df['text'] = df['text'].apply(lambda x : text_cleaner(x))

In [9]:
dataset = Sequences(df, max_seq_len=128)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna('', inplace=True)


Nombre de NaN dans text: 0


In [10]:
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Vérification de la répartition des classes avant équilibrage
print("Distribution des classes avant équilibrage :")
# Convertir dataset.labels en DataFrame pour afficher la distribution
df_labels = pd.DataFrame({'label': dataset.labels})
print(df_labels['label'].value_counts())

# Extraire les séquences et les labels
X = dataset.sequences  # Les séquences
y = dataset.labels  # Les labels

# Diviser les données en ensemble d'entraînement, validation et test (70% - 15% - 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Appliquer SMOTE uniquement sur l'ensemble d'entraînement
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Vérification après équilibrage
print("\nDistribution des classes après SMOTE (train) :")
print(Counter(y_train_resampled))
print("\nDistribution des classes (validation) :")
print(Counter(y_val))
print("\nDistribution des classes (test) :")
print(Counter(y_test))


Distribution des classes avant équilibrage :
label
1    6952
2    5923
0    5429
Name: count, dtype: int64

Distribution des classes après SMOTE (train) :
Counter({1: 4866, 2: 4866, 0: 4866})

Distribution des classes (validation) :
Counter({1: 1043, 2: 889, 0: 814})

Distribution des classes (test) :
Counter({1: 1043, 2: 888, 0: 815})


In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable
import torch

# Fonction de collate pour padding des séquences
def collate(batch):
    inputs, labels = zip(*batch)
    inputs = [torch.tensor(seq) for seq in inputs]
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)  # Padding avec 0
    labels = torch.tensor(labels)
    return inputs, labels

# Créer un DataLoader pour l'entraînement, la validation et le test
train_dataset = list(zip(X_train, y_train_resampled))
val_dataset = list(zip(X_val, y_val))
test_dataset = list(zip(X_test, y_test))

train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate)
val_loader = DataLoader(val_dataset, batch_size=128, collate_fn=collate)
test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate)


In [12]:
# Fonction d'évaluation
def evaluate(model, data_loader, criterion):
    model.eval()  # Mettre le modèle en mode évaluation
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():  # Pas de rétropropagation pendant l'évaluation
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(model.device), labels.to(model.device)

            # Passage avant
            outputs = model(inputs)

            # Calcul de la perte
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # Calcul de l'accuracy
            accuracy = calculate_accuracy(outputs, labels)
            total_accuracy += accuracy.item()

    avg_loss = total_loss / len(data_loader)
    avg_accuracy = total_accuracy / len(data_loader)
    return avg_loss, avg_accuracy


In [13]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score

# Charger les embeddings GloVe
def load_glove_embeddings(glove_path, vocab):
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    
    embedding_dim = len(next(iter(embeddings_index.values())))
    embedding_matrix = np.zeros((len(vocab), embedding_dim))

    for word, idx in vocab.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    
    return embedding_matrix, embedding_dim

# Classe du modèle RNN amélioré
class RNN(nn.Module):
    def __init__(self, embedding_layer, hidden_size=256, num_classes=3, device='cpu'):
        super(RNN, self).__init__()
        self.device = device
        self.encoder = nn.Embedding.from_pretrained(embedding_layer, freeze=False)
        self.rnn = nn.GRU(
            input_size=embedding_layer.size(1),  # Dimension des embeddings GloVe
            hidden_size=hidden_size,
            num_layers=2,  # Couches supplémentaires pour capturer les dépendances
            bidirectional=True,
            batch_first=True,
            dropout=0.3  # Dropout entre les couches GRU)
        self.layer_norm = nn.LayerNorm(hidden_size * 2)  # Normalisation par couches
        self.attn = nn.Linear(hidden_size * 2, 1)  # Mécanisme d'attention
        self.dropout = nn.Dropout(p=0.5)  # Dropout régularisation
        self.decoder = nn.Linear(hidden_size * 2, num_classes)  # Bidirectionnel x2
    def forward(self, inputs):
        embedded = self.encoder(inputs)
        rnn_output, _ = self.rnn(embedded)
        rnn_output = self.layer_norm(rnn_output)
        attn_weights = torch.softmax(self.attn(rnn_output), dim=1)
        context = torch.sum(attn_weights * rnn_output, dim=1)
        context = self.dropout(context)
        output = self.decoder(context) return output

# Fonction pour calculer l'accuracy
def calculate_accuracy(predictions, labels):
    preds = predictions.argmax(dim=1)
    correct = torch.sum(preds == labels)
    accuracy = correct.float() / labels.size(0)
    return accuracy

# Fonction d'évaluation
def evaluate(model, data_loader, criterion):
    model.eval()
    total_loss = 0
    total_accuracy = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(model.device), labels.to(model.device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            accuracy = calculate_accuracy(outputs, labels)
            total_accuracy += accuracy.item()

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(outputs.argmax(dim=1).cpu().numpy())
    
    avg_loss = total_loss / len(data_loader)
    avg_accuracy = total_accuracy / len(data_loader)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return avg_loss, avg_accuracy, f1

# Charger les embeddings GloVe et définir le vocabulaire
vocab = dataset.token2idx  # À remplacer par votre vocabulaire
glove_path = '/kaggle/input/glovetwitter/glove.twitter.27B.200d.txt'
embedding_matrix, embedding_dim = load_glove_embeddings(glove_path, vocab)

# Convertir les séquences et les labels en Tensors
X_train_tensor = torch.tensor(X_train_resampled, dtype=torch.long)
y_train_tensor = torch.tensor(y_train_resampled, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Créer les DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Créer le modèle
model = RNN(
    embedding_layer=torch.FloatTensor(embedding_matrix),
    hidden_size=256,
    num_classes=3,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)
model = model.to(model.device)

# Fonction de perte et optimiseur
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Plan de diminution du taux d'apprentissage
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Entraînement du modèle
epochs = 30
for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_accuracy = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(model.device), labels.to(model.device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        accuracy = calculate_accuracy(outputs, labels)
        total_accuracy += accuracy.item()

        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / len(train_loader)
    avg_accuracy = total_accuracy / len(train_loader)

    # Validation
    val_loss, val_accuracy, val_f1 = evaluate(model, val_loader, criterion)
    scheduler.step(val_loss)

    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_loss:.4f}, Train Accuracy: {avg_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, F1 Score: {val_f1:.4f}")

# Test final
test_loss, test_accuracy, test_f1 = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, F1 Score: {test_f1:.4f}")


Epoch [1/30], Train Loss: 0.9919, Train Accuracy: 0.5161
Validation Loss: 0.8867, Validation Accuracy: 0.5742, F1 Score: 0.5793
Epoch [2/30], Train Loss: 0.9151, Train Accuracy: 0.5645
Validation Loss: 0.9228, Validation Accuracy: 0.5645, F1 Score: 0.5735
Epoch [3/30], Train Loss: 0.9030, Train Accuracy: 0.5722
Validation Loss: 0.8863, Validation Accuracy: 0.5626, F1 Score: 0.5664
Epoch [4/30], Train Loss: 0.9035, Train Accuracy: 0.5736
Validation Loss: 0.9041, Validation Accuracy: 0.5677, F1 Score: 0.5765
Epoch [5/30], Train Loss: 0.9000, Train Accuracy: 0.5721
Validation Loss: 0.8986, Validation Accuracy: 0.5693, F1 Score: 0.5727
Epoch [6/30], Train Loss: 0.8982, Train Accuracy: 0.5764
Validation Loss: 0.8875, Validation Accuracy: 0.5714, F1 Score: 0.5725
Epoch [7/30], Train Loss: 0.8916, Train Accuracy: 0.5771
Validation Loss: 0.8948, Validation Accuracy: 0.5775, F1 Score: 0.5843
Epoch [8/30], Train Loss: 0.8776, Train Accuracy: 0.5852
Validation Loss: 0.8888, Validation Accuracy: 0