## Import the Dependecies

In [1]:
import pandas as pd
from __future__ import annotations
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

## Putting all the csv files into one file

In [2]:
train_data = 'data/train.csv'
test_data = 'data/test.csv'
valid_data = 'data/test.csv'

file = "multiclass_dataset.csv"

def save_load_df(file:str):
    if os.path.exists(file):
        df = pd.read_csv(file, index_col= 0)
    else:
        df = pd.concat(map(pd.read_csv, [train_data, test_data, valid_data]), axis= 0, ignore_index=True)
        df.to_csv(file, columns= ['id', 'text', 'label', 'sentiment'])
        df = pd.read_csv(file, index_col= 0)
    return df

df = save_load_df(file=file)
# df = df.sample(frac=1, random_state= 1337).reset_index(drop=True)
df = df.drop(columns='id')
df

Unnamed: 0,text,label,sentiment
0,"Cooking microwave pizzas, yummy",2,positive
1,Any plans of allowing sub tasks to show up in ...,1,neutral
2,"I love the humor, I just reworded it. Like sa...",2,positive
3,naw idk what ur talkin about,1,neutral
4,That sucks to hear. I hate days like that,0,negative
...,...,...,...
41639,Fuck no internet damn time warner!,0,negative
41640,Looking forward to android 1.5 being pushed t...,1,neutral
41641,Not good. Wasted time.,0,negative
41642,"U were great, as always. But, can`t we do an ...",2,positive


## 1e etape: pre-precessing the text

In [3]:
def cleaning_text(text):
    text_pattern = re.compile(
        r'(<.+?>)'         # Balises HTML
        r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Emails
        r'|(https?\W+[^\s]+)'  # URLs commençant par http ou https
        r'|(https?://[^\s\n\r]+)' # URLs commençant par http ou https
        r'|(www\.[^\s]+)'      # URLs commençant par www
        r'|([\U00010000-\U0010ffff])'  # Émojis et autres caractères au-delà de l'ASCII étendu
        r'|([^\x00-\xFF])'     # Tout ce qui n'est pas en ASCII étendu (0-255)
    )
    text = text_pattern.sub('', str(text))
    text = text.lower()
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    tokens = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
      words = nltk.word_tokenize(sentence)
      for word in words:
        if word not in stop_words:
          word = ''.join([c for c in word if c not in punctuation])
          if word == '':
              continue
          tokens.append(word)
    
    # get the part of speech
    pos_tags = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    data = []
    for token, pos in pos_tags:
        if pos.startswith('J'):
          lemma = lemmatizer.lemmatize(token, pos = 'a')
        elif pos.startswith('V'):
          lemma = lemmatizer.lemmatize(token, pos = 'v')
        elif pos.startswith('RB'):
          lemma = lemmatizer.lemmatize(token, pos = 'r')
        elif pos.startswith('N'):
          lemma = lemmatizer.lemmatize(token, pos = 'n')
        else:
          lemma = lemmatizer.lemmatize(token)
        data.append([token, lemma, pos])
    data.append(['', '', ''])
    return data

def get_infos(texts):
    infos = []
    for text in texts:
        data = cleaning_text(text=text)
        infos.extend(data)
    return infos

texts = df['text']
df_tokens = pd.DataFrame(get_infos(texts), columns = ['token', 'lemma', 'pos'])
df_tokens

Unnamed: 0,token,lemma,pos
0,cooking,cook,VBG
1,microwave,microwave,NN
2,pizzas,pizza,NN
3,yummy,yummy,NN
4,,,
...,...,...,...
447665,live,live,JJ
447666,live,live,JJ
447667,tx,tx,NN
447668,visit,visit,NN


## 2e etape: TF-IDF

In [4]:

def get_document(element: list[str]):
  docs = []
  for i in range(0, len(element)):
    for j in range(i, len(element)):
      if element[j] == '':
        docs.append(' '.join(element[i:j]))
        i = j + 1
    break
  return docs

documents = get_document(df_tokens['lemma'].tolist())

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix

docs = np.array(documents)
tfidf = TfidfVectorizer(use_idf = True, norm = 'l2', smooth_idf=True)
tfidf_matrix = tfidf.fit_transform(docs).toarray()
csr = csr_matrix(tfidf_matrix ,dtype = float)


In [6]:
# save the tfidf_matrix
if os.path.exists('tfidf_matrix.npz'):
  csr = np.load('tfidf_matrix.npz', allow_pickle=True)
else:
  np.savez('tfidf_matrix', csr)
  csr = np.load('tfidf_matrix.npz', allow_pickle= True)

## 3e etape: word2vec

In [7]:
from gensim.models import KeyedVectors

word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary= True)

In [8]:
vocab = set(list(word2vec_model.key_to_index.keys()))
lemmas = [doc.split() for doc in documents]

In [9]:
word2vec = df_tokens['lemma'].apply(lambda word: word2vec_model[word] if word in vocab else None)
df2 = pd.DataFrame(data = [(lemma, word2vec) \
                           for lemma, word2vec in zip(df_tokens['lemma'], word2vec) \
                            if word2vec is not None], columns= ['lemma', 'word2vec'])
df2

Unnamed: 0,lemma,word2vec
0,cook,"[-0.23242188, 0.09033203, 0.078125, 0.12695312..."
1,microwave,"[-0.30078125, -0.06689453, 0.075683594, 0.3242..."
2,pizza,"[-0.12597656, 0.025390625, 0.16699219, 0.55078..."
3,yummy,"[-0.18945312, -0.06591797, -0.041748047, 0.433..."
4,plan,"[0.07861328, 0.09814453, 0.16894531, 0.0834960..."
...,...,...
387233,use,"[0.11279297, -0.13085938, 0.06689453, 0.138671..."
387234,live,"[0.016967773, 0.017333984, -0.041748047, 0.126..."
387235,live,"[0.016967773, 0.017333984, -0.041748047, 0.126..."
387236,tx,"[0.022949219, 0.049804688, -0.10546875, 0.3300..."


In [10]:
phr2vec = []
for phrase in lemmas:
    if len(phrase) == 0:
        continue
    mean_vect = word2vec_model.get_mean_vector(keys = phrase, pre_normalize = False)
    phr2vec.append({
        'phrase': phrase,
        'phrase2vec': mean_vect,
    })

df3 = pd.DataFrame(phr2vec)
df3

Unnamed: 0,phrase,phrase2vec
0,"[cook, microwave, pizza, yummy]","[-0.2121582, -0.004272461, 0.06976318, 0.35888..."
1,"[plan, allow, sub, task, show, widget]","[0.02561442, 0.022299448, 0.045491535, 0.04935..."
2,"[love, humor, reword, like, say, group, therap...","[0.05505371, -0.0052217757, 0.05345481, 0.1639..."
3,"[naw, idk, ur, talkin]","[-0.053726196, 0.058654785, 0.15679932, 0.2163..."
4,"[suck, hear, hate, day, like]","[0.05214844, 0.03173828, 0.09024353, 0.0959961..."
...,...,...
41502,"[fuck, internet, damn, time, warner]","[0.023242187, -0.06225586, -0.018005371, 0.217..."
41503,"[look, forward, android, 15, push, g1]","[-0.059326172, 0.030419922, -0.070410155, 0.06..."
41504,"[good, waste, time]","[-0.068359375, 0.21289062, 0.12904866, 0.14469..."
41505,"[u, great, always, east, germany, noko, least,...","[0.0022521974, 0.013061523, 0.10640259, 0.1460..."


## Word Embedding Contextuel

In [11]:
from transformers import AutoTokenizer
from transformers import AutoModel

tokens = get_document(df_tokens['token'].tolist())
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')

embeddings = []
i = 0
for phrase in tokens:
    words = tokenizer(phrase, return_tensors='pt')
    # feed to the embedding layer to get the embdeddings for each token
    token_embd = model(**words)
    embedding = token_embd.last_hidden_state
    # sum up all the tokens embedding to get the phrase embedding
    phrase_embd = embedding.mean(dim = 1)
    embeddings.append(phrase_embd)
    i +=1 
    if i == 64:
        break

  from .autonotebook import tqdm as notebook_tqdm
2024-12-05 11:22:55.119922: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 11:22:55.760550: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-05 11:22:55.763180: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
import torch

embeddings = torch.cat(embeddings, dim = 0)
embeddings.shape

torch.Size([64, 768])

## vectorisation 

In [13]:
# get a dataframe with lemmes and pos and label :

pos = get_document(df_tokens['pos'].tolist())
pos = [doc.split() for doc in pos]
labels = df['label'].tolist()

In [14]:
corpus = pd.DataFrame(data = [(doc, p, label) for doc, p, label in zip(documents, pos, labels)], columns= ['text', 'pos', 'label'])
corpus = corpus[corpus['text'] != '']
corpus = corpus.reset_index(drop= True)
corpus

Unnamed: 0,text,pos,label
0,cook microwave pizza yummy,"[VBG, NN, NN, NN]",2
1,plan allow sub task show widget,"[NNS, VBG, NN, NNS, VBP, VB]",1
2,love humor reword like say group therapy inste...,"[VB, NN, VBN, IN, VBG, NN, NN, RB, VBD, VBG, N...",2
3,naw idk ur talkin,"[JJ, NN, JJ, NN]",1
4,suck hear hate day like,"[NNS, VBP, JJ, NNS, IN]",0
...,...,...,...
41502,fuck internet damn time warner,"[JJ, NN, NN, NN, NN]",0
41503,look forward android 15 push g1,"[VBG, RB, JJ, CD, VBD, NN]",1
41504,good waste time,"[JJ, VBD, NN]",0
41505,u great always east germany noko least provoke...,"[JJ, JJ, RB, VBP, JJ, RB, JJS, JJ, VBP, CD, NN]",2


In [15]:
from sklearn.preprocessing import OneHotEncoder

all_pos_tags = list(set(pos for tag in corpus['pos'] for pos in tag))
one_hot_encoder = OneHotEncoder(sparse_output=False, categories=[all_pos_tags])

pos_vectors = []
for tags in corpus['pos']:
    pos_vectors.append(np.sum(one_hot_encoder.fit_transform([[tag] for tag in tags]), axis=0))


In [16]:

# df_pos = pd.DataFrame(pos_vectors, columns= all_pos_tags)
# df_pos['text'] = corpus['text']
# df_pos['label'] = corpus['label']

# df_pos = df_pos[["text"] + all_pos_tags + ["label"]]
# df_pos

## Phase d'entrainement 

In [17]:
df3['Y'] = corpus['label']
final_dataset = df3.drop(columns='phrase')


In [18]:
final_dataset

Unnamed: 0,phrase2vec,Y
0,"[-0.2121582, -0.004272461, 0.06976318, 0.35888...",2
1,"[0.02561442, 0.022299448, 0.045491535, 0.04935...",1
2,"[0.05505371, -0.0052217757, 0.05345481, 0.1639...",2
3,"[-0.053726196, 0.058654785, 0.15679932, 0.2163...",1
4,"[0.05214844, 0.03173828, 0.09024353, 0.0959961...",0
...,...,...
41502,"[0.023242187, -0.06225586, -0.018005371, 0.217...",0
41503,"[-0.059326172, 0.030419922, -0.070410155, 0.06...",1
41504,"[-0.068359375, 0.21289062, 0.12904866, 0.14469...",0
41505,"[0.0022521974, 0.013061523, 0.10640259, 0.1460...",2


In [127]:
from sklearn.model_selection import train_test_split

X = final_dataset['phrase2vec']
Y = final_dataset['Y']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, train_size=0.7, random_state=42, shuffle=True)

In [128]:
def get_batch(x, y, batch_size = 16):
    xb, yb = list(x[i:i+batch_size]), list(y[i:i+batch_size])
    xb = torch.tensor(np.stack(xb, axis= 0), dtype= torch.float32)
    yb = torch.tensor(np.stack(yb, axis= 0), dtype= torch.long)
    return xb, yb

xb, yb = get_batch(x_train, y_train)
xb.shape

torch.Size([16, 300])

In [138]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Model(nn.Module):
    def __init__(self, embed_dim, hidden_size = 128, n_layers= 2):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers = n_layers, 
                            batch_first=True, dropout=0.3, bidirectional=True)
        self.fc1 = nn.Linear(n_layers * hidden_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x, (h1, c1) = self.lstm(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        logits = self.sigmoid(x)
        return logits

In [139]:
xb, yb = get_batch(x_train, y_train, batch_size= 32)
model = Model(embed_dim=xb.size(1))
optimizer = optim.Adam(model.parameters(), lr = 0.001)

for i in range(1000):
    optimizer.zero_grad(set_to_none=False)
    # forward pass
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    
    # backward pass
    loss.backward()
    # update the gradient
    optimizer.step()

    print(loss.item())

1.0988742113113403
1.0972639322280884
1.0960090160369873
1.0947126150131226
1.092895746231079
1.0903218984603882
1.0868314504623413
1.0826807022094727
1.076998233795166
1.074790120124817
1.0757933855056763
1.074370265007019
1.072308897972107
1.0689113140106201
1.067621111869812
1.0654664039611816
1.0640716552734375
1.0620238780975342
1.0611436367034912
1.0586857795715332
1.0563926696777344
1.0543276071548462
1.051666259765625
1.0506007671356201
1.048697829246521
1.0467835664749146
1.0433210134506226
1.0382575988769531
1.034592628479004
1.026778221130371
1.022049069404602
1.0159392356872559
1.0073601007461548
0.999229371547699
0.9915934801101685
0.9845032095909119
0.9769010543823242
0.9771125316619873
0.9721494913101196
0.9573929309844971
0.9512853026390076
0.9393582344055176
0.922374963760376
0.9046894907951355
0.9010148048400879
0.8898901343345642
0.8706144094467163
0.8566556572914124
0.8334162831306458
0.8224446177482605
0.8105222582817078
0.7821233868598938
0.7741371393203735
0.7739