## Import the Dependencies

In [7]:
import pandas as pd
import nltk
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
from __future__ import annotations
import re
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Putting all the csv files into one file

In [8]:
# CSV splits that are merged into a single dataset file.
train_data = 'data/train.csv'
test_data = 'data/test.csv'
# NOTE(review): this is the same path as `test_data`, so the test split is
# concatenated twice and no separate validation file is read -- confirm the
# intended validation path.
valid_data = 'data/test.csv'

# Cache file holding the merged dataset.
file = "multiclass_dataset.csv"

def save_load_df(file:str):
    """Return the merged dataset, building the cache file on first use.

    If ``file`` does not exist yet, the train/test/validation CSVs named by the
    notebook-level path variables are concatenated (with a fresh 0..n-1 index)
    and written to ``file`` with the columns ``id``, ``text``, ``label`` and
    ``sentiment``. In every case the result is what ``pd.read_csv`` yields for
    ``file`` with its first column used as the index.

    Args:
        file: Path of the merged CSV cache.

    Returns:
        The merged dataset as a DataFrame.
    """
    if not os.path.exists(file):
        frames = [pd.read_csv(path) for path in (train_data, test_data, valid_data)]
        merged = pd.concat(frames, axis=0, ignore_index=True)
        merged.to_csv(file, columns=['id', 'text', 'label', 'sentiment'])
    # Always read back from disk so a fresh build and a cached run return the same thing.
    return pd.read_csv(file, index_col=0)

# Load the merged dataset (built and cached on the first run).
df = save_load_df(file=file)
# df = df.sample(frac=1, random_state= 1337).reset_index(drop=True)
# Drop the 'id' column before displaying the frame.
df = df.drop(columns='id')
df

Unnamed: 0,text,label,sentiment
0,"Cooking microwave pizzas, yummy",2,positive
1,Any plans of allowing sub tasks to show up in ...,1,neutral
2,"I love the humor, I just reworded it. Like sa...",2,positive
3,naw idk what ur talkin about,1,neutral
4,That sucks to hear. I hate days like that,0,negative
...,...,...,...
41639,Fuck no internet damn time warner!,0,negative
41640,Looking forward to android 1.5 being pushed t...,1,neutral
41641,Not good. Wasted time.,0,negative
41642,"U were great, as always. But, can`t we do an ...",2,positive


## 1e etape: pre-processing the text

In [9]:
# Noise stripped from the raw text before tokenisation. Compiled once here
# instead of on every call (the function runs once per dataset row).
# NOTE(review): in the third alternative `\W+` also matches whitespace, so a bare
# "http"/"https" followed by a space removes the next word as well -- confirm
# whether that is intended.
_NOISE_PATTERN = re.compile(
    r'(<.+?>)'         # HTML tags
    r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # e-mail addresses
    r'|(https?\W+[^\s]+)'  # "http"/"https", then non-word characters, then a non-space run
    r'|(https?://[^\s\n\r]+)' # URLs starting with http:// or https://
    r'|(www\.[^\s]+)'      # URLs starting with www.
    r'|([\U00010000-\U0010ffff])'  # code points above U+FFFF (emoji and similar)
    r'|([^\x00-\xFF])'     # anything outside code points 0-255
)

_PUNCTUATION = frozenset(string.punctuation)

# Tag prefix -> WordNet part of speech, checked in this order.
_WORDNET_POS_BY_PREFIX = (('J', 'a'), ('V', 'v'), ('RB', 'r'), ('N', 'n'))

# English stop words, loaded from NLTK on first use and then reused.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def _wordnet_pos(tag):
    """Map a POS tag to the WordNet POS letter, or None when no prefix matches."""
    for prefix, wordnet_pos in _WORDNET_POS_BY_PREFIX:
        if tag.startswith(prefix):
            return wordnet_pos
    return None


def cleaning_text(text):
    """Clean one text and return its tokens with lemma and POS tag.

    Steps: strip the spans matched by ``_NOISE_PATTERN``, lowercase, split into
    sentences and words with NLTK, drop English stop words, remove punctuation
    characters from each remaining word (discarding words that become empty),
    POS-tag the tokens and lemmatize each one using its tag.

    Args:
        text: The raw text; it is converted with ``str()`` first, so any value
            is accepted.

    Returns:
        A list of ``[token, lemma, pos]`` rows followed by one ``['', '', '']``
        row that marks the end of the text.
    """
    text = _NOISE_PATTERN.sub('', str(text)).lower()
    stop_words = _english_stop_words()

    tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Stop words are tested on the raw token, before punctuation is stripped.
            if word in stop_words:
                continue
            word = ''.join(c for c in word if c not in _PUNCTUATION)
            if word:
                tokens.append(word)

    lemmatizer = WordNetLemmatizer()
    data = []
    for token, pos in nltk.pos_tag(tokens):
        wordnet_pos = _wordnet_pos(pos)
        if wordnet_pos is None:
            lemma = lemmatizer.lemmatize(token)
        else:
            lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
        data.append([token, lemma, pos])

    # Empty row = end-of-text marker.
    data.append(['', '', ''])
    return data

def get_infos(texts):
    """Run ``cleaning_text`` on every text and flatten the results.

    Args:
        texts: Iterable of raw texts.

    Returns:
        One flat list containing, in input order, every row returned by
        ``cleaning_text`` for each text.
    """
    return [row for text in texts for row in cleaning_text(text=text)]

# Tokenise every text of the dataset into one long token/lemma/POS table.
texts = df['text']
df_tokens = pd.DataFrame(get_infos(texts), columns = ['token', 'lemma', 'pos'])
df_tokens

Unnamed: 0,token,lemma,pos
0,cooking,cook,VBG
1,microwave,microwave,NN
2,pizzas,pizza,NN
3,yummy,yummy,NN
4,,,
...,...,...,...
447665,live,live,JJ
447666,live,live,JJ
447667,tx,tx,NN
447668,visit,visit,NN


## 2e etape: TF-IDF

In [10]:

def get_document(element: list[str]) -> list[str]:
  """Rebuild space-joined documents from a flat list split by '' markers.

  Each empty string in ``element`` closes the current document; the items since
  the previous marker are joined with single spaces. Two consecutive markers
  yield an empty document. Items after the last marker are not returned.

  Args:
    element: Flat list of strings in which '' marks the end of a document.

  Returns:
    One string per '' marker, in order.
  """
  docs = []
  start = 0
  for end, item in enumerate(element):
    if item == '':
      docs.append(' '.join(element[start:end]))
      start = end + 1
  return docs

# One space-joined string of lemmas per document.
documents = get_document(df_tokens['lemma'].tolist())

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix

docs = np.array(documents)
tfidf = TfidfVectorizer(use_idf = True, norm = 'l2', smooth_idf=True)
# fit_transform already returns a SciPy sparse matrix. Keep it sparse: calling
# .toarray() would allocate a dense documents x vocabulary array only to convert
# it straight back to CSR on the next line.
tfidf_matrix = tfidf.fit_transform(docs)
csr = csr_matrix(tfidf_matrix, dtype = float)


In [12]:
# Cache the TF-IDF matrix on disk in SciPy's sparse .npz format.
# np.savez would pickle the sparse matrix as an object array and np.load would
# return an NpzFile archive, so `csr` would no longer be a matrix afterwards.
from scipy.sparse import load_npz, save_npz

TFIDF_CACHE_PATH = 'tfidf_matrix.npz'

cached_csr = None
if os.path.exists(TFIDF_CACHE_PATH):
  try:
    cached_csr = load_npz(TFIDF_CACHE_PATH)
  except ValueError:
    # The file exists but is not a SciPy sparse archive (e.g. one written
    # earlier with np.savez); it is rewritten below from the in-memory matrix.
    cached_csr = None

if cached_csr is None:
  save_npz(TFIDF_CACHE_PATH, csr)
else:
  csr = cached_csr

## FastText


In [None]:

from gensim.models import KeyedVectors


# cc.en.300.bin is a model in Facebook's native fastText binary format, which
# KeyedVectors.load_word2vec_format cannot parse; gensim's dedicated loader is
# used instead. It returns keyed vectors that support key_to_index, indexing by
# word and get_mean_vector.
from gensim.models.fasttext import load_facebook_vectors

# Path to the FastText binary file. Set FASTTEXT_MODEL_PATH to override the
# machine-specific default.
fasttext_model_path = os.environ.get(
    'FASTTEXT_MODEL_PATH',
    'D:\\sentiment_analysis Project\\sentiment_analysis-project\\cc.en.300.bin',
)
fasttext_model = load_facebook_vectors(fasttext_model_path)

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\sentiment_analysis Project\\sentiment_analysis-project\\cc.en.300.bin'

In [None]:
# Words known to the FastText model, for fast membership tests.
vocab = set(fasttext_model.key_to_index)
# Each document as a list of lemmas.
lemmas = [document.split() for document in documents]

In [None]:
def _lookup_vector(word):
    """Return the FastText vector of ``word``, or None when it is not in ``vocab``."""
    if word in vocab:
        return fasttext_model[word]
    return None

fasttext = df_tokens['lemma'].apply(_lookup_vector)
# Keep only the lemmas for which a vector was found.
known_rows = [
    (lemma, vector)
    for lemma, vector in zip(df_tokens['lemma'], fasttext)
    if vector is not None
]
df7 = pd.DataFrame(data=known_rows, columns=['lemma', 'fasttext'])
df7

In [None]:
# One record per non-empty phrase: the lemma list and its mean FastText vector.
phr2vec = []
for phrase in lemmas:
    if not phrase:
        # Nothing to average for an empty phrase.
        continue
    mean_vect = fasttext_model.get_mean_vector(keys=phrase, pre_normalize=False)
    phr2vec.append(dict(phrase=phrase, phrase2vec=mean_vect))

## Word Embedding Contextuel

In [None]:

from transformers import AutoTokenizer
from transformers import AutoModel

import torch

# Number of phrases embedded in this cell.
MAX_PHRASES = 64

tokens = get_document(df_tokens['token'].tolist())
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')

embeddings = []
# `i` stays at notebook scope and ends as the number of phrases embedded.
i = 0
# Inference only: without no_grad every forward pass would keep its autograd
# graph alive through the tensors stored in `embeddings`.
with torch.no_grad():
    for i, phrase in enumerate(tokens[:MAX_PHRASES], start=1):
        # truncation=True keeps over-long phrases within the model's maximum length.
        words = tokenizer(phrase, return_tensors='pt', truncation=True)
        # Run the encoder to get one contextual embedding per token.
        token_embd = model(**words)
        embedding = token_embd.last_hidden_state
        # Average the token embeddings to get the phrase embedding.
        phrase_embd = embedding.mean(dim = 1)
        embeddings.append(phrase_embd)

In [None]:
import torch

# Stack the per-phrase tensors along dim 0 into a single tensor.
# Guarded because this cell overwrites its own input: on a second run
# `embeddings` is already a tensor, and torch.cat would then concatenate its rows.
if not torch.is_tensor(embeddings):
    embeddings = torch.cat(embeddings, dim = 0)
embeddings.shape

## vectorisation 

In [None]:
# Collect, per document, the list of POS tags, plus the class labels.

pos = [tag_string.split() for tag_string in get_document(df_tokens['pos'].tolist())]
labels = df['label'].tolist()

In [None]:
# One row per document (lemma text, POS tag list, label); documents whose text
# is empty are removed and the index is renumbered.
corpus = (
    pd.DataFrame(list(zip(documents, pos, labels)), columns=['text', 'pos', 'label'])
    .loc[lambda frame: frame['text'] != '']
    .reset_index(drop=True)
)
corpus

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Sorted so the column order of the count vectors is the same on every run
# (iteration order of a set of strings is not stable across interpreter sessions).
all_pos_tags = sorted({tag for tags in corpus['pos'] for tag in tags})
one_hot_encoder = OneHotEncoder(sparse_output=False, categories=[all_pos_tags])
# The categories are fixed, so fit once here rather than once per document.
if all_pos_tags:
    one_hot_encoder.fit([[tag] for tag in all_pos_tags])

# One vector per document: how many times each POS tag occurs in it.
pos_vectors = []
for tags in corpus['pos']:
    pos_vectors.append(np.sum(one_hot_encoder.transform([[tag] for tag in tags]), axis=0))


In [None]:

# df_pos = pd.DataFrame(pos_vectors, columns= all_pos_tags)
# df_pos['text'] = corpus['text']
# df_pos['label'] = corpus['label']

# df_pos = df_pos[["text"] + all_pos_tags + ["label"]]
# df_pos

## Phase d'entrainement 

In [None]:
# `df3` was used here without being defined in the notebook. It is built from
# the phrase records, whose keys ('phrase', 'phrase2vec') are exactly the
# columns this section relies on.
df3 = pd.DataFrame(phr2vec)
# Labels are attached by index, so both tables must have one row per non-empty document.
assert len(df3) == len(corpus), "phrase vectors and corpus rows are misaligned"
df3['Y'] = corpus['label']
final_dataset = df3.drop(columns='phrase')


In [None]:
# Display the training table.
final_dataset

In [None]:
from sklearn.model_selection import train_test_split

# Features are the phrase vectors, targets the class labels.
X, Y = final_dataset['phrase2vec'], final_dataset['Y']
# Shuffled 70/30 split with a fixed seed.
split_options = dict(test_size=0.3, train_size=0.7, random_state=42, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [None]:
def get_batch(x, y, batch_size = 16, start = 0):
    """Return one contiguous mini-batch as tensors.

    The batch offset is now the explicit ``start`` argument. Previously the
    function read a variable ``i`` that it never defined, so the rows returned
    depended on whatever value an earlier loop had left at notebook scope.

    Args:
        x: Sequence of equally shaped feature vectors (list, array or pandas Series).
        y: Sequence of integer class labels aligned with ``x``.
        batch_size: Maximum number of rows in the batch.
        start: Position of the first row of the batch.

    Returns:
        ``(xb, yb)``: a float32 tensor of stacked feature vectors and an int64
        tensor of labels. The batch is shorter than ``batch_size`` when fewer
        rows remain after ``start``.

    Raises:
        ValueError: From ``np.stack`` when the selected slice is empty.
    """
    stop = start + batch_size
    # Slice by position; pandas objects are sliced through .iloc so the result
    # does not depend on their (possibly shuffled) index labels.
    x_rows = x.iloc[start:stop] if hasattr(x, 'iloc') else x[start:stop]
    y_rows = y.iloc[start:stop] if hasattr(y, 'iloc') else y[start:stop]
    xb = torch.tensor(np.stack(list(x_rows), axis= 0), dtype= torch.float32)
    yb = torch.tensor(np.stack(list(y_rows), axis= 0), dtype= torch.long)
    return xb, yb

# Build one batch with the default batch size and inspect the feature tensor's shape.
xb, yb = get_batch(x_train, y_train)
xb.shape

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Model(nn.Module):
    """Bidirectional LSTM followed by a two-layer classification head.

    Args:
        embed_dim: Size of each input feature vector.
        hidden_size: Hidden size of the LSTM, per direction.
        n_layers: Number of stacked LSTM layers.
        n_classes: Number of output classes.
    """

    def __init__(self, embed_dim, hidden_size = 128, n_layers= 2, n_classes = 3):
        super().__init__()
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers = n_layers,
                            batch_first=True,
                            # Inter-layer dropout only exists with more than one layer.
                            dropout=0.3 if n_layers > 1 else 0.0,
                            bidirectional=True)
        # A bidirectional LSTM emits 2 * hidden_size features per step whatever
        # the layer count (n_layers * hidden_size only matched for n_layers == 2).
        self.fc1 = nn.Linear(2 * hidden_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, n_classes)
        # Kept so existing attribute access still works; no longer used in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return unnormalised class scores.

        Args:
            x: Either ``(batch, embed_dim)`` -- each row is then handled as its
                own length-1 sequence -- or ``(batch, seq_len, embed_dim)``.

        Returns:
            Raw logits of shape ``(batch, n_classes)`` for 2D input or
            ``(batch, seq_len, n_classes)`` for 3D input. No sigmoid/softmax is
            applied, because ``F.cross_entropy`` expects unnormalised scores.
        """
        # nn.LSTM would read a 2D tensor as ONE unbatched sequence, letting the
        # samples of a batch influence each other; add an explicit length-1 axis.
        is_flat_batch = x.dim() == 2
        if is_flat_batch:
            x = x.unsqueeze(1)
        x, _ = self.lstm(x)
        x = self.fc1(x)
        x = self.relu(x)
        logits = self.fc2(x)
        if is_flat_batch:
            logits = logits.squeeze(1)
        return logits

In [None]:
# Train the classifier with Adam for 1000 steps, printing the loss at each step.
# NOTE(review): `xb, yb` are built once here and reused for every step, so the
# loop optimises on a single batch of 32 -- confirm this is intended.
xb, yb = get_batch(x_train, y_train, batch_size= 32)
# NOTE(review): this rebinds `model`, which an earlier cell used for the DistilBERT encoder.
model = Model(embed_dim=xb.size(1))
optimizer = optim.Adam(model.parameters(), lr = 0.001)

for i in range(1000):
    # Reset gradients accumulated by the previous step (zeroed in place, not set to None).
    optimizer.zero_grad(set_to_none=False)
    # forward pass
    logits = model(xb)
    # F.cross_entropy applies log-softmax itself, so it expects unnormalised scores.
    loss = F.cross_entropy(logits, yb)
    
    # backward pass
    loss.backward()
    # update the parameters from the gradients
    optimizer.step()

    print(loss.item())