## Import the Dependecies

In [1]:
import pandas as pd
from __future__ import annotations
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

## Putting all the csv files into one file

In [2]:
train_data = 'data/train.csv'
test_data = 'data/test.csv'
valid_data = 'data/test.csv'

file = "multiclass_dataset.csv"

def save_load_df(file:str):
    if os.path.exists(file):
        df = pd.read_csv(file, index_col= 0)
    else:
        df = pd.concat(map(pd.read_csv, [train_data, test_data, valid_data]), axis= 0, ignore_index=True)
        df.to_csv(file, columns= ['id', 'text', 'label', 'sentiment'])
        df = pd.read_csv(file, index_col= 0)
    return df

df = save_load_df(file=file)
# df = df.sample(frac=1, random_state= 1337).reset_index(drop=True)
df = df.drop(columns='id')
df

Unnamed: 0,text,label,sentiment
0,"Cooking microwave pizzas, yummy",2,positive
1,Any plans of allowing sub tasks to show up in ...,1,neutral
2,"I love the humor, I just reworded it. Like sa...",2,positive
3,naw idk what ur talkin about,1,neutral
4,That sucks to hear. I hate days like that,0,negative
...,...,...,...
41639,Fuck no internet damn time warner!,0,negative
41640,Looking forward to android 1.5 being pushed t...,1,neutral
41641,Not good. Wasted time.,0,negative
41642,"U were great, as always. But, can`t we do an ...",2,positive


## 1e etape: pre-precessing the text

In [3]:
def cleaning_text(text):
    text_pattern = re.compile(
        r'(<.+?>)'         # Balises HTML
        r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Emails
        r'|(https?\W+[^\s]+)'  # URLs commençant par http ou https
        r'|(https?://[^\s\n\r]+)' # URLs commençant par http ou https
        r'|(www\.[^\s]+)'      # URLs commençant par www
        r'|([\U00010000-\U0010ffff])'  # Émojis et autres caractères au-delà de l'ASCII étendu
        r'|([^\x00-\xFF])'     # Tout ce qui n'est pas en ASCII étendu (0-255)
    )
    text = text_pattern.sub('', str(text))
    text = text.lower()
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    tokens = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
      words = nltk.word_tokenize(sentence)
      for word in words:
        if word not in stop_words:
          word = ''.join([c for c in word if c not in punctuation])
          if word == '':
              continue
          tokens.append(word)
    
    # get the part of speech
    pos_tags = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    data = []
    for token, pos in pos_tags:
        if pos.startswith('J'):
          lemma = lemmatizer.lemmatize(token, pos = 'a')
        elif pos.startswith('V'):
          lemma = lemmatizer.lemmatize(token, pos = 'v')
        elif pos.startswith('RB'):
          lemma = lemmatizer.lemmatize(token, pos = 'r')
        elif pos.startswith('N'):
          lemma = lemmatizer.lemmatize(token, pos = 'n')
        else:
          lemma = lemmatizer.lemmatize(token)
        data.append([token, lemma, pos])
    data.append(['', '', ''])
    return data

def get_infos(texts):
    infos = []
    for text in texts:
        data = cleaning_text(text=text)
        infos.extend(data)
    return infos

texts = df['text']
df_tokens = pd.DataFrame(get_infos(texts), columns = ['token', 'lemma', 'pos'])
df_tokens

Unnamed: 0,token,lemma,pos
0,cooking,cook,VBG
1,microwave,microwave,NN
2,pizzas,pizza,NN
3,yummy,yummy,NN
4,,,
...,...,...,...
447665,live,live,JJ
447666,live,live,JJ
447667,tx,tx,NN
447668,visit,visit,NN


## 2e etape: TF-IDF

In [4]:

def get_document(element: list[str]):
  docs = []
  for i in range(0, len(element)):
    for j in range(i, len(element)):
      if element[j] == '':
        docs.append(' '.join(element[i:j]))
        i = j + 1
    break
  return docs

documents = get_document(df_tokens['lemma'].tolist())

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix

docs = np.array(documents)
tfidf = TfidfVectorizer(use_idf = True, norm = 'l2', smooth_idf=True)
tfidf_matrix = tfidf.fit_transform(docs).toarray()
csr = csr_matrix(tfidf_matrix ,dtype = float)


In [6]:
# save the tfidf_matrix
if os.path.exists('tfidf_matrix.npz'):
  csr = np.load('tfidf_matrix.npz', allow_pickle=True)
else:
  np.savez('tfidf_matrix', csr)
  csr = np.load('tfidf_matrix.npz', allow_pickle= True)

## 3e etape: word2vec

In [7]:
from gensim.models import KeyedVectors

word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary= True)

In [8]:
vocab = set(list(word2vec_model.key_to_index.keys()))
lemmas = [doc.split() for doc in documents]

In [9]:
word2vec = df_tokens['lemma'].apply(lambda word: word2vec_model[word] if word in vocab else None)
df2 = pd.DataFrame(data = [(lemma, word2vec) \
                           for lemma, word2vec in zip(df_tokens['lemma'], word2vec) \
                            if word2vec is not None], columns= ['lemma', 'word2vec'])
df2

Unnamed: 0,lemma,word2vec
0,cook,"[-0.23242188, 0.09033203, 0.078125, 0.12695312..."
1,microwave,"[-0.30078125, -0.06689453, 0.075683594, 0.3242..."
2,pizza,"[-0.12597656, 0.025390625, 0.16699219, 0.55078..."
3,yummy,"[-0.18945312, -0.06591797, -0.041748047, 0.433..."
4,plan,"[0.07861328, 0.09814453, 0.16894531, 0.0834960..."
...,...,...
387233,use,"[0.11279297, -0.13085938, 0.06689453, 0.138671..."
387234,live,"[0.016967773, 0.017333984, -0.041748047, 0.126..."
387235,live,"[0.016967773, 0.017333984, -0.041748047, 0.126..."
387236,tx,"[0.022949219, 0.049804688, -0.10546875, 0.3300..."


In [10]:
phr2vec = []
for phrase in lemmas:
    if len(phrase) == 0:
        continue
    mean_vect = word2vec_model.get_mean_vector(keys = phrase, pre_normalize = False)
    phr2vec.append({
        'phrase': phrase,
        'phrase2vec': mean_vect,
    })

df3 = pd.DataFrame(phr2vec)
df3

Unnamed: 0,phrase,phrase2vec
0,"[cook, microwave, pizza, yummy]","[-0.2121582, -0.004272461, 0.06976318, 0.35888..."
1,"[plan, allow, sub, task, show, widget]","[0.02561442, 0.022299448, 0.045491535, 0.04935..."
2,"[love, humor, reword, like, say, group, therap...","[0.05505371, -0.0052217757, 0.05345481, 0.1639..."
3,"[naw, idk, ur, talkin]","[-0.053726196, 0.058654785, 0.15679932, 0.2163..."
4,"[suck, hear, hate, day, like]","[0.05214844, 0.03173828, 0.09024353, 0.0959961..."
...,...,...
41502,"[fuck, internet, damn, time, warner]","[0.023242187, -0.06225586, -0.018005371, 0.217..."
41503,"[look, forward, android, 15, push, g1]","[-0.059326172, 0.030419922, -0.070410155, 0.06..."
41504,"[good, waste, time]","[-0.068359375, 0.21289062, 0.12904866, 0.14469..."
41505,"[u, great, always, east, germany, noko, least,...","[0.0022521974, 0.013061523, 0.10640259, 0.1460..."


## vectorisation 

In [11]:
# get a dataframe with lemmes and pos and label :

pos = get_document(df_tokens['pos'].tolist())
pos = [doc.split() for doc in pos]
labels = df['label'].tolist()

In [16]:
corpus = pd.DataFrame(data = [(doc, p, label) for doc, p, label in zip(documents, pos, labels)], columns= ['text', 'pos', 'label'])
corpus = corpus[corpus['text'] != '']
corpus

Unnamed: 0,text,pos,label
0,cook microwave pizza yummy,"[VBG, NN, NN, NN]",2
1,plan allow sub task show widget,"[NNS, VBG, NN, NNS, VBP, VB]",1
2,love humor reword like say group therapy inste...,"[VB, NN, VBN, IN, VBG, NN, NN, RB, VBD, VBG, N...",2
3,naw idk ur talkin,"[JJ, NN, JJ, NN]",1
4,suck hear hate day like,"[NNS, VBP, JJ, NNS, IN]",0
...,...,...,...
41639,fuck internet damn time warner,"[JJ, NN, NN, NN, NN]",0
41640,look forward android 15 push g1,"[VBG, RB, JJ, CD, VBD, NN]",1
41641,good waste time,"[JJ, VBD, NN]",0
41642,u great always east germany noko least provoke...,"[JJ, JJ, RB, VBP, JJ, RB, JJS, JJ, VBP, CD, NN]",2


In [17]:
from sklearn.preprocessing import OneHotEncoder

all_pos_tags = list(set(pos for tag in corpus['pos'] for pos in tag))
one_hot_encoder = OneHotEncoder(sparse_output=False, categories=[all_pos_tags])

pos_vectors = []
for tags in corpus['pos']:
    pos_vectors.append(np.sum(one_hot_encoder.fit_transform([[tag] for tag in tags]), axis=0))


new_df = pd.DataFrame(data = pos_vectors, columns=all_pos_tags)
new_df['text'] = corpus['text']
new_df['label'] = corpus['label']


new_df = new_df[['text'] + all_pos_tags + ['label']]
new_df

Unnamed: 0,text,VBN,PDT,WDT,EX,PRP$,DT,FW,RB,$,...,WRB,VB,VBP,WP$,RP,IN,TO,JJR,NNP,label
0,cook microwave pizza yummy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,plan allow sub task show widget,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,love humor reword like say group therapy inste...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
3,naw idk ur talkin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,suck hear hate day like,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41502,happy mother day woman men make mother,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
41503,ugly programme open,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41504,aww anyone see hollie steel bgt tonight wharra...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41505,routine nt last forever,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
