## Import the Dependecies

In [1]:
import pandas as pd
from __future__ import annotations
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

## Putting all the csv files into one file

In [2]:
train_data = 'data/train.csv'
test_data = 'data/test.csv'
valid_data = 'data/test.csv'

file = "multiclass_dataset.csv"

def save_load_df(file:str):
    if os.path.exists(file):
        df = pd.read_csv(file, index_col= 0)
    else:
        df = pd.concat(map(pd.read_csv, [train_data, test_data, valid_data]), axis= 0, ignore_index=True)
        df.to_csv(file, columns= ['id', 'text', 'label', 'sentiment'])
        df = pd.read_csv(file, index_col= 0)

    return df

df = save_load_df(file=file)
df
# (41644-41511) rows

Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative
...,...,...,...,...
41639,10277,Fuck no internet damn time warner!,0,negative
41640,8610,Looking forward to android 1.5 being pushed t...,1,neutral
41641,8114,Not good. Wasted time.,0,negative
41642,3034,"U were great, as always. But, can`t we do an ...",2,positive


## 1e etape: pre-precessing the text

In [3]:
def cleaning_text(text):
    text_pattern = re.compile(
        r'(<.+?>)'         # Balises HTML
        r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Emails
        r'|(https?\W+[^\s]+)'  # URLs commençant par http ou https
        r'|(https?://[^\s\n\r]+)' # URLs commençant par http ou https
        r'|(www\.[^\s]+)'      # URLs commençant par www
        r'|([\U00010000-\U0010ffff])'  # Émojis et autres caractères au-delà de l'ASCII étendu
        r'|([^\x00-\xFF])'     # Tout ce qui n'est pas en ASCII étendu (0-255)
    )
    text = text_pattern.sub('', str(text))
    text = text.lower()
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    tokens = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
      words = nltk.word_tokenize(sentence)
      for word in words:
        if word not in stop_words:
          word = ''.join([c for c in word if c not in punctuation])
          if word == '':
              continue
          tokens.append(word)
    
    # get the part of speech
    pos_tags = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    data = []
    for token, pos in pos_tags:
        if pos.startswith('J'):
          lemma = lemmatizer.lemmatize(token, pos = 'a')
        elif pos.startswith('V'):
          lemma = lemmatizer.lemmatize(token, pos = 'v')
        elif pos.startswith('RB'):
          lemma = lemmatizer.lemmatize(token, pos = 'r')
        elif pos.startswith('N'):
          lemma = lemmatizer.lemmatize(token, pos = 'n')
        else:
          lemma = lemmatizer.lemmatize(token)
        data.append([token, lemma, pos])
    return data

def get_info(texts):
    data = []
    for text in texts:
        text = cleaning_text(text)
        data.extend(text)
        data.append(['', '', ''])
    return data

In [4]:
texts = df['text']

if os.path.exists("./token_lemma_pos.csv"):
    df_tokens = pd.read_csv("./token_lemma_pos.csv", index_col= 0)
else:
    data = get_info(texts = texts)
    df_tokens = pd.DataFrame(data, columns = ['token', 'lemma', 'pos'])
    df_tokens.to_csv("./token_lemma_pos.csv")
    df_tokens = pd.read_csv('./token_lemma_pos.csv', index_col= 0)

# df_tokens = df_tokens.dropna()
df_tokens = df_tokens.fillna("")
df_tokens


Unnamed: 0,token,lemma,pos
0,cooking,cook,VBG
1,microwave,microwave,NN
2,pizzas,pizza,NN
3,yummy,yummy,NN
4,,,
...,...,...,...
447665,live,live,JJ
447666,live,live,JJ
447667,tx,tx,NN
447668,visit,visit,NN


## 2e etape: TF-IDF

In [5]:
lemma = df_tokens['lemma'].tolist()

def get_document(lemma: list[str]):
  docs = []
  for i in range(0, len(lemma)):
    for j in range(i, len(lemma)):
      if lemma[j] == '':
        docs.append(' '.join(lemma[i:j]))
        i = j + 1
    break
  return docs

docs = get_document(lemma)
docs

['cook microwave pizza yummy',
 'plan allow sub task show widget',
 'love humor reword like say group therapy instead gang bang keep mom back hahaha',
 'naw idk ur talkin',
 'suck hear hate day like',
 'umm yeah probably pretty good note self eeeeeewwwwwwww',
 'whatever mean',
 'would panic little maybe read orbitron gym like else fails',
 'sad people phone dead',
 'sad face',
 'cafe management end month nothing number arty little head hurt find discrepencies',
 'basically listen miley cyrus fourteen hour love',
 'think finale think kind cheap way',
 'think boat sail friend cco month ago wish luck though',
 'give mommy mother day present ticket aerosmith favorite band everr yay good reaction',
 'installed office mac mess font safari stick help even read solution online badmicrosoft',
 'welcome',
 'ok hungry fat know know aha',
 'early monday cram yay week leave',
 'ok ff soooo anyone follow meeee lol ughhhh lazy follower wont even help',
 'ruth today',
 'bye plurk muna back im do readi

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix

docs = np.array(docs)
tfidf = TfidfVectorizer(use_idf = True, norm = 'l2', smooth_idf=True)
tfidf_matrix = tfidf.fit_transform(docs).toarray()
csr = csr_matrix(tfidf_matrix ,dtype = float)


In [7]:
# save the tfidf_matrix

if os.path.exists('./tfidf_matrix.npz'):
  csr = np.load('./tfidf_matrix.npz', allow_pickle=True)
else:
  np.savez('./tfidf_matrix', csr)
  csr = np.load('./tfidf_matrix.npz', allow_pickle= True)
csr = csr['arr_0']
print(csr)

  (0, 5982)	0.45094812378426613
  (0, 15425)	0.6228628203795099
  (0, 18453)	0.45094812378426613
  (0, 27452)	0.4531373812650889
  (1, 1825)	0.4440330675714078
  (1, 18475)	0.41603681735146036
  (1, 21642)	0.3494264788785262
  (1, 22990)	0.5218517556880837
  (1, 23619)	0.30228628985760325
  (1, 26653)	0.37939805141715954
  (2, 2918)	0.16513011885700035
  (2, 3058)	0.31993986163249
  (2, 10000)	0.37050800626997316
  (2, 10705)	0.26425777335008327
  (2, 10923)	0.2559349096370054
  (2, 11829)	0.34442312086556537
  (2, 12449)	0.21716639348178488
  (2, 13357)	0.1755748953893207
  (2, 14143)	0.13495608332912892
  (2, 14526)	0.14226866495279525
  (2, 15769)	0.2089613208809932
  (2, 20337)	0.4118515714715681
  (2, 20954)	0.17543394940128412
  (2, 24001)	0.34442312086556537
  (3, 11994)	0.47038319982518934
  :	:
  (41649, 26211)	0.47692822218924075
  (41650, 240)	0.43360704805896594
  (41650, 2050)	0.37395851930098445
  (41650, 9552)	0.37255957920202354
  (41650, 9910)	0.5270193549443537
  (416

## 3e etape: word2vec

In [8]:
from gensim.models import KeyedVectors

word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary= True)

In [9]:
vocab = set(list(word2vec_model.key_to_index.keys()))
docs = get_document(lemma= lemma)
lemmas = [doc.split() for doc in docs]

In [None]:
df_tokens['word2vec'] = df_tokens['lemma'].apply(lambda word: word2vec_model[word] if word in vocab else None)

Unnamed: 0,token,lemma,pos,word2vec
0,cooking,cook,VBG,"[-0.23242188, 0.09033203, 0.078125, 0.12695312..."
1,microwave,microwave,NN,"[-0.30078125, -0.06689453, 0.075683594, 0.3242..."
2,pizzas,pizza,NN,"[-0.12597656, 0.025390625, 0.16699219, 0.55078..."
3,yummy,yummy,NN,"[-0.18945312, -0.06591797, -0.041748047, 0.433..."
4,,,,
...,...,...,...,...
447665,live,live,JJ,"[0.016967773, 0.017333984, -0.041748047, 0.126..."
447666,live,live,JJ,"[0.016967773, 0.017333984, -0.041748047, 0.126..."
447667,tx,tx,NN,"[0.022949219, 0.049804688, -0.10546875, 0.3300..."
447668,visit,visit,NN,"[-0.12597656, 0.12451172, 0.0035858154, 0.0156..."


In [17]:
df2 = pd.DataFrame(data = [(lemma, word2vec) \
                           for lemma, word2vec in zip(df_tokens['lemma'], df_tokens['word2vec']) \
                            if word2vec is not None], columns= ['lemma', 'word2vec'])

if os.path.exists('word2vec.csv'):
    df2 = pd.read_csv('word2vec.csv')
else:
    df2.to_csv("word2vec.csv")
    df2 = pd.read_csv('word2vec.csv')

df2

Unnamed: 0,lemma,word2vec
0,cook,[-2.32421875e-01 9.03320312e-02 7.81250000e-...
1,microwave,[-0.30078125 -0.06689453 0.07568359 0.324218...
2,pizza,[-1.25976562e-01 2.53906250e-02 1.66992188e-...
3,yummy,[-1.89453125e-01 -6.59179688e-02 -4.17480469e-...
4,plan,[ 0.07861328 0.09814453 0.16894531 0.083496...
...,...,...
387223,use,[ 0.11279297 -0.13085938 0.06689453 0.138671...
387224,live,[ 1.69677734e-02 1.73339844e-02 -4.17480469e-...
387225,live,[ 1.69677734e-02 1.73339844e-02 -4.17480469e-...
387226,tx,[ 2.29492188e-02 4.98046875e-02 -1.05468750e-...


In [18]:
# lemmas = remove_words(lemmas, vocab)
phr2vec = []
for phrase in lemmas:
    if len(phrase) == 0:
        continue
    mean_vect = word2vec_model.get_mean_vector(keys = phrase, pre_normalize = False)
    phr2vec.append({
        'phrase': phrase,
        'phrase2vec': mean_vect,
    })

df3 = pd.DataFrame(phr2vec)


In [19]:
if os.path.exists('phrase2vec.csv'):
    df3 = pd.read_csv('phrase2vec.csv')
else:
    df3.to_csv("phrase2vec.csv")
    df3 = pd.read_csv('phrase2vec.csv')

df3

Unnamed: 0,phrase,phrase2vec
0,"['cook', 'microwave', 'pizza', 'yummy']",[-0.2121582 -0.00427246 0.06976318 0.358886...
1,"['plan', 'allow', 'sub', 'task', 'show', 'widg...",[ 0.02561442 0.02229945 0.04549154 0.049357...
2,"['love', 'humor', 'reword', 'like', 'say', 'gr...",[ 0.05505371 -0.00522178 0.05345481 0.163966...
3,"['naw', 'idk', 'ur', 'talkin']",[-5.37261963e-02 5.86547852e-02 1.56799316e-...
4,"['suck', 'hear', 'hate', 'day', 'like']",[ 0.05214844 0.03173828 0.09024353 0.095996...
...,...,...
41506,"['fuck', 'internet', 'damn', 'time', 'warner']",[ 0.02324219 -0.06225586 -0.01800537 0.217578...
41507,"['look', 'forward', 'android', '15', 'push', '...",[-5.93261719e-02 3.04199215e-02 -7.04101548e-...
41508,"['good', 'waste', 'time']",[-6.83593750e-02 2.12890625e-01 1.29048660e-...
41509,"['u', 'great', 'always', 'east', 'germany', 'n...",[ 0.0022522 0.01306152 0.10640259 0.146069...


## vectorisation 

In [20]:
# Vectorisez les POS Tags et les Catégories de votre corpus en choisissant la méthode appropriée à chaque type d’information.

In [25]:
for lemma, pos in zip(df_tokens['lemma'], df_tokens['pos']):
    print(lemma, pos)

cook VBG
microwave NN
pizza NN
yummy NN
 
plan NNS
allow VBG
sub NN
task NNS
show VBP
widget VB
 
love VB
humor NN
reword VBN
like IN
say VBG
group NN
therapy NN
instead RB
gang VBD
bang VBG
keep NNS
mom NNS
back RB
hahaha VBP
 
naw JJ
idk NN
ur JJ
talkin NN
 
suck NNS
hear VBP
hate JJ
day NNS
like IN
 
umm JJ
yeah NN
probably RB
pretty RB
good JJ
note NN
self NN
eeeeeewwwwwwww NN
 
whatever WDT
mean NN
 
would MD
panic VB
little JJ
maybe RB
read VB
orbitron RP
gym NNS
like IN
else RB
fails NNS
 
sad JJ
people NNS
phone NNS
dead JJ
 
sad JJ
face NN
 
cafe JJ
management NN
end NN
month NN
nothing NN
number NNS
arty VBP
little JJ
head NN
hurt VBZ
find VBG
discrepencies NNS
 
basically RB
listen VBN
miley NN
cyrus NN
fourteen JJ
hour NNS
love VBP
 
think VB
finale JJ
think VBN
kind NN
cheap JJ
way NN
 
think VB
boat NN
sail VBD
friend JJ
cco NN
month NNS
ago IN
wish JJ
luck NN
though IN
 
give VBD
mommy JJ
mother NN
day NN
present JJ
ticket NNS
aerosmith VBP
favorite JJ
band NN
everr NN
y