## Import the Dependecies

In [1]:
import pandas as pd
from __future__ import annotations
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

## 1e etape: pre-precessing the text

In [2]:
df = pd.read_csv('train.csv')
df.shape

(31232, 4)

In [3]:
texts = df['text']

def cleaning_text(text):
  text_pattern = re.compile(
    r'(<.+?>)'         # Balises HTML
    r'|([@|#]\w+)'     # Mentions et hashtags
    r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Emails
    r'|(https?\W+[^\s]+)'  # URLs commençant par http ou https
    r'|(www\.[^\s]+)'      # URLs commençant par www
    r'|([\U00010000-\U0010ffff])'  # Émojis et autres caractères au-delà de l'ASCII étendu
    r'|([^\x00-\xFF])'     # Tout ce qui n'est pas en ASCII étendu (0-255)
)
  text = text_pattern.sub('', str(text))
  text = text.lower()
  punctuation = set(string.punctuation)
  stop_words = set(stopwords.words('english'))
  tokens = []
  sentences = nltk.sent_tokenize(text)

  for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    for word in words:
      if word in punctuation or word in stop_words:
        continue
      word = ''.join([c for c in word if c not in punctuation])
      tokens.append(word)
  # get the part of speech
  pos_tags = nltk.pos_tag(tokens)
  lemmatizer = WordNetLemmatizer()
  data = []
  for token, pos in pos_tags:
    if pos.startswith('J'):
      lemma = lemmatizer.lemmatize(token, pos = 'a')
    elif pos.startswith('V'):
      lemma = lemmatizer.lemmatize(token, pos = 'v')
    elif pos.startswith('R'):
      lemma = lemmatizer.lemmatize(token, pos = 'r')
    elif pos.startswith('N'):
      lemma = lemmatizer.lemmatize(token, pos = 'n')
    else:
      lemma = lemmatizer.lemmatize(token)
    data.append([token, lemma, pos])

  return data

def get_info(texts):
  data = []
  for text in texts:
    text = cleaning_text(text)
    data.extend(text)
    data.append(['', '', ''])
  return data

In [4]:
if os.path.exists("./token_lemma_pos.csv"):
  df_tokens = pd.read_csv("./token_lemma_pos.csv")
else:
  data = get_info(texts = texts)
  df_tokens = pd.DataFrame(data, columns = ['token', 'lemma', 'pos'])
  df_tokens.to_csv("./token_lemma_pos.csv", index = False)

In [5]:
df_tokens = pd.read_csv('./token_lemma_pos.csv')
df_tokens.fillna('', inplace = True)

In [6]:
df_tokens

Unnamed: 0,token,lemma,pos
0,cooking,cook,VBG
1,microwave,microwave,NN
2,pizzas,pizza,NN
3,yummy,yummy,NN
4,,,
...,...,...,...
343782,hours,hour,NNS
343783,,,
343784,missed,miss,VBN
343785,play,play,NN


## 2e etape: TF-IDF

In [7]:
lemma = df_tokens['lemma'].tolist()

def get_document(lemma: list[str]):
  docs = []
  for i in range(0, len(lemma)):
    for j in range(i, len(lemma)):
      if lemma[j] == '':
        docs.append(' '.join(lemma[i:j]))
        i = j + 1
    break
  return docs

docs = get_document(lemma)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix

docs = np.array(docs)
tfidf = TfidfVectorizer(use_idf = True, norm = 'l2', smooth_idf=True)
tfidf_matrix = tfidf.fit_transform(docs).toarray()
csr = csr_matrix(tfidf_matrix ,dtype = float)


In [9]:
# save the tfidf_matrix

if os.path.exists('./tfidf_matrix.npz'):
  csr = np.load('./tfidf_matrix.npz', allow_pickle=True)
else:
  np.savez('./tfidf_matrix', csr)
  csr = np.load('./tfidf_matrix.npz', allow_pickle= True)
csr = csr['arr_0']

## 3e etape: WORD2VEC

In [10]:
from gensim.models import KeyedVectors

path = './GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(path, binary= True)

In [11]:
docs = get_document(lemma= lemma)
lemmas = [doc.split() for doc in docs]

In [12]:
vocab = set(list(model.key_to_index.keys()))

def remove_words(lemmas, vocab):
    for phrase in lemmas:
        for word in phrase:
            if word not in vocab:
                phrase.remove(word)
        if phrase == []:
            lemmas.remove(phrase)
    return lemmas


In [13]:
df_tokens['word2vec'] = df_tokens['lemma'].apply(lambda word: model[word] if word in vocab else None)

In [24]:
df2 = pd.DataFrame(data = [(lemma, word2vec) \
                           for lemma, word2vec in zip(df_tokens['lemma'], df_tokens['word2vec']) \
                            if word2vec is not None], 
                    columns= ['lemma', 'word2vec']
)

if os.path.exists('./word2vec.csv'):
    df2 = pd.read_csv('./word2vec.csv')
else:
    df2.to_csv("./word2vec.csv", index = False)
    df2 = pd.read_csv('./word2vec.csv')

df2

Unnamed: 0.1,Unnamed: 0,lemma,word2vec
0,0,cook,[-2.32421875e-01 9.03320312e-02 7.81250000e-...
1,1,microwave,[-0.30078125 -0.06689453 0.07568359 0.324218...
2,2,pizza,[-1.25976562e-01 2.53906250e-02 1.66992188e-...
3,3,yummy,[-1.89453125e-01 -6.59179688e-02 -4.17480469e-...
4,4,plan,[ 0.07861328 0.09814453 0.16894531 0.083496...
...,...,...,...
290991,290991,start,[-3.44238281e-02 1.03515625e-01 2.16064453e-...
290992,290992,couple,[ 0.09667969 -0.00326538 -0.37109375 0.104492...
290993,290993,hour,[-2.15820312e-01 1.43432617e-02 -7.91015625e-...
290994,290994,miss,[ 2.00195312e-01 -1.54296875e-01 1.45507812e-...


In [25]:
lemmas = remove_words(lemmas, vocab)

df3 = pd.DataFrame(data = [(phrase, model.get_mean_vector(phrase, pre_normalize=False)) \
                               for phrase in lemmas],
                    columns= ['phrase', 'phrase2vec'])

if os.path.exists('./phrase2vec.csv'):
    df3 = pd.read_csv('./phrase2vec.csv')
else:
    df3.to_csv("./phrase2vec.csv", index = False)
    df3 = pd.read_csv('./phrase2vec.csv')

df3

Unnamed: 0,phrase,phrase2vec
0,"['cook', 'microwave', 'pizza', 'yummy']",[-0.2121582 -0.00427246 0.06976318 0.358886...
1,"['plan', 'allow', 'sub', 'task', 'show', 'widg...",[ 0.02561442 0.02229945 0.04549154 0.049357...
2,"['love', 'humor', 'reword', 'like', 'say', 'gr...",[ 0.05505371 -0.00522178 0.05345481 0.163966...
3,"['naw', 'idk', 'ur', 'talkin']",[-5.37261963e-02 5.86547852e-02 1.56799316e-...
4,"['suck', 'hear', 'hate', 'day', 'like']",[ 0.05214844 0.03173828 0.09024353 0.095996...
...,...,...
36974,"['get', 'wrong', 'size', 'coat', 'sheep']",[ 5.19775376e-02 3.69628891e-02 -8.23242217e-...
36975,"['4', 'case', 'swine', 'flu']",[ 0.01879883 -0.00683594 -0.00805664 0.111633...
36976,['excellent'],[-2.12890625e-01 -4.30297852e-03 -1.80664062e-...
36977,"['sit', 'thru', 'bore', 'bit', 'titanic', 'wai...",[ 0.04436701 0.07160811 -0.06126265 0.158513...


In [None]:
# model.similarity(w1 = 'woman', w2 = 'man')

In [None]:
# # king - man + women = queen
# model.most_similar(positive=['king', 'woman'], negative=['man'])

In [None]:
# model.doesnt_match("breakfast cereal dinner lunch".split())

In [None]:
# wv1 = model['king'] - model['man'] + model['women']

In [None]:
# vectors = []

# for token in tokens[1]:
#     out = model['token']
#     vectors.append(out)

# np.array(vectors).shape

In [None]:
# model.get_mean_vector(tokens[2], pre_normalize = True)

In [None]:
# from gensim.test.utils import common_texts
# from gensim.models import Word2Vec

# model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
# model.save("word2vec.model")

In [None]:
# model = model.load('word2vec.model')
# model.train([['hello', 'world'], ['how', 'are', 'you']], total_examples=10, epochs = 1)

In [None]:
# model.train([["hello", "world"]], total_examples=1, epochs=1)