## Import the Dependencies

In [1]:
from __future__ import annotations

import os
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import load_npz, save_npz

## 1re étape: pre-processing the text

In [2]:
# Load the training data; the trailing expression displays its (rows, columns) shape.
df = pd.read_csv('train.csv')
df.shape

(31232, 4)

In [3]:
# Column of raw texts; get_info() reads this name as a global.
texts = df['text']

# Compiled once at definition time instead of on every call.
_TEXT_PATTERN = re.compile(
  r'(<.+?>)'           # HTML tags
  r'|([@#]\w+)'        # Mentions and hashtags ('|' is not an operator inside [...])
  r'|([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # E-mail addresses
  r'|(https?://\S+)'   # URLs starting with http:// or https://
  r'|(www\.[^\s]+)'    # URLs starting with www.
  r'|([\U00010000-\U0010ffff])'  # Emojis and other characters beyond U+FFFF
  r'|([^\x00-\xFF])'   # Anything outside the 0-255 code point range
  r'|([^\w\s])'        # Any remaining character that is neither word nor whitespace
)

_PUNCTUATION = frozenset(string.punctuation)

# First letter of a Penn Treebank tag -> WordNet POS; anything else falls back to noun,
# which is also WordNetLemmatizer's own default.
_WORDNET_POS = {'J': 'a', 'V': 'v', 'R': 'r', 'N': 'n'}

# Filled on first use so the stop-word list and the lemmatizer are built once,
# not once per text.
_NLP_CACHE = {}


def _nlp_resources():
  """Return (stop_words, lemmatizer), creating them on the first call only."""
  if not _NLP_CACHE:
    _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
  return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def cleaning_text(text):
  """Clean one raw text and return its tokens with lemma and POS tag.

  Steps: strip HTML tags, mentions/hashtags, e-mails, URLs, non-Latin-1
  characters and punctuation; lowercase; tokenize; drop punctuation tokens and
  English stop words; POS-tag; lemmatize each token using its POS.

  Args:
    text: The raw text. Non-string values (e.g. the float NaN pandas yields
      for a missing cell) are treated as empty.

  Returns:
    A list of [token, lemma, pos] lists, in token order. Empty when the input
    is not a string or nothing is left after cleaning.
  """
  # A missing cell is not a str; returning no rows keeps the caller's
  # one-entry-per-text alignment instead of raising TypeError in re.sub.
  if not isinstance(text, str):
    return []

  text = _TEXT_PATTERN.sub('', text).lower()
  if not text.strip():
    # Nothing to tokenize; same result the tokenizers would give for blank input.
    return []

  stop_words, lemmatizer = _nlp_resources()
  tokens = [
    word
    for sentence in nltk.sent_tokenize(text)
    for word in nltk.word_tokenize(sentence)
    if word not in _PUNCTUATION and word not in stop_words
  ]

  data = []
  for token, pos in nltk.pos_tag(tokens):
    wordnet_pos = _WORDNET_POS.get(pos[:1], 'n')
    lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
    data.append([token, lemma, pos])

  return data

def get_info():
  """Run cleaning_text over every entry of the global `texts` and flatten the rows.

  Returns a list of [token, lemma, pos] rows. After the rows of each text an
  all-empty row ['', '', ''] is appended to mark where that text ends.
  """
  rows = []
  for raw_text in texts:
    rows += cleaning_text(raw_text)
    rows.append([''] * 3)
  return rows


In [4]:
# Reuse the cached token table when it exists; otherwise build it and cache it.
# keep_default_na=False: by default read_csv parses strings such as 'nan' or
# 'null' (and empty fields) as NaN, which would corrupt real tokens.
if os.path.exists("./tokens.csv"):
  df_tokens = pd.read_csv("./tokens.csv", keep_default_na=False)
else:
  data = get_info()
  df_tokens = pd.DataFrame(data, columns = ['token', 'lemma', 'pos'])
  df_tokens.to_csv("./tokens.csv", index = False)

In [5]:
# Re-read the cache so df_tokens is the same whether it was just built or loaded.
# keep_default_na=False keeps every field a string: the blank separator rows stay ''
# and tokens like 'nan' or 'null' are no longer turned into NaN (and then into
# spurious '' separators), so no fillna / in-place mutation is needed.
df_tokens = pd.read_csv('./tokens.csv', keep_default_na=False)

In [6]:
# Display the token table: one row per token, with an all-blank row after each text.
df_tokens

Unnamed: 0,token,lemma,pos
0,cooking,cook,VBG
1,microwave,microwave,NN
2,pizzas,pizza,NN
3,yummy,yummy,NN
4,,,
...,...,...,...
338804,hours,hour,NNS
338805,,,
338806,missed,miss,VBN
338807,play,play,NN


## 2e étape: TF-IDF

In [7]:
# Flat list of tokens in row order; '' entries are the per-text separators.
tokens = df_tokens['token'].tolist()

def get_document(tokens: list[str]) -> list[str]:
  """Rebuild one space-joined document per text from a flat token list.

  Args:
    tokens: Tokens in order, where an empty string '' marks the end of a text.

  Returns:
    One string per text, in order. A text with no tokens yields ''. Tokens
    that follow the last separator form a final document.
  """
  docs = []
  current = []
  for token in tokens:
    if token == '':
      docs.append(' '.join(current))
      current = []
    else:
      current.append(token)
  # The previous implementation only emitted a document when it met a separator,
  # so tokens after the last '' were silently dropped.
  if current:
    docs.append(' '.join(current))
  return docs

# One space-joined string per text, built from the flat token list.
docs = get_document(tokens)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix

docs = np.array(docs)
tfidf = TfidfVectorizer(use_idf = True, norm = 'l2', smooth_idf=True)
# fit_transform already returns a SciPy sparse matrix. Calling .toarray() here
# would allocate a dense (n_docs x vocabulary) array only to convert it straight
# back to CSR, so the matrix is kept sparse throughout.
tfidf_matrix = tfidf.fit_transform(docs)
csr = csr_matrix(tfidf_matrix, dtype = float)


In [9]:
# Cache the TF-IDF matrix on disk in SciPy's sparse .npz format.
# np.savez cannot store a sparse matrix natively: it pickles it into a 0-d object
# array, so the value read back from 'arr_0' was not a matrix and loading required
# allow_pickle=True. save_npz/load_npz round-trip the CSR matrix itself.
# NOTE: a ./tfidf_matrix.npz written by the earlier np.savez version is not
# readable by load_npz; delete it once so it gets regenerated.
if not os.path.exists('./tfidf_matrix.npz'):
  save_npz('./tfidf_matrix.npz', csr)
csr = load_npz('./tfidf_matrix.npz')

## 3e étape: Word2Vec

In [10]:
# Rebuild the documents from df_tokens rather than from `tokens`: the next line
# rebinds `tokens` to a list of token lists, so reading `tokens` here would make
# this cell fail (or return nothing) when it is run a second time.
docs = get_document(tokens= df_tokens['token'].tolist())
tokens = [doc.split() for doc in docs]

In [11]:
from gensim.models.keyedvectors import KeyedVectors

# Load pre-trained word vectors from a local file in binary word2vec format.
path = './GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(path, binary= True)

In [12]:
# Sanity check: the five entries most similar to 'king'.
model.most_similar('king',topn=5)

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474)]

In [13]:
# Analogy query (king - man + woman); only the top match is requested.
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.7118193507194519)]

In [14]:
# Ask the model which of the four words fits least with the others.
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [15]:
# Similarity score between the vectors of 'woman' and 'man'.
model.similarity('woman', 'man')

0.76640123