In [1]:
from gensim.models import Word2Vec
import pandas as pd
import re
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_short, stem_text

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from datasets import load_dataset

In [2]:
# Fetch the "ParaCrawl" configuration of the "large_spanish_corpus" dataset.
dataset_corpus = load_dataset(
    path="large_spanish_corpus",
    name="ParaCrawl",
)

In [3]:
# Display the loaded DatasetDict to inspect its splits, features and row counts.
dataset_corpus

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 15510649
    })
})

In [4]:
# Work with only the first one million rows of the train split.
subset = dataset_corpus["train"].select(indices=range(1_000_000))

In [5]:
# Peek at the first two raw sentences.
subset[:2]

{'text': ['lavado de cerebro a través de los medios de comunicación, y amenaza de fuerza a través de los militares.',
  'Sin un constante aluvión de doble cañón, requiriendo la complicidad de los seres humanos para reprimir y engañar a sus semejantes, su tan cacareada magia rápidamente se desvanecería y se disiparía.']}

In [6]:
# Fetch the NLTK resources used by clean_text below:
# "stopwords" is read through nltk.corpus.stopwords ...
nltk.download("stopwords")
# ... and "punkt" is the tokenizer data used by word_tokenize.
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mmenendezg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def clean_text(sentece_batch):
    """Normalise and tokenise a batch of Spanish sentences.

    Written to be used with ``Dataset.map(..., batched=True)``: the batch is a
    mapping whose ``"text"`` entry is a list of raw strings, and the returned
    mapping replaces that entry with one token list per input string.

    Each sentence is lower-cased; URLs, ``@mentions`` and ``#hashtags`` are
    removed; punctuation, digits and one-character words are stripped; the
    remainder is tokenised and Spanish stop words are dropped.

    Args:
        sentece_batch: Mapping with a ``"text"`` key holding a list of strings.

    Returns:
        dict: ``{"text": tokens_per_sentence}`` where ``tokens_per_sentence``
        is a list (same length and order as the input) of lists of tokens.
    """
    # The stop-word set does not depend on the sentence, so build it once per
    # batch instead of once per sentence.
    stop_words = set(stopwords.words("spanish"))

    cleaned_text_list = []
    for text in sentece_batch["text"]:
        # Lowercase the text
        text = text.lower()
        # Remove URLs
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        # Remove social media mentions and hashtags
        text = re.sub(r"\@\w+|\#\w+", "", text)
        # Remove (ASCII) punctuation
        text = strip_punctuation(text)
        # strip_punctuation only covers ASCII punctuation, so the Spanish
        # opening marks would otherwise stay attached to the following word
        # (e.g. "¿qué") and dodge the stop-word filter.
        text = re.sub(r"[¿¡]", " ", text)
        # Remove the numbers
        text = strip_numeric(text)
        # Remove one-character words (minsize is the minimum length kept)
        text = strip_short(text, minsize=2)
        # Tokenise and drop the stop words
        word_tokens = word_tokenize(text)
        cleaned_text_list.append(
            [word for word in word_tokens if word not in stop_words]
        )

    return {"text": cleaned_text_list}

In [8]:
# Run clean_text over the subset in batches; the "text" column becomes token lists.
sentences_corpus = subset.map(
    function=clean_text,
    batched=True,
)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [15]:
# Preview the first three tokenised sentences. Slice the rows first so only
# three examples are read, rather than loading the whole "text" column and
# then discarding all but three entries.
sentences_corpus[:3]["text"]

[['lavado',
  'cerebro',
  'través',
  'medios',
  'comunicación',
  'amenaza',
  'fuerza',
  'través',
  'militares'],
 ['constante',
  'aluvión',
  'doble',
  'cañón',
  'requiriendo',
  'complicidad',
  'seres',
  'humanos',
  'reprimir',
  'engañar',
  'semejantes',
  'tan',
  'cacareada',
  'magia',
  'rápidamente',
  'desvanecería',
  'disiparía'],
 ['realidad',
  'nuevo',
  'om',
  'sólo',
  'puede',
  'mantener',
  'ilusión',
  'supremacía',
  'mágica',
  'siempre',
  'reprima',
  'desvíe',
  'potencial',
  'humano',
  'mora',
  'verdadera',
  'magia',
  'decir',
  'capacidad',
  'innata',
  'especie',
  'magia',
  'interactiva',
  'poderes',
  'animación',
  'diosa',
  'planetaria']]

In [16]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenised sentences:
# 100-dimensional vectors, context window of 5, words seen fewer than
# 2 times are ignored, 8 worker threads.
model = Word2Vec(
    sentences=sentences_corpus["text"],
    vector_size=100,
    window=5,
    min_count=2,
    workers=8,
    sg=1,
)

# Persist the trained model to disk.
model.save("word2vec.model")

In [17]:
# Look up the learned embedding for the word "rey".
model.wv.get_vector("rey")

array([ 0.16017072, -0.32289705, -0.12332602,  0.18084925, -0.06713438,
        0.21398501,  0.03398271,  0.78280765, -0.47395843, -0.67224306,
       -0.74776715, -0.6597449 , -0.31700933,  0.25756398, -0.249013  ,
       -0.1925262 , -0.44309142, -0.51708907, -0.37307623, -0.1748599 ,
        0.22401686,  0.25190222,  1.2153071 ,  0.22841558,  0.34427357,
        0.30287328, -0.08658863, -0.21468261, -0.6042372 ,  0.05414255,
       -0.6058548 ,  0.80807227,  0.27611133, -0.6505351 , -0.7418964 ,
        0.47726482, -0.76335716, -0.07873221,  0.02602187, -0.32445833,
        0.04472232, -0.40263155,  0.57207894,  0.06032294, -0.58783877,
       -0.13988917, -0.2029182 , -0.5036995 , -0.14719863,  0.26365215,
       -0.2904254 , -1.2020726 ,  0.3463537 , -0.02320538, -0.34795463,
       -0.42241365, -0.20329475, -0.47643566, -0.06420347,  0.01502857,
        0.37856996, -0.35347977,  0.5463576 ,  0.29986218,  0.57905644,
        0.7436612 , -0.00332628, -0.17041057, -0.3477728 , -0.07

In [20]:
# Three nearest neighbours of "television" in the embedding space.
# NOTE(review): the query is spelled without an accent; the corpus keeps
# accents (e.g. "comunicación"), so "televisión" may be the intended word — confirm.
model.wv.most_similar(positive=["television"], topn=3)

[('gsm', 0.8859359622001648),
 ('entertainment', 0.8636576533317566),
 ('player', 0.8603487610816956)]

In [22]:
# Keyed vectors of the trained model.
word_vectors = model.wv
# Vocabulary entries, in the same order as the rows of the embedding matrix.
words = word_vectors.index_to_key
# Embedding matrix (one row per vocabulary entry).
vectors = word_vectors.vectors

# Save the Embeddings

In [23]:
# Write the embedding matrix as tab-separated values, without the index column.
df_vectors = pd.DataFrame(data=vectors)
df_vectors.to_csv("embeddings.tsv", index=False, sep="\t")

In [None]:
# Write the vocabulary (one word per row) as tab-separated values, without the index column.
df_words = pd.DataFrame(data=words)
df_words.to_csv("words.tsv", index=False, sep="\t")