# Gensim Word2Vec

# Briefing about Word2Vec:

<img src="http://mccormickml.com/assets/word2vec/skip_gram_net_arch.png" alt="drawing" width="550"/>

In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
import spacy

In [3]:
spacy.__version__

'3.7.2'

In [7]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/michalpalinski/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michalpalinski/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michalpalinski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocessing

In [8]:
df = pd.read_csv('../data/clean/sec_pol_clean.csv')
df.shape

(819, 3)

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,f_name,cleaned_txt
0,0,../data/sec_POL/Fakt/Factiva-20230808-125,Ksiądz Janusz Koplewski l Odwagi Wirus to nie ...
1,1,../data/sec_POL/Rzeczpospolita/Factiva-2023080...,Kraj bez Boga Wörter Januar © Copyright Al...
2,2,../data/sec_POL/Rzeczpospolita/Factiva-2023080...,Jaka Polska po rządach PiS Wörter Januar © ...
3,3,../data/sec_POL/Rzeczpospolita/Factiva-2023080...,Watykan Rosja Chiny W co naprawdę gra Francisz...
4,4,../data/sec_POL/Rzeczpospolita/Factiva-2023080...,Górski Cieślik Dokąd jechać dokąd nie i dlacze...


In [10]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

Unnamed: 0     0
f_name         0
cleaned_txt    0
dtype: int64

In [11]:
# df['title']=df['title'].fillna('')
df['text_all']=df['cleaned_txt']

## Cleaning:
We are lemmatizing and removing the stopwords and non-alphabetic characters for each line of dialogue.

In [12]:
# %%capture 
!python -m spacy download pl_core_news_sm

Collecting pl-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.7.0/pl_core_news_sm-3.7.0-py3-none-any.whl (20.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: pl-core-news-sm
Successfully installed pl-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_sm')


In [13]:
nlp = spacy.load('pl_core_news_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 5:
        return ' '.join(txt)

Removes non-alphabetic characters:

In [14]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['text_all'])

Taking advantage of spaCy .pipe() attribute to speed-up the cleaning process:

In [None]:
# t = time()

# txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1, cleanup=True)]

# print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

In [15]:
import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word) > 3]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

In [16]:
df=df[~df['text_all'].isna()]

In [17]:
docs=df['text_all'].tolist()

In [18]:
final_corpus = [preprocess_text(sentence) for sentence in docs if sentence.strip() !='']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

In [19]:
df['clean']=final_corpus

In [20]:
df_clean=df

Put the results in a DataFrame to remove missing values and duplicates:

## Bigrams:
We are using Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences.
https://radimrehurek.com/gensim/models/phrases.html

The main reason we do this is to catch words like "mr_burns" or "bart_simpson" !

In [21]:
from gensim.models.phrases import Phrases, Phraser

As `Phrases()` takes a list of list of words as input:

In [22]:
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 16:36:29: collecting all words and their counts
INFO - 16:36:29: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 16:36:30: collected 676918 token types (unigram + bigrams) from a corpus of 771399 words and 819 sentences
INFO - 16:36:30: merged Phrases<676918 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 16:36:30: Phrases lifecycle event {'msg': 'built Phrases<676918 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 1.25s', 'datetime': '2024-01-18T16:36:30.312982', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:19:27) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
INFO - 16:36:30: exporting phrases from Phrases<676918 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 16:36:31: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<199 phrases, min_count=30, threshold=10.0> from Phrases<676918 vocab, min_count=30, threshold=10.0, max_vocab_size=40000

Creates the relevant phrases from the list of sentences:

## Most Frequent Words:
Mainly a sanity check of the effectiveness of the lemmatization, removal of stopwords, and addition of bigrams.

In [23]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

96658

In [24]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['jest',
 'przez',
 'tego',
 'tylko',
 'może',
 'będzie',
 'które',
 'który',
 'roku',
 'jego']

# Training the model
## Gensim Word2Vec Implementation:
We use Gensim implementation of word2vec: https://radimrehurek.com/gensim/models/word2vec.html

In [25]:
import multiprocessing

from gensim.models import Word2Vec

## Why I seperate the training of the model in 3 steps:
I prefer to separate the training in 3 distinctive steps for clarity and monitoring.
1. `Word2Vec()`: 
>In this first step, I set up the parameters of the model one-by-one. <br>I do not supply the parameter `sentences`, and therefore leave the model uninitialized, purposefully.
2. `.build_vocab()`: 
>Here it builds the vocabulary from a sequence of sentences and thus initialized the model. <br>With the loggings, I can follow the progress and even more important, the effect of `min_count` and `sample` on the word corpus. I noticed that these two parameters, and in particular `sample`, have a great influence over the performance of a model. Displaying both allows for a more accurate and an easier management of their influence.
3. `.train()`:
>Finally, trains the model.<br>
The loggings here are mainly useful for monitoring, making sure that no threads are executed instantaneously.

In [26]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

## The parameters:

* `min_count` <font color='purple'>=</font> <font color='green'>int</font> - Ignores all words with total absolute frequency lower than this - (2, 100)


* `window` <font color='purple'>=</font> <font color='green'>int</font> - The maximum distance between the current and predicted word within a sentence. E.g. `window` words on the left and `window` words on the left of our target - (2, 10)


* `size` <font color='purple'>=</font> <font color='green'>int</font> - Dimensionality of the feature vectors. - (50, 300)


* `sample` <font color='purple'>=</font> <font color='green'>float</font> - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influencial.  - (0, 1e-5)


* `alpha` <font color='purple'>=</font> <font color='green'>float</font> - The initial learning rate - (0.01, 0.05)


* `min_alpha` <font color='purple'>=</font> <font color='green'>float</font> - Learning rate will linearly drop to `min_alpha` as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00


* `negative` <font color='purple'>=</font> <font color='green'>int</font> - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used. - (5, 20)


* `workers` <font color='purple'>=</font> <font color='green'>int</font> - Use these many worker threads to train the model (=faster training with multicore machines)

In [28]:
w2v_model = Word2Vec(min_count=20,
                     window=5,
                     # size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

INFO - 16:36:44: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.03>', 'datetime': '2024-01-18T16:36:44.818611', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:19:27) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


## Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [29]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:36:46: collecting all words and their counts
INFO - 16:36:46: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:36:47: collected 96658 word types from a corpus of 757989 raw words and 819 sentences
INFO - 16:36:47: Creating a fresh vocabulary
INFO - 16:36:47: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 6002 unique words (6.21% of original 96658, drops 90656)', 'datetime': '2024-01-18T16:36:47.757087', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:19:27) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 16:36:47: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 487189 word corpus (64.27% of original 757989, drops 270800)', 'datetime': '2024-01-18T16:36:47.757758', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:19:27) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 16:36:47: deleting the raw counts

Time to build vocab: 0.01 mins


## Training of the model:
_Parameters of the training:_
* `total_examples` <font color='purple'>=</font> <font color='green'>int</font> - Count of sentences;
* `epochs` <font color='purple'>=</font> <font color='green'>int</font> - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [30]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=15, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:36:48: Word2Vec lifecycle event {'msg': 'training model with 3 workers on 6002 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=5 shrink_windows=True', 'datetime': '2024-01-18T16:36:48.778136', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 08:19:27) [Clang 14.0.6 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'train'}
INFO - 16:36:49: EPOCH 0: training on 757989 raw words (309692 effective words) took 1.0s, 311200 effective words/s
INFO - 16:36:50: EPOCH 1: training on 757989 raw words (310156 effective words) took 1.0s, 311681 effective words/s
INFO - 16:36:51: EPOCH 2 - PROGRESS: at 79.85% examples, 252956 words/s, in_qsize 6, out_qsize 0
INFO - 16:36:51: EPOCH 2: training on 757989 raw words (309683 effective words) took 1.1s, 269526 effective words/s
INFO - 16:36:52: EPOCH 3 - PROGRESS: at 90.84% examples, 284811 words/s, in_qsize 5, out_qsize 0
INFO - 16:36:53: EPOCH 3: training on 757989 raw words (309401 effective words

Time to train the model: 0.31 mins


As we do not plan to train the model any further, we are calling init_sims(), which will make the model much more memory-efficient:

In [31]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


# Exploring the model
## Most similar to:

In [1]:
w2v_model.wv.most_similar(positive=["epidemia"],topn=70)

### t-SNE visualizations:
t-SNE is a non-linear dimensionality reduction algorithm that attempts to represent high-dimensional data and the underlying relationships between vectors in a lower-dimensional space.<br>
Here is a good tutorial on it: https://medium.com/@luckylwk/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

Our goal in this section is to plot our 300 dimensions vectors into 2 dimensional graphs, and see if we can spot interesting patterns.<br>
For that we are going to use t-SNE implementation from scikit-learn.

To make the visualizations more relevant, we will look at the relationships between a query word (in <font color='red'>**red**</font>), its most similar words in the model (in <font color="blue">**blue**</font>), and other words from the vocabulary (in <font color='green'>**green**</font>).

In [None]:
def tsnescatterplot(model, word, list_names):
    """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
    its list of most similar words, and a list of words.
    """
    arrays = np.empty((0, 300), dtype='f')
    word_labels = [word]
    color_list  = ['red']

    # adds the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
    
    # gets list of most similar words
    close_words = model.wv.most_similar([word])
    
    # adds the vector for each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)
    
    # adds the vector for each of the words from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
        
    # Reduces the dimensionality from 300 to 50 dimensions with PCA
    reduc = PCA(n_components=50).fit_transform(arrays)
    
    # Finds t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)
    
    # Sets everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})
    
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    
    # Basic plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                 }
                    )
    
    # Adds annotations one by one with a loop
    for line in range(0, df.shape[0]):
         p1.text(df["x"][line],
                 df['y'][line],
                 '  ' + df["words"][line].title(),
                 horizontalalignment='left',
                 verticalalignment='bottom', size='medium',
                 color=df['color'][line],
                 weight='normal'
                ).set_size(15)

    
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)
            
    plt.title('t-SNE visualization for {}'.format(word.title()))
    