In [None]:
# If you get an error when loading Word2Vec, install scipy==1.12
# !pip install scipy==1.12

# Word Vectors

This notebook demonstrates several text representation techniques used in Natural Language Processing. It starts by defining a simple text corpus with sentences about different text representation methods: OneHot vectors, Bag of Words, TF-IDF, N-grams, and Word Embeddings. The corpus is split into sentences and then into words, with all text converted to lowercase. A vocabulary of unique words is created, and each word is represented as a binary vector using one-hot encoding. The Bag of Words model counts the frequency of each word in each document using CountVectorizer from sklearn. TF-IDF is applied to the corpus using TfidfVectorizer from sklearn, which scales word frequencies by their importance across documents. N-grams are generated using CountVectorizer with an n-gram range of (2, 2), capturing pairs of consecutive words (bigrams) in the corpus. Dense word embeddings are then trained with Word2Vec (gensim) and visualized. Finally, the notebook presents sentence embeddings and tokenizers from the Hugging Face Transformers ecosystem.

# Define a Corpus

In [None]:
# Raw demo corpus: one short sentence per text-representation technique.
# A plain triple-quoted string is enough here; nothing is interpolated, so no f-prefix.
my_corpus = """
    OneHot vectors are binary vectors.
    
    Bag of Words Counts words.
    
    TFIDF Counts words and weights words by importance.
    
    Ngrams Captures words sequences.
    
    Words Embeddings with Dense vectors.
    """
my_corpus

In [None]:
# Split the raw text into sentences on '.', strip surrounding whitespace and
# drop chunks that are empty after stripping (e.g. the tail after the last '.').
# Filtering on the stripped text keeps a final sentence that has no trailing '.'.
# The isinstance guard makes the cell idempotent: re-running it on the already
# split list is a no-op instead of an AttributeError.
if isinstance(my_corpus, str):
    my_corpus = [sentence.strip() for sentence in my_corpus.split('.') if sentence.strip()]
my_corpus

In [None]:
# Whitespace tokenization: lowercase every sentence, then split it into words
tokenized_corpus = [str.split(text.lower()) for text in my_corpus]
tokenized_corpus

# 1-Hot Encoding

In [None]:
# Vocabulary: every distinct token across all documents, in alphabetical order
all_words = sorted({token for tokens in tokenized_corpus for token in tokens})

# Report the vocabulary size, followed by the vocabulary itself
print(f"Vocabulary size: {len(all_words)}")
print(all_words)

In [None]:
import numpy as np
import pandas as pd

# The identity matrix gives one one-hot row per vocabulary word
# (row k has its single 1 in column k)
vocabulary_size = len(all_words)
one_hot_word_vectors = np.identity(vocabulary_size)

# Label the columns with the vocabulary so the table is readable
one_hot_word_vectors_df = pd.DataFrame(data=one_hot_word_vectors, columns=all_words)

# Show the one-hot table
one_hot_word_vectors_df

In [None]:
# Binary (multi-hot) document-term matrix: one row per document, one column per
# vocabulary word; a cell is 1 when the word occurs in the document at least once.
corpus_vectors = np.zeros((len(tokenized_corpus), len(all_words)))

# Map each vocabulary word to its column once, so each token is resolved with an
# O(1) dictionary lookup instead of a scan over the whole vocabulary.
word_to_column = {w: j for j, w in enumerate(all_words)}

for i, doc in enumerate(tokenized_corpus):
    for word in doc:
        j = word_to_column.get(word)
        # Tokens that are not in the vocabulary are simply skipped
        if j is not None:
            corpus_vectors[i, j] = 1

# Label the columns with the vocabulary for readability
corpus_vectors_df = pd.DataFrame(corpus_vectors, columns=all_words)

# Display the document vectors
corpus_vectors_df

In [None]:
# (number of documents, vocabulary size): rows follow tokenized_corpus, columns follow all_words
corpus_vectors_df.shape

# Bag of Words

In [None]:
# Create a Bag of Words representation 
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings
count_vectorizer = CountVectorizer()

# Learn the vocabulary and count term occurrences per document in one step
bow_matrix = count_vectorizer.fit_transform(my_corpus)

# Dense table: one row per document, one column per learned feature name
bow_df = pd.DataFrame(data=bow_matrix.toarray(),
                      columns=count_vectorizer.get_feature_names_out())

# Show the Bag of Words table
print("Bag of Words representation:")
bow_df

In [None]:
# (number of documents, number of features learned by count_vectorizer)
bow_df.shape

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights and compute the TF-IDF scores in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(my_corpus)

# Dense table: one row per document, one column per learned feature name
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(),
                        columns=tfidf_vectorizer.get_feature_names_out())

# Show the TF-IDF table
print("TF-IDF representation:")
tfidf_df

# N-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range is a (min_n, max_n) tuple; (2, 2) keeps bigrams only
bigram_range = (2, 2)
ngram_vectorizer = CountVectorizer(ngram_range=bigram_range)

# Learn the bigram vocabulary and count occurrences per document
ngram_matrix = ngram_vectorizer.fit_transform(my_corpus)

# Show the learned bigram feature names
ngram_vectorizer.get_feature_names_out()

In [None]:
# Dense table: one row per document, one column per bigram
ngram_df = pd.DataFrame(data=ngram_matrix.toarray(),
                        columns=ngram_vectorizer.get_feature_names_out())

# Report the matrix shape, then show the table
print("N-grams shape:", ngram_matrix.shape)
ngram_df

# Dense Word Embeddings (Word2Vec)

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenized corpus.
# seed + workers=1 make the run repeatable: with several worker threads the OS
# thread scheduling changes the training order, so results differ between runs.
# (gensim notes that full determinism on Python 3 additionally requires a fixed
# PYTHONHASHSEED environment variable.)
word2vec_model = Word2Vec(tokenized_corpus, # The corpus to train the model on
                          vector_size=100, # The dimensionality of the vectors
                          window=5, # The window size for the context window
                          epochs=5, # The number of epochs to train the model
                          min_count=1, # The minimum number of times a word must appear in the corpus to be included in the model
                          seed=42, # Fixed seed for the random number generator
                          workers=1) # Single training thread, required for reproducible vectors

# Keep a handle on the trained word vectors (KeyedVectors)
word_embeddings = word2vec_model.wv

# Print the KeyedVectors summary (not the vectors themselves)
print(word_embeddings)

In [None]:
# Report the embedding dimensionality (the vector_size the model was trained with)
print("\nVector dimensionality:", word_embeddings.vector_size)

In [None]:
# Look up and print the dense vector of a single vocabulary word
my_word = "words"
print(f"\nVector for '{my_word}':")
print(word_embeddings.get_vector(my_word))

In [None]:
# Rank the vocabulary by similarity to my_word (topn = vocabulary size)
print(f"\nWords similar to '{my_word}':")
vocabulary_size = len(word_embeddings.index_to_key)
similar_words = word_embeddings.most_similar(my_word, topn=vocabulary_size)
for rank, (neighbor, score) in enumerate(similar_words, start=1):
    print(f"{rank}. {neighbor}: {score:.4f}")

In [None]:
# Print the model's vocabulary keys in the order of the model's internal index
print(word_embeddings.index_to_key)

In [19]:
# Plotly heatmap
import plotly.express as px
def visualize_similarity_matrix(similarity_df, title="Word Similarity Matrix", axis_label="Words"):
    """Render a similarity table as an interactive Plotly heatmap.

    Args:
        similarity_df: DataFrame of similarity scores; its index labels the
            y axis and its columns label the x axis.
        title: Figure title. Defaults to the original "Word Similarity Matrix".
        axis_label: Label used for both axes (e.g. "Words" or "Documents").

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = px.imshow(
        similarity_df,
        labels=dict(x=axis_label, y=axis_label, color="Similarity"),
        x=similarity_df.columns,
        y=similarity_df.index,
        color_continuous_scale="Viridis",
    )
    # Tilt the x tick labels so long labels stay readable
    fig.update_layout(title=title, xaxis_tickangle=-45, width=800, height=800)
    fig.show()

In [None]:
# Pairwise word-similarity matrix, filled one pair at a time through
# KeyedVectors.similarity
import numpy as np
words = word_embeddings.index_to_key
vocab_size = len(words)
similarity_matrix = np.zeros((vocab_size, vocab_size))

for row, col in np.ndindex(vocab_size, vocab_size):
    # Identical words are skipped, so those entries stay at their initial 0
    if words[row] == words[col]:
        continue
    similarity_matrix[row, col] = word_embeddings.similarity(words[row], words[col])

# Label both axes with the vocabulary and plot the heatmap
similarity_df = pd.DataFrame(data=similarity_matrix, index=words, columns=words)
visualize_similarity_matrix(similarity_df)

In [21]:
def visualize_2d_plot(df, title='Visualization of Word Embeddings'):
    """Scatter-plot 2D embeddings with a text label next to each point.

    Args:
        df: DataFrame with columns 'C1' and 'C2' (the two plotted components)
            and 'doc' (the text shown beside each point).
        title: Figure title. Defaults to the original title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # `labels` must be a dict mapping column names to display names
    fig = px.scatter(
        df,
        x='C1',
        y='C2',
        text='doc',
        title=title,
        labels={'C1': 'Component 1', 'C2': 'Component 2'},
    )

    # Draw markers with their text label above each point
    fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.8), mode='markers+text')
    fig.update_layout(width=900, height=700, xaxis=dict(title='Component 1'), yaxis=dict(title='Component 2'))

    # Show the plot
    fig.show()

In [None]:
# PCA plot
from sklearn.decomposition import PCA

# Project the word vectors onto their first two principal components
pca = PCA(n_components=2)

# Fit on the word vectors, then project the same vectors
word_embeddings_2d = pca.fit(word_embeddings.vectors).transform(word_embeddings.vectors)

# Table of 2D coordinates, with the matching word attached to every row
pca_df = pd.DataFrame(data=word_embeddings_2d, columns=['C1', 'C2']).assign(doc=words)
visualize_2d_plot(pca_df)

In [None]:
# TSNE plot
# t-SNE tries to preserve local relationships, not the global structure. For a small number of points (e.g., ~20 words), t-SNE often:
# Overemphasizes tiny distances
# Distorts distances between points not in a neighborhood
# Gives unpredictable layouts that "feel random"
from sklearn.manifold import TSNE

# t-SNE requires perplexity < number of samples. The default is 30, so it is
# capped at n_samples - 1 for this small vocabulary.
n_samples = word_embeddings.vectors.shape[0]
perplexity = min(30, n_samples - 1)

# Build the estimator once, with the final settings (fixed random_state for repeatability)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca')

# Fit and project the word vectors to 2D
word_embeddings_2d = tsne.fit_transform(word_embeddings.vectors)

# Create a DataFrame for the 2D embeddings, with the matching word on every row
tsne_df = pd.DataFrame(
    word_embeddings_2d,
    columns=['C1', 'C2']
)
tsne_df['doc'] = words

visualize_2d_plot(tsne_df)

In [None]:
# Sentence embedding = mean of the Word2Vec vectors of the sentence's words.
# Rows start as zeros, so a sentence with no known word keeps an all-zero vector.
sentence_embeddings = np.zeros((len(tokenized_corpus), word_embeddings.vector_size))

for i, doc in enumerate(tokenized_corpus):
    # Keep only the words the model has a vector for
    known_vectors = [word_embeddings[word] for word in doc if word in word_embeddings]
    if known_vectors:
        # Accumulate in float64 to match the dtype of sentence_embeddings
        sentence_embeddings[i] = np.mean(known_vectors, axis=0, dtype=np.float64)

# One column per embedding dimension (vector_size columns, not one per word)
sentence_embeddings_df = pd.DataFrame(
    sentence_embeddings,
    columns=[f"dim_{i}" for i in range(word_embeddings.vector_size)]
)
sentence_embeddings_df

In [None]:
# Cosine-similarity matrix between the averaged (Word2Vec-based) sentence embeddings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# A single vectorized call computes every pairwise similarity at once
similarity_matrix = cosine_similarity(sentence_embeddings)
# Self-similarity is deliberately left at 0 on the diagonal
np.fill_diagonal(similarity_matrix, 0.0)

# Create a DataFrame for the similarity matrix, labelled doc_1 .. doc_N
doc_names = ["doc_" + str(i+1) for i in range(len(tokenized_corpus))]
similarity_df = pd.DataFrame(similarity_matrix, index=doc_names, columns=doc_names)
visualize_similarity_matrix(similarity_df)


In [None]:
# Visualize PCA for document (sentence) embeddings
pca = PCA(n_components=2)

# Fit PCA on the sentence embeddings, then project them to 2D
sentence_embeddings_2d = pca.fit(sentence_embeddings).transform(sentence_embeddings)

# Create a DataFrame for the 2D embeddings
pca_df = pd.DataFrame(
    sentence_embeddings_2d,
    columns=['C1', 'C2']
)
# 1-based labels (doc_1 .. doc_N), consistent with the similarity heatmaps
pca_df['doc'] = ["doc_" + str(i + 1) for i in range(len(tokenized_corpus))]
visualize_2d_plot(pca_df)

# Transformers

## Environmental Variables
We will need the following environment variable:
- `HF_TOKEN` is your Hugging Face token; you can generate one at https://huggingface.co/settings/tokens

## On Linux do:
- `nano ~/.bashrc`
- `export HF_TOKEN="..."`
- `source ~/.bashrc`
- `echo $HF_TOKEN`

In [2]:
import os

In [28]:
# Never hard-code the token in the notebook. Use the HF_TOKEN already exported in
# the environment; only when it is missing, prompt for it without echoing it.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [3]:
# Model cache directory: respect an HF_HOME that is already set, otherwise fall
# back to a portable per-user location instead of a machine-specific absolute path.
os.environ.setdefault("HF_HOME", os.path.join(os.path.expanduser("~"), ".cache", "huggingface"))

In [30]:
# https://huggingface.co/sentence-transformers
# !pip install -U sentence-transformers

In [31]:
# https://pytorch.org/get-started/locally/
# !pip3 install torch

In [None]:
from sentence_transformers import SentenceTransformer
import torch
# Pick the compute device: CUDA when at least one GPU is visible, otherwise the CPU
gpu_count = torch.cuda.device_count()
if gpu_count == 0:
    my_device = "cpu"
    print("You have no GPUs available. Running on CPU.")
else:
    my_device = "cuda"
    print(f"You have {gpu_count} GPUs available.")

In [33]:
# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# Load the sentence-embedding model on the selected device, authenticating with
# HF_TOKEN and caching the downloaded files under HF_HOME.
embeddings_model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
    device=my_device,
    cache_folder=os.environ["HF_HOME"],
    token=os.environ["HF_TOKEN"],
)

In [None]:
# Encode the corpus using the embeddings model.
# NOTE: despite the variable name, my_corpus is a list of sentences, so this
# holds one embedding per sentence rather than one per word.
word_embeddings_transformer = embeddings_model.encode(my_corpus)

# Print the shape of the resulting embeddings
print(word_embeddings_transformer.shape)

# Output the embeddings
word_embeddings_transformer

In [None]:
# Cosine similarity between every pair of transformer sentence embeddings,
# computed in a single vectorized call (cast to float64, the dtype used before)
similarity_matrix = cosine_similarity(word_embeddings_transformer).astype(np.float64)

# Self-similarity is deliberately left at 0 on the diagonal
np.fill_diagonal(similarity_matrix, 0.0)

# Create a DataFrame for the similarity matrix, labelled doc_1 .. doc_N
doc_names = ["doc_" + str(i+1) for i in range(len(tokenized_corpus))]
similarity_df = pd.DataFrame(similarity_matrix, index=doc_names, columns=doc_names)
visualize_similarity_matrix(similarity_df)

## Test Embeddings - related words

In [None]:
# Words to compare (includes a punctuation variant of "book")
word_list = ["book", "book!", "publication", "article"]

# Embed each entry with the sentence-transformer model
word_embeddings_transformer = embeddings_model.encode(word_list)

# All pairwise cosine similarities in one call
cosine_similarities = cosine_similarity(word_embeddings_transformer)

# Show the raw matrix
print("Cosine Similarity Matrix:")
print(cosine_similarities)

# Label rows and columns with the words, then plot the heatmap
similarity_df = pd.DataFrame(data=cosine_similarities, index=word_list, columns=word_list)
visualize_similarity_matrix(similarity_df)

## Calculate normalized mean values of embeddings

In [None]:
# Per-embedding mean of the absolute component values (one number per row)
mean_embeddings = np.abs(word_embeddings_transformer).mean(axis=1)
print("Normalized Mean values of embeddings:", mean_embeddings)

# Per-embedding standard deviation of the components
std_embeddings = word_embeddings_transformer.std(axis=1)
print("Standard Deviation of embeddings:", std_embeddings)

# Per-embedding vector norm
norm_embeddings = np.linalg.norm(word_embeddings_transformer, axis=1)
print("Norm of embeddings:", norm_embeddings)

## Generate random vectors with the same mean and std

In [None]:
# Draw Gaussian vectors with the same overall mean, standard deviation and shape
# as the word embeddings. A seeded generator keeps the comparison reproducible
# across "Restart & Run All".
rng = np.random.default_rng(42)
random_vectors = rng.normal(loc=np.mean(word_embeddings_transformer),
                            scale=np.std(word_embeddings_transformer),
                            size=word_embeddings_transformer.shape)

# Per-vector mean of the absolute component values
mean_random_vectors = np.mean(np.abs(random_vectors), axis=1)
print("Normalized Mean values of random vectors:", mean_random_vectors)

# Per-vector standard deviation of the components
std_random_vectors = np.std(random_vectors, axis=1)
print("Standard Deviation of random vectors:", std_random_vectors)

# Per-vector norm
norm_random_vectors = np.linalg.norm(random_vectors, axis=1)
print("Norm of random vectors:", norm_random_vectors)

In [None]:
# Pairwise cosine similarity between the random vectors
print("Cosine Similarity Matrix random vectors:")
cosine_similarities = cosine_similarity(random_vectors)

# Display the cosine similarity matrix
print(cosine_similarities)

# Derive the labels from the number of vectors, so the cell keeps working when
# the number of compared words changes
vector_labels = [f"Random Vector {i + 1}" for i in range(len(random_vectors))]
similarity_df = pd.DataFrame(
    cosine_similarities,
    index=vector_labels,
    columns=vector_labels
)

# Visualize the similarity matrix
visualize_similarity_matrix(similarity_df)


## car ~ vehicle + motorcycle - bike

In [None]:
# Words taking part in the analogy: car ~ vehicle + motorcycle - bike
sentences = ["car", "vehicle", "motorcycle", "bike"]

# One embedding per word, in the same order as `sentences`
embeddings = embeddings_model.encode(sentences)

# Compare "car" (row 0, kept 2D via slicing) with vehicle + motorcycle - bike
similarity_score = cosine_similarity(
    embeddings[:1],
    (embeddings[1] + embeddings[2] - embeddings[3])[None, :]
)[0, 0]

# Print the similarity score
print(similarity_score)

## Greece ~ Athens + Italy - Rome

In [None]:
# Words taking part in the analogy: Greece ~ Athens + Italy - Rome
sentences = ["Greece", "Athens", "Italy", "Rome"]

# One embedding per word, in the same order as `sentences`
embeddings = embeddings_model.encode(sentences)

# Compare "Greece" (row 0, kept 2D via slicing) with Athens + Italy - Rome
similarity_score = cosine_similarity(
    embeddings[:1],
    (embeddings[1] + embeddings[2] - embeddings[3])[None, :]
)[0, 0]
print(similarity_score)

So embeddings work!

## Sentence embeddings

In [None]:
# Two groups of three paraphrase-like sentences; sentences within a group are
# expected to score higher with each other than with the other group.
my_sentences = [
    # Group 1: data preprocessing
    "The data is preprocessed to remove noise and outliers.",
    "Noise and outliers are eliminated during data preprocessing.",
    "Preprocessing cleans the data by filtering out noise and irregularities.",

    # Group 2: capital cities
    "Paris is the capital of France.",
    "Athens is the capital of Greece.",
    "Rome is the capital of Italy."
]

# Embed every sentence and compute all pairwise cosine similarities
my_embeddings = embeddings_model.encode(my_sentences)
similarity_matrix = cosine_similarity(my_embeddings)
print(similarity_matrix)

# Label rows and columns with the sentences themselves and plot the heatmap
similarity_df = pd.DataFrame(data=similarity_matrix, index=my_sentences, columns=my_sentences)
visualize_similarity_matrix(similarity_df)

## Tokenizers

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer for the selected model, authenticating with HF_TOKEN and
# caching the downloaded files under HF_HOME.
my_model = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=my_model,
    cache_dir=os.environ["HF_HOME"],
    token=os.environ["HF_TOKEN"],
)

https://huggingface.co/learn/llm-course/en/chapter6/5

In LLaMA and similar Byte-Pair Encoding (BPE) based models:

> **tokens ≠ words** (exactly),

> **tokens ≈ pieces of words + punctuation + space markers**

This helps the model handle any language efficiently with a smaller vocabulary.

**Example of subwords**

Take the word:
`unbelievable`

A tokenizer might split it like this:

```
['un', 'believ', 'able']
```

* `"un"` → a common prefix
* `"believ"` → root of "believe", "believer", etc.
* `"able"` → a common suffix

This way, even if `"unbelievable"` was never seen during training, the model knows the meaning from its parts.

---
In the following example:

words: "Hello", "world", "Let", "tokenize", "this", "text"

punctuation: ",", "!", ".", "'s"

space indicators: the Ġ marks the start of a new word with a space. The Ġ symbol is not a space itself, but it indicates that a space precedes the token. This is a convention used in the LLaMA tokenizer (and some others like RoBERTa).

Hence `','` and `'Ġ,'` **are different tokens** in LLaMA-style or BPE-style tokenizers.

In [None]:
# Split a sample sentence into the tokenizer's tokens and print them as a list of strings
tokens = tokenizer.tokenize("Hello, world! Let's tokenize this text.")
print(tokens)