# Define a Corpus

In [None]:
# Toy corpus: five short documents, each naming one text-representation
# technique followed by a few words describing it.
my_corpus = [
    "One-Hot Encoding Binary vectors",
    "Bag of Words Counts words",
    "TF-IDF Weights words by importance",
    "N-grams Captures words sequences",
    "Words Embeddings Dense vectors",
]
my_corpus

In [None]:
# Whitespace tokenization: lower-case every document, then split it on runs
# of whitespace. Hyphens are not separators here, so a term such as
# "one-hot" stays a single token.
tokenized_corpus = list(map(str.split, map(str.lower, my_corpus)))
tokenized_corpus

# One-Hot Encoding

In [None]:
# Vocabulary: every distinct token of the tokenized corpus, in sorted order
# so that column positions are stable from run to run.
all_words = sorted({token for tokens in tokenized_corpus for token in tokens})
print(f"Vocabulary size: {len(all_words)}")
print(all_words)

In [None]:
import numpy as np
import pandas as pd

# Row i of the identity matrix is the one-hot vector of the i-th vocabulary
# word: a single 1 in that word's own column, 0 everywhere else.
one_hot_word_vectors = np.identity(len(all_words))
one_hot_word_vectors_df = pd.DataFrame(data=one_hot_word_vectors, columns=all_words)
one_hot_word_vectors_df

In [None]:
# Document-level binary vectors: entry (i, j) is 1 when vocabulary word j
# occurs at least once in document i, and 0 otherwise (presence, not counts).
corpus_vectors = np.zeros((len(tokenized_corpus), len(all_words)))

# Map each vocabulary word to its column once, instead of scanning the whole
# vocabulary again for every token of every document.
word_to_column = {vocab_word: column for column, vocab_word in enumerate(all_words)}

for i, doc in enumerate(tokenized_corpus):
    for word in doc:
        j = word_to_column.get(word)
        # Tokens missing from the vocabulary are skipped.
        if j is not None:
            corpus_vectors[i, j] = 1

corpus_vectors_df = pd.DataFrame(corpus_vectors, columns=all_words)
corpus_vectors_df

# Bag of Words

In [None]:
# Create a Bag of Words representation 
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary from the raw documents and count how often each term
# occurs in each document.
# NOTE(review): CountVectorizer tokenizes the raw strings itself, so its
# columns need not match `all_words` from the whitespace tokenization -- confirm.
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(my_corpus)

# Dense table for display, with one column per learned term.
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())

print("Bag of Words representation:")
bow_df

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same pipeline as a count-based bag of words, but the fitted vectorizer
# produces TF-IDF weights instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(my_corpus)

# Dense table for display, with one column per learned term.
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("TF-IDF representation:")
tfidf_df

# N-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# `ngram_range` is a (min_n, max_n) tuple; (2, 2) asks for bigrams only.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))

# Learn the n-gram vocabulary and count each n-gram per document.
ngram_matrix = ngram_vectorizer.fit_transform(my_corpus)

# Show the n-grams that were extracted.
ngram_vectorizer.get_feature_names_out()

In [None]:
# Dense table for display, with one column per extracted n-gram.
ngram_df = pd.DataFrame(data=ngram_matrix.toarray(), columns=ngram_vectorizer.get_feature_names_out())

# Report the matrix shape before showing the table itself.
print("N-grams shape:", ngram_matrix.shape)
ngram_df

# Dense Word Embeddings (Word2Vec)

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the whitespace-tokenized corpus. min_count=1 keeps every
# token, including those that occur only once.
# NOTE(review): with workers > 1 the trained vectors may not be bit-identical
# between runs -- confirm whether reproducibility matters for this notebook.
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# `wv` holds the trained word vectors (a KeyedVectors object).
word_embeddings = word2vec_model.wv

# This prints the KeyedVectors object itself, not the individual vectors.
print(word_embeddings)

In [None]:
# Report the length of each word vector, read from the `vector_size`
# attribute of the trained embeddings.
print("\nVector dimensionality:", word_embeddings.vector_size)

In [None]:
# Look up and print the dense vector learned for the token 'embeddings'.
print("\nVector for 'embeddings':")
print(word_embeddings.get_vector('embeddings'))

In [None]:
# Query the three vocabulary entries closest to 'embeddings' and print each
# one with its similarity score rounded to four decimals.
print("\nWords similar to 'embeddings':")
similar_words = word_embeddings.most_similar('embeddings', topn=3)
for neighbour, score in similar_words:
    print(f"{neighbour}: {score:.4f}")

In [None]:
# Print the model's vocabulary as stored in `index_to_key`.
print(word_embeddings.index_to_key)

In [50]:
# Plotly heatmap
import plotly.express as px
def visualize_similarity_matrix(similarity_df, title="Word Similarity Matrix", axis_label="Words"):
    """Show a square similarity table as an interactive Plotly heatmap.

    Args:
        similarity_df: DataFrame of similarity values; its index supplies the
            y tick labels and its columns supply the x tick labels.
        title: Figure title. The default keeps the original wording.
        axis_label: Label for both axes. Pass e.g. "Documents" when the table
            compares documents rather than words.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = px.imshow(
        similarity_df,
        labels=dict(x=axis_label, y=axis_label, color="Similarity"),
        x=similarity_df.columns,
        y=similarity_df.index,
        color_continuous_scale="Viridis",
    )
    # Slant the x tick labels so long names do not overlap.
    fig.update_layout(title=title, xaxis_tickangle=-45, width=800, height=800)
    fig.show()

In [None]:
# Pairwise cosine similarity between all word vectors
import numpy as np
words = word_embeddings.index_to_key

# Normalise every vector to unit length; a single matrix product then yields
# all pairwise cosine similarities at once, replacing one `similarity()` call
# per word pair.
# NOTE(review): assumes row k of `word_embeddings.vectors` belongs to
# words[k] -- confirm.
word_vectors = np.asarray(word_embeddings.vectors, dtype=np.float64)
vector_norms = np.linalg.norm(word_vectors, axis=1, keepdims=True)
vector_norms[vector_norms == 0] = 1.0  # leave all-zero vectors as they are
unit_vectors = word_vectors / vector_norms
similarity_matrix = unit_vectors @ unit_vectors.T

# Keep the diagonal at 0, as before: a word's similarity with itself is left
# out of the matrix (presumably so that it does not dominate the colour scale).
np.fill_diagonal(similarity_matrix, 0.0)

# Create a DataFrame for the similarity matrix
similarity_df = pd.DataFrame(similarity_matrix, index=words, columns=words)
visualize_similarity_matrix(similarity_df)

In [74]:
def visualize_2d_plot(df, title='Visualization of Word Embeddings'):
    """Draw a labelled 2-D scatter plot of projected embeddings with Plotly.

    Args:
        df: DataFrame with columns 'C1' and 'C2' (the two plotted
            coordinates) and 'doc' (the text label drawn next to each point).
        title: Figure title. The default keeps the original wording; pass a
            different one when plotting something other than word embeddings.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # `labels` must map column names to display names; the positional list
    # used previously did not match any column.
    fig = px.scatter(
        df,
        x='C1',
        y='C2',
        text='doc',
        title=title,
        labels={'C1': 'Component 1', 'C2': 'Component 2'},
    )

    # Draw both the marker and its text label, with the label above the point.
    fig.update_traces(textposition='top center', marker=dict(size=10, opacity=0.8), mode='markers+text')
    fig.update_layout(width=900, height=700, xaxis=dict(title='Component 1'), yaxis=dict(title='Component 2'))

    fig.show()

In [None]:
# PCA plot
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the word vectors, then project those same vectors
# onto the two fitted components.
pca = PCA(n_components=2).fit(word_embeddings.vectors)
word_embeddings_2d = pca.transform(word_embeddings.vectors)

# One row per word: the two projected coordinates plus the word itself, which
# the plot uses as the text label.
pca_df = pd.DataFrame(word_embeddings_2d, columns=['C1', 'C2']).assign(doc=words)
visualize_2d_plot(pca_df)

In [None]:
# TSNE plot
# t-SNE tries to preserve local neighbourhoods, not the global structure.
# With only a small number of points (e.g., ~20 words), t-SNE often:
#   - overemphasizes tiny distances
#   - blows up or distorts distances between points that are not neighbours
#   - gives unpredictable layouts that "feel random"
from sklearn.manifold import TSNE

# Perplexity is roughly the number of samples considered to be in the
# neighbourhood of a point, and it must be smaller than the number of samples.
# The default is 30, so cap it for vocabularies with fewer than 31 words.
n_samples = word_embeddings.vectors.shape[0]
perplexity = min(30, n_samples - 1)  # Ensure perplexity is less than n_samples

# Build the estimator once, with the adjusted perplexity. The earlier extra
# TSNE(...) instance was overwritten before it was ever used.
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, init='pca')

# Fit and transform the word embeddings
word_embeddings_2d = tsne.fit_transform(word_embeddings.vectors)

# Create a DataFrame for the 2D embeddings
tsne_df = pd.DataFrame(
    word_embeddings_2d,
    columns=['C1', 'C2']
)
tsne_df['doc'] = words

visualize_2d_plot(tsne_df)

In [None]:
# Sentence embedding = plain average of the word vectors of a document's
# tokens. Tokens unknown to the model are ignored, and a document with no
# known token keeps an all-zero vector.
embedding_dim = word_embeddings.vector_size
sentence_embeddings = np.zeros((len(tokenized_corpus), embedding_dim))

for row, tokens in enumerate(tokenized_corpus):
    known_vectors = [word_embeddings[token] for token in tokens if token in word_embeddings]
    if not known_vectors:
        continue  # nothing to average; the row stays at zero

    # Accumulate in token order, then divide by the number of vectors used.
    running_sum = np.zeros(embedding_dim)
    for vector in known_vectors:
        running_sum += vector
    sentence_embeddings[row] = running_sum / len(known_vectors)

# Columns are the embedding dimensions (dim_0, dim_1, ...), not vocabulary
# words: each row has `vector_size` entries.
sentence_embeddings_df = pd.DataFrame(
    sentence_embeddings,
    columns=[f"dim_{k}" for k in range(embedding_dim)],
)
sentence_embeddings_df

In [None]:
# Similarity matrix for document embeddings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# One call computes every pairwise cosine similarity between the rows,
# instead of reshaping and comparing the embeddings one pair at a time.
similarity_matrix = cosine_similarity(sentence_embeddings).astype(float)

# Keep the diagonal at 0, as before: a document's similarity with itself is
# left out (presumably so that it does not dominate the colour scale).
np.fill_diagonal(similarity_matrix, 0.0)

# Create a DataFrame for the similarity matrix, labelling documents from 1
doc_names = [f"doc_{i + 1}" for i in range(len(tokenized_corpus))]
similarity_df = pd.DataFrame(similarity_matrix, index=doc_names, columns=doc_names)
visualize_similarity_matrix(similarity_df)


In [None]:
# Visualize PCA for document embeddings
pca = PCA(n_components=2)

# Fit PCA on the sentence embeddings and project them onto the two components
sentence_embeddings_2d = pca.fit_transform(sentence_embeddings)

# Create a DataFrame for the 2D embeddings
pca_df = pd.DataFrame(
    sentence_embeddings_2d,
    columns=['C1', 'C2']
)
# Number documents from 1 so the labels match the doc_1..doc_N names used on
# the document similarity heatmaps.
pca_df['doc'] = [f"doc_{i + 1}" for i in range(len(tokenized_corpus))]
visualize_2d_plot(pca_df)

# Transformers

In [None]:
from sentence_transformers import SentenceTransformer
import os
import torch
import os
# Pick the device for the embedding model: "cuda" when torch reports at least
# one CUDA device, otherwise "cpu".
gpu_count = torch.cuda.device_count()
my_device = "cuda" if gpu_count > 0 else "cpu"

if my_device == "cuda":
    print(f"You have {gpu_count} GPUs available.")
else:
    print("You have no GPUs available. Running on CPU.")

In [None]:
# Load the sentence-embedding model on the selected device.
# HF_TOKEN and HF_HOME are optional: `.get()` yields None when a variable is
# unset, where indexing `os.environ[...]` would raise KeyError.
# NOTE(review): assumes None is accepted for both `token` and `cache_folder`
# (library default behaviour) -- confirm against the installed version.
embeddings_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    token=os.environ.get("HF_TOKEN"),
    cache_folder=os.environ.get("HF_HOME"),
    device=my_device,
)

In [None]:
# Encode the raw documents with the sentence-transformer model, then show the
# shape of the result and the embeddings themselves.
# NOTE(review): despite the variable name, these look like one embedding per
# document rather than per word -- the printed shape should confirm.
word_embeddings_transformer = embeddings_model.encode(my_corpus)
print(word_embeddings_transformer.shape)
word_embeddings_transformer

In [None]:
# Pairwise cosine similarity between the transformer embeddings of the
# documents. One call handles every pair, instead of reshaping and comparing
# the embeddings one pair at a time.
similarity_matrix = cosine_similarity(word_embeddings_transformer).astype(float)

# Keep the diagonal at 0, as before: a document's similarity with itself is
# left out (presumably so that it does not dominate the colour scale).
np.fill_diagonal(similarity_matrix, 0.0)

# Create a DataFrame for the similarity matrix, labelling documents from 1
doc_names = [f"doc_{i + 1}" for i in range(len(tokenized_corpus))]
similarity_df = pd.DataFrame(similarity_matrix, index=doc_names, columns=doc_names)
visualize_similarity_matrix(similarity_df)