# a. BASIC WORD EMBEDDINGS WITH TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: four short sentences to vectorize
corpus = [
    "Deep learning is fun",
    "Word embeddings can be learned",
    "TF-IDF captures word importance",
    "Embeddings represent words as vectors",
]

# Build the vectorizer, then fit it on the corpus and encode every sentence in one step
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Vocabulary terms extracted from the corpus
features = vectorizer.get_feature_names_out()

# Densify the sparse result so the whole matrix can be printed
tfidf_matrix = X.toarray()

# Report the vocabulary first, then the per-sentence TF-IDF weights
print("TF-IDF Feature Names:", features, sep="\n")
print("\nTF-IDF Matrix:", tfidf_matrix, sep="\n")

TF-IDF Feature Names:
['as' 'be' 'can' 'captures' 'deep' 'embeddings' 'fun' 'idf' 'importance'
 'is' 'learned' 'learning' 'represent' 'tf' 'vectors' 'word' 'words']

TF-IDF Matrix:
[[0.         0.         0.         0.         0.5        0.
  0.5        0.         0.         0.5        0.         0.5
  0.         0.         0.         0.         0.        ]
 [0.         0.48546061 0.48546061 0.         0.         0.38274272
  0.         0.         0.         0.         0.48546061 0.
  0.         0.         0.         0.38274272 0.        ]
 [0.         0.         0.         0.46516193 0.         0.
  0.         0.46516193 0.46516193 0.         0.         0.
  0.         0.46516193 0.         0.36673901 0.        ]
 [0.46516193 0.         0.         0.         0.         0.36673901
  0.         0.         0.         0.         0.         0.
  0.46516193 0.         0.46516193 0.         0.46516193]]


# b. GENERATING WORD EMBEDDINGS USING WORD2VEC AND GLOVE

In [None]:
# 1. Uninstall numpy and gensim completely
!pip uninstall -y numpy gensim

# 2. Install a compatible numpy version first
!pip install numpy==1.24.3

# 3. Then install gensim fresh (no cache to avoid old builds)
!pip install --no-cache-dir gensim





Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting numpy==1.24.3
  Downloading numpy-1.24.3.tar.gz (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit 

In [None]:
import nltk
import re
import gensim
import numpy as np
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess
from sklearn.manifold import TSNE

# Sample text (the third sentence wraps onto a second line)
sample_text = """
Natural language processing enables computers to understand human language.
Word embeddings capture semantic relationships between words in a vector space.
Deep learning techniques such as Word2Vec and GloVe are widely used in NLP
applications.
This is a sample document for generating word embeddings.
Another example document is provided for demonstration purposes.
"""

# Tokenize one sentence at a time. Splitting after sentence-ending punctuation
# (instead of on newlines) keeps the sentence that wraps across two lines in one
# piece, so "applications" is not fed to the model as a one-word sentence.
raw_sentences = re.split(r"(?<=[.!?])\s+", sample_text.strip())
sentences = [simple_preprocess(sentence) for sentence in raw_sentences if sentence.strip()]
print(f"Sample tokenized sentences:\n{sentences}\n")

# Train Word2Vec model
print("Training Word2Vec model...")
w2v_model = gensim.models.Word2Vec(
    sentences,
    vector_size=50,
    window=3,
    min_count=1,  # keep every word: the corpus is tiny
    sg=1,         # skip-gram training
    # A fixed seed plus a single worker thread makes the learned vectors (and
    # the neighbours printed below) repeatable between runs. Full
    # reproducibility across interpreter launches additionally requires
    # setting PYTHONHASHSEED.
    seed=42,
    workers=1,
    epochs=100,
)

print("\nWord2Vec: Similar words to 'document'")
print(w2v_model.wv.most_similar("document", topn=5))

# Download pretrained GloVe embeddings (50D) using only the standard library
# (the third-party `wget` module is not installed by this notebook).
import os
import urllib.request
import zipfile

glove_input_file = "glove.6B.50d.txt"
glove_archive = "glove.6B.zip"

if not os.path.exists(glove_input_file):
    if not os.path.exists(glove_archive):
        print("\nDownloading GloVe embeddings...")
        url = "http://nlp.stanford.edu/data/glove.6B.zip"
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated archive behind under
        # the final file name.
        partial_archive = glove_archive + ".part"
        urllib.request.urlretrieve(url, partial_archive)
        os.replace(partial_archive, glove_archive)

    print("\nExtracting GloVe embeddings...")
    with zipfile.ZipFile(glove_archive, "r") as zip_ref:
        zip_ref.extract(glove_input_file)

# Load GloVe model.
# GloVe text files lack the "<vocab_size> <dimensions>" header line of the
# word2vec text format. With no_header=True gensim (4.0+) reads them directly,
# which replaces the deprecated glove2word2vec conversion and avoids writing a
# second copy of the vectors to disk.
glove_model = gensim.models.KeyedVectors.load_word2vec_format(
    glove_input_file, binary=False, no_header=True
)

print("\nGloVe: Similar words to 'document'")
if "document" in glove_model.key_to_index:
    print(glove_model.most_similar("document", topn=5))
else:
    print("Word 'document' not in GloVe vocab")

# Visualization function for embeddings using t-SNE
def plot_embeddings(words, vectors, title, perplexity=5):
    """Project word vectors to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        words: Labels to annotate, one per row of ``vectors``.
        vectors: 2-D array-like of embeddings, one row per word.
        title: Title shown above the figure.
        perplexity: Requested t-SNE perplexity. t-SNE needs a perplexity
            smaller than the number of samples, so the value is lowered
            automatically when fewer points are supplied.

    Raises:
        ValueError: If fewer than two vectors are supplied.
    """
    vectors = np.asarray(vectors)
    n_points = len(vectors)
    if n_points < 2:
        raise ValueError(
            f"plot_embeddings needs at least 2 vectors to run t-SNE, got {n_points}"
        )

    # Clamp so that small word lists (e.g. after filtering by vocabulary)
    # still plot instead of failing inside t-SNE.
    effective_perplexity = min(perplexity, n_points - 1)

    tsne = TSNE(n_components=2, random_state=42, perplexity=effective_perplexity)
    reduced = tsne.fit_transform(vectors)
    plt.figure(figsize=(6, 6))
    for i, word in enumerate(words):
        # One scatter call per word, each annotated with its label.
        plt.scatter(reduced[i, 0], reduced[i, 1])
        plt.annotate(word, xy=(reduced[i, 0], reduced[i, 1]), fontsize=9)
    plt.title(title)
    plt.show()

# Word2Vec plot: take the first ten entries of the trained model's vocabulary
common_words_w2v = list(w2v_model.wv.key_to_index)[:10]
w2v_vectors = np.array([w2v_model.wv[token] for token in common_words_w2v])
plot_embeddings(
    common_words_w2v,
    w2v_vectors,
    "Word2Vec Embeddings (t-SNE)",
)

# GloVe plot: same words, restricted to those the pretrained vocabulary contains
common_words_glove = [
    token for token in common_words_w2v if token in glove_model.key_to_index
]
glove_vectors = np.array([glove_model[token] for token in common_words_glove])
plot_embeddings(
    common_words_glove,
    glove_vectors,
    "Pretrained GloVe Embeddings (t-SNE)",
)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

# c. SENTENCE EMBEDDINGS WITH UNIVERSAL SENTENCE ENCODER


In [None]:
!pip install --upgrade numpy tensorflow tensorflow_hub --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m645.0/645.0 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.1.3 which is incompatible.
tensorflow-decision-forests 1.12.0 requires tensorflow==2.19.0, but you have tensorflow 2.19.1 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.1.3 which is incompatible.
tsfresh 0.21.1 requires scipy>=1.14.0; 

In [None]:
# Install TensorFlow and TensorFlow Hub if not already installed
!pip install --quiet tensorflow tensorflow_hub

# Import libraries
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

# Load the Universal Sentence Encoder
print("Loading Universal Sentence Encoder model...")
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
print("Model loaded!")

# Sample input sentences
sentences = [
    "This is a sentence.",
    "Another example sentence.",
    "Machine learning is fascinating.",
    "I love natural language processing.",
    "The sky is blue today."
]

# Generate embeddings
embeddings = embed(sentences)

# Display results
print("\nSentence Embeddings Shape:", embeddings.shape)
for i, sentence in enumerate(sentences):
    print(f"\nSentence: {sentence}")
    print(f"Embedding vector (first 5 values): {embeddings[i][:5].numpy()}")


Loading Universal Sentence Encoder model...
Model loaded!

Sentence Embeddings Shape: (5, 512)

Sentence: This is a sentence.
Embedding vector (first 5 values): [ 0.02881765 -0.02020016  0.01069627  0.03850532 -0.09253702]

Sentence: Another example sentence.
Embedding vector (first 5 values): [ 0.03328447  0.01292921 -0.00019189  0.00639367 -0.06787535]

Sentence: Machine learning is fascinating.
Embedding vector (first 5 values): [ 0.0484621  -0.04914229 -0.06110889 -0.05120424 -0.02304145]

Sentence: I love natural language processing.
Embedding vector (first 5 values): [ 0.00407052 -0.03478802 -0.00435814  0.03229547  0.00223153]

Sentence: The sky is blue today.
Embedding vector (first 5 values): [-0.04175051 -0.02714938  0.04868532  0.04477606 -0.01378212]
