<a href="https://colab.research.google.com/github/manohargadde/wordembeddings/blob/main/Word_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **One Hot Encoding Example**

In [9]:
import numpy as np

# Three example sentences used as the text corpus for the one-hot demo
corpus = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

In [6]:
# Gather the distinct lower-cased, whitespace-separated tokens of the corpus.
# Punctuation stays attached to a token because only str.split() is applied.
unique_words = {
    token.lower()
    for line in corpus
    for token in line.split()
}
print(unique_words)

{'seashells', 'seashore.', 'over', 'a', 'brown', 'lazy', 'picked', 'pickled', 'she', 'peppers.', 'peter', 'fox', 'sells', 'jumped', 'piper', 'by', 'dog.', 'peck', 'of', 'quick', 'the'}


In [8]:

# Give every distinct token an integer id: its position in the
# iteration order of the unique_words set
word_to_index = {token: position for position, token in enumerate(unique_words)}
print(word_to_index)

{'seashells': 0, 'seashore.': 1, 'over': 2, 'a': 3, 'brown': 4, 'lazy': 5, 'picked': 6, 'pickled': 7, 'she': 8, 'peppers.': 9, 'peter': 10, 'fox': 11, 'sells': 12, 'jumped': 13, 'piper': 14, 'by': 15, 'dog.': 16, 'peck': 17, 'of': 18, 'quick': 19, 'the': 20}


In [10]:
# Encode each sentence as a list of one-hot vectors, one per token:
# an all-zeros vector of vocabulary length with a 1 at the token's index
vocab_size = len(unique_words)
one_hot_vectors = []
for sentence in corpus:
    token_ids = [word_to_index[token.lower()] for token in sentence.split()]
    sentence_vectors = []
    for token_id in token_ids:
        vector = np.zeros(vocab_size)
        vector[token_id] = 1
        sentence_vectors.append(vector)
    one_hot_vectors.append(sentence_vectors)

# Show the encoding of the first sentence, one vector per line
print("One-hot encoded vectors for the first sentence:")
first_sentence_vectors = one_hot_vectors[0]
for vector in first_sentence_vectors:
    print(vector)

One-hot encoded vectors for the first sentence:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


# **Bag of words Example**

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Example corpus
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# Count-based vectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix in one call
X = vectorizer.fit_transform(sentences)

# Tabulate the counts (one row per sentence, one column per vocabulary term)
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(X.toarray(), columns=feature_names)

print(df)


   brown  by  dog  fox  jumped  lazy  of  over  peck  peppers  peter  picked  \
0      1   0    1    1       1     1   0     1     0        0      0       0   
1      0   1    0    0       0     0   0     0     0        0      0       0   
2      0   0    0    0       0     0   1     0     1        1      1       1   

   pickled  piper  quick  seashells  seashore  sells  she  the  
0        0      0      1          0         0      0    0    2  
1        0      0      0          1         1      1    1    1  
2        1      1      0          0         0      0    0    0  


# **Term Frequency**

In [12]:
# Term frequency without normalisation gives the same raw counts as the bag-of-words example above, as shown below.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Example corpus
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# TfidfVectorizer with IDF weighting switched off (use_idf=False) and
# no vector normalisation (norm=None)
vectorizer = TfidfVectorizer(use_idf=False, norm=None)

# Learn the vocabulary and build the document-term matrix in one call
X = vectorizer.fit_transform(sentences)

# Tabulate the result (one row per sentence, one column per vocabulary term)
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(X.toarray(), columns=feature_names)

print(df)


   brown   by  dog  fox  jumped  lazy   of  over  peck  peppers  peter  \
0    1.0  0.0  1.0  1.0     1.0   1.0  0.0   1.0   0.0      0.0    0.0   
1    0.0  1.0  0.0  0.0     0.0   0.0  0.0   0.0   0.0      0.0    0.0   
2    0.0  0.0  0.0  0.0     0.0   0.0  1.0   0.0   1.0      1.0    1.0   

   picked  pickled  piper  quick  seashells  seashore  sells  she  the  
0     0.0      0.0    0.0    1.0        0.0       0.0    0.0  0.0  2.0  
1     0.0      0.0    0.0    0.0        1.0       1.0    1.0  1.0  1.0  
2     1.0      1.0    1.0    0.0        0.0       0.0    0.0  0.0  0.0  


In [14]:
# Term Frequency with Normalisation

from collections import Counter
import pandas as pd

# Example corpus for the normalised term-frequency demo
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# Tokenize and compute term frequency
def compute_term_frequency(sentences):
    """Return one term-frequency mapping per sentence.

    Each sentence is lower-cased and split on whitespace; every distinct
    token is mapped to its count divided by the number of tokens in that
    sentence. Punctuation is not stripped, so it stays part of the token.

    Args:
        sentences: Iterable of strings.

    Returns:
        A list of dicts (token -> relative frequency), in input order.
    """
    def _relative_frequencies(sentence):
        # Counter keeps first-seen order, so the dict's key order follows the sentence
        tokens = sentence.lower().split()
        n_tokens = len(tokens)
        return {
            token: occurrences / n_tokens
            for token, occurrences in Counter(tokens).items()
        }

    return [_relative_frequencies(sentence) for sentence in sentences]

# Per-sentence term-frequency dictionaries
tf_data = compute_term_frequency(sentences)

# One row per sentence, one column per term; a term that does not occur
# in a sentence has no entry there, so the resulting NaN is replaced by 0
df = pd.DataFrame(tf_data)
df = df.fillna(0)

print(df)

        the     quick     brown       fox    jumped      over      lazy  \
0  0.222222  0.111111  0.111111  0.111111  0.111111  0.111111  0.111111   
1  0.166667  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

       dog.       she     sells  ...        by  seashore.  peter  piper  \
0  0.111111  0.000000  0.000000  ...  0.000000   0.000000  0.000  0.000   
1  0.000000  0.166667  0.166667  ...  0.166667   0.166667  0.000  0.000   
2  0.000000  0.000000  0.000000  ...  0.000000   0.000000  0.125  0.125   

   picked      a   peck     of  pickled  peppers.  
0   0.000  0.000  0.000  0.000    0.000     0.000  
1   0.000  0.000  0.000  0.000    0.000     0.000  
2   0.125  0.125  0.125  0.125    0.125     0.125  

[3 rows x 21 columns]


# **TF-IDF**

In [15]:
# Example corpus
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# TF-IDF vectorizer with its default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and build the TF-IDF document-term matrix in one call
X = vectorizer.fit_transform(sentences)

# Tabulate the weights (one row per sentence, one column per vocabulary term)
feature_names = vectorizer.get_feature_names_out()
df = pd.DataFrame(X.toarray(), columns=feature_names)

print(df)

      brown        by       dog       fox    jumped      lazy        of  \
0  0.327673  0.000000  0.327673  0.327673  0.327673  0.327673  0.000000   
1  0.000000  0.423394  0.000000  0.000000  0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.377964   

       over      peck   peppers     peter    picked   pickled     piper  \
0  0.327673  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2  0.000000  0.377964  0.377964  0.377964  0.377964  0.377964  0.377964   

      quick  seashells  seashore     sells       she       the  
0  0.327673   0.000000  0.000000  0.000000  0.000000  0.498408  
1  0.000000   0.423394  0.423394  0.423394  0.423394  0.322002  
2  0.000000   0.000000  0.000000  0.000000  0.000000  0.000000  


# **Word2Vec**

In [17]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Example corpus
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# Turn each sentence into a list of lower-cased tokens
processed_sentences = list(map(simple_preprocess, sentences))

# Train the Word2Vec model
#   vector_size=50 : number of dimensions of each word vector
#   window=3       : maximum distance between the current and predicted word within a sentence
#   min_count=1    : ignore all words with a total frequency lower than this
#   sg=0           : Continuous Bag of Words (CBOW); sg=1 selects Skip-gram
model = Word2Vec(
    sentences=processed_sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=0,
)

# model.wv.index_to_key lists the vocabulary; model.wv[word] is that word's vector
words = list(model.wv.index_to_key)
word_vectors = {}
for word in words:
    word_vectors[word] = model.wv[word]

# Print every word followed by its vector
for word, vector in word_vectors.items():
    print(f"Word: {word}")
    print(f"Vector: {vector}\n")


Word: the
Vector: [-1.07391237e-03  4.74860339e-04  1.02046737e-02  1.80163253e-02
 -1.86063349e-02 -1.42338276e-02  1.29194520e-02  1.79487541e-02
 -1.00291595e-02 -7.52891786e-03  1.47626596e-02 -3.06462660e-03
 -9.07106884e-03  1.31066740e-02 -9.71844885e-03 -3.63512477e-03
  5.75294252e-03  1.98640209e-03 -1.65729299e-02 -1.89008843e-02
  1.46255679e-02  1.01392614e-02  1.35178342e-02  1.53059256e-03
  1.26994634e-02 -6.81092450e-03 -1.89362012e-03  1.15399696e-02
 -1.50459446e-02 -7.87127949e-03 -1.50229465e-02 -1.85945653e-03
  1.90772917e-02 -1.46395573e-02 -4.66638431e-03 -3.87584954e-03
  1.61583237e-02 -1.18598817e-02  8.73824683e-05 -9.50761139e-03
 -1.92078277e-02  1.00147920e-02 -1.75207052e-02 -8.78320262e-03
 -6.50323927e-05 -5.92649216e-04 -1.53212436e-02  1.92268975e-02
  9.96416155e-03  1.84633825e-02]

Word: pickled
Vector: [-0.01631619  0.00899308 -0.00827755  0.00164582  0.01699622 -0.00892415
  0.0090384  -0.01356834 -0.00709397  0.01879432 -0.0031533   0.00064619

# **FastText**

In [20]:
from gensim.models import FastText
from gensim.utils import simple_preprocess

# Example corpus
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# Turn each sentence into a list of lower-cased tokens
processed_sentences = list(map(simple_preprocess, sentences))

# Train the FastText model
#   vector_size=50 : number of dimensions of each word vector
#   window=3       : maximum distance between the current and predicted word within a sentence
#   min_count=1    : ignore all words with a total frequency lower than this
#   sg=0           : Continuous Bag of Words (CBOW); sg=1 selects Skip-gram
model = FastText(
    sentences=processed_sentences,
    vector_size=50,
    window=3,
    min_count=1,
    sg=0,
)

# model.wv.index_to_key lists the vocabulary; model.wv[word] is that word's vector
words = list(model.wv.index_to_key)
word_vectors = {}
for word in words:
    word_vectors[word] = model.wv[word]

# Print every word followed by its vector
for word, vector in word_vectors.items():
    print(f"Word: {word}")
    print(f"Vector: {vector}\n")


Word: the
Vector: [-6.9096177e-03 -5.5508655e-03  6.4289295e-03  4.9997931e-03
 -8.6795595e-03 -3.4872763e-04  2.4827304e-03  7.5473827e-03
 -3.5573244e-03  5.9877522e-03  1.9651183e-03 -3.2177779e-03
  2.7886103e-03  6.3880091e-03 -8.0975322e-03  3.6860199e-03
  2.7584613e-03 -2.1460734e-03 -2.2405088e-03  2.3859986e-03
  1.2677411e-03 -4.4623478e-03  6.3239941e-03  1.2329237e-03
  8.0053443e-03 -2.7659233e-03 -2.7095792e-03  2.4130228e-03
 -2.0639093e-03  2.7691249e-03 -7.9606315e-03  2.1421486e-03
 -8.7056734e-04 -2.9409751e-03 -6.1133355e-03  7.8031662e-05
 -2.3767599e-03 -4.0906081e-03  2.1443120e-03 -3.2735192e-03
 -1.1223267e-03 -1.1460079e-03 -2.1725462e-03 -8.2285581e-03
  3.3331811e-03  1.6607394e-03 -2.7815073e-03  1.9828759e-03
 -3.2524294e-03  1.2967208e-03]

Word: pickled
Vector: [-1.2475079e-03  8.7106624e-04  2.0359880e-03  1.9649668e-04
  1.4441706e-03 -1.8998918e-03  1.9598207e-03  1.0647458e-03
 -3.0352382e-04  3.8523539e-03  1.6345460e-03 -2.8182867e-03
  1.2217167e

# **GloVe**

In [26]:
import gensim
import numpy as np

# Load pre-trained GloVe vectors
def load_glove_vectors(glove_file):
    """Load word vectors from a GloVe-style text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is taken
    as the token and the remaining fields as its vector components.

    Args:
        glove_file: Path to the text file; it is read as UTF-8.

    Returns:
        dict mapping each token to a float32 NumPy array of its components.
        If a token appears on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be converted to a float.
    """
    # Create a dictionary to hold word vectors
    word_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line yields no fields; skip it
            # instead of failing with IndexError on the token lookup
            if not values:
                continue
            word, *components = values
            word_vectors[word] = np.array(components, dtype='float32')
    return word_vectors

# Example usage
# Pre-trained GloVe vectors can be downloaded from the official GloVe site:
# https://nlp.stanford.edu/projects/glove/
# For example, glove.6B.zip contains vectors of various dimensions (50d, 100d, 200d, 300d).
glove_file = 'glove.6B.50d.txt'  # Replace with the path to your GloVe file
word_vectors = load_glove_vectors(glove_file)

# Look up one word and report its vector, or say that it is missing
word = 'cat'
if word not in word_vectors:
    print(f"Word '{word}' not found in GloVe vectors.")
else:
    print(f"Vector for '{word}':\n{word_vectors[word]}")


Vector for 'cat':
[ 0.45281  -0.50108  -0.53714  -0.015697  0.22191   0.54602  -0.67301
 -0.6891    0.63493  -0.19726   0.33685   0.7735    0.90094   0.38488
  0.38367   0.2657   -0.08057   0.61089  -1.2894   -0.22313  -0.61578
  0.21697   0.35614   0.44499   0.60885  -1.1633   -1.1579    0.36118
  0.10466  -0.78325   1.4352    0.18629  -0.26112   0.83275  -0.23123
  0.32481   0.14485  -0.44552   0.33497  -0.95946  -0.097479  0.48138
 -0.43352   0.69455   0.91043  -0.28173   0.41637  -1.2609    0.71278
  0.23782 ]


# **BERT**

In [28]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained tokenizer and model for 'bert-base-uncased'
# (a smaller BERT variant trained on uncased English text)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sentence to embed
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence and return PyTorch tensors ('pt') that can be
# passed straight to the model
inputs = tokenizer(text, return_tensors='pt')

# Forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state holds one embedding per input token; its shape is
# (batch_size, sequence_length, hidden_size)
last_hidden_state = outputs.last_hidden_state
print(f"Shape of last_hidden_state: {last_hidden_state.shape}")

# The first position of the sequence is the [CLS] token, whose embedding
# is often used for classification tasks
cls_embedding = last_hidden_state[0, 0]
print(f"Embedding for [CLS] token: {cls_embedding}")

# Convert the token ids back to token strings and print each token
# next to its embedding
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for position, token in enumerate(tokens):
    embedding = last_hidden_state[0, position]
    print(f"Token: {token}, Embedding: {embedding}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Shape of last_hidden_state: torch.Size([1, 12, 768])
Embedding for [CLS] token: tensor([-3.6081e-01,  2.2708e-01, -3.0297e-01, -1.8802e-01,  4.7514e-02,
        -3.6903e-01,  1.0990e-01,  3.4954e-01, -1.6736e-01, -2.0700e-01,
        -4.3579e-01,  1.5249e-01,  4.9324e-02,  3.0811e-01,  1.9345e-01,
        -2.8209e-01, -7.0557e-02,  2.7849e-01,  1.5721e-01, -4.2256e-01,
        -5.2560e-02, -5.4511e-01, -2.9903e-01,  5.5531e-03,  1.6547e-01,
        -9.7523e-02,  1.6439e-01,  4.9919e-02, -5.1170e-04,  1.4867e-01,
        -4.6599e-02,  2.6054e-01,  5.5201e-02, -2.2966e-01,  2.6151e-01,
        -1.4315e-01,  4.6660e-01, -2.4589e-01,  2.1002e-01,  3.8450e-01,
         8.8528e-02,  2.8228e-01, -1.1471e-01, -2.3116e-01,  4.2132e-01,
        -6.5659e-01, -3.1143e+00, -1.6426e-01,  5.4055e-02, -4.0852e-01,
        -4.4582e-01, -3.1103e-02, -1.1346e-01,  4.3026e-01, -1.9937e-01,
         1.8495e-01, -8.0492e-02,  7.6652e-01,  2.6629e-01,  7.7543e-04,
        -2.1172e-01,  1.9823e-01, -3.0526e-0

# **ELMo**

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

# Sentences to embed
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "The dog barked at the cat",
]

# Fetch the pre-trained ELMo module (version 3) from TensorFlow Hub
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Define a function to get ELMo embeddings
def get_elmo_embeddings(sentences):
    """Run the loaded ELMo module on `sentences` and return its 'elmo' output.

    Args:
        sentences: Sentences to embed; they are wrapped in a tf.constant
            before being passed to the module's 'default' signature.

    Returns:
        The value stored under the 'elmo' key of the signature's output.
    """
    default_signature = elmo.signatures['default']
    outputs = default_signature(tf.constant(sentences))
    return outputs['elmo']

# Embed the example sentences
embeddings = get_elmo_embeddings(sentences)

# Copy each sentence's embeddings into its own NumPy array
elmo_embeddings = []
for embedding in embeddings.numpy():
    elmo_embeddings.append(np.array(embedding))

# Show the embeddings of the first sentence
print("ELMo embeddings for the first sentence:")
print(elmo_embeddings[0])


ELMo embeddings for the first sentence:
[[-0.00646185  0.00602156 -0.35598338 ... -0.29573613  0.35007405
  -0.02453545]
 [ 0.00887057 -0.3313475  -0.11483051 ... -0.5315139   0.62646693
   0.4774104 ]
 [-0.33886546 -0.03814545 -0.10515901 ...  0.1823135   0.84576094
   0.51684606]
 ...
 [-0.16538395 -0.10446502 -0.62940085 ... -0.31956372  0.29986864
   0.27593526]
 [-0.34058526 -0.40331346 -0.5687754  ... -0.14789376  0.09926283
   0.82679445]
 [ 0.18897489 -0.45617762  0.13562037 ...  0.06314402 -0.18220185
   0.40788838]]


# **LSA**

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Sample data
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The dog barked at the cat",
    "The cat and the dog played",
    "The quick brown fox"
]

# Create a TF-IDF vectorizer that drops English stop words
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Apply LSA (TruncatedSVD) to the TF-IDF matrix.
# n_components is the number of latent dimensions (topics) to reduce to.
# random_state fixes the seed of the default randomized SVD solver so the
# printed coordinates can be reproduced on a fresh kernel.
lsa = TruncatedSVD(n_components=2, random_state=0)
X_lsa = lsa.fit_transform(X)

# Print the shape of the LSA matrix: (number of documents, n_components)
print(f"Shape of LSA matrix: {X_lsa.shape}")

# Display the reduced representation of each document
for i, doc in enumerate(X_lsa):
    print(f"Document {i}: {doc}")

Shape of LSA matrix: (4, 2)
Document 0: [ 0.84287038 -0.35555955]
Document 1: [0.48877143 0.71585494]
Document 2: [0.48877143 0.71585494]
Document 3: [ 0.74669212 -0.53581424]


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Example documents
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The dog barked at the cat",
    "The cat and the dog played",
    "The quick brown fox",
]

# Build the document-term count matrix, dropping English stop words
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Fit LDA for topic modeling; n_components is the number of topics and
# random_state=0 fixes the seed
lda = LatentDirichletAllocation(n_components=2, random_state=0)
X_lda = lda.fit_transform(X)

# Report the shape of the LDA matrix: (number of documents, number of topics)
print(f"Shape of LDA matrix: {X_lda.shape}")

# Show each document's topic distribution
for i, doc in enumerate(X_lda):
    print(f"Document {i}: {doc}")


Shape of LDA matrix: (4, 2)
Document 0: [0.08879206 0.91120794]
Document 1: [0.86793322 0.13206678]
Document 2: [0.86793322 0.13206678]
Document 3: [0.12847965 0.87152035]
