<a href="https://colab.research.google.com/github/manohargadde/wordembeddings/blob/main/word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **One Hot Encoding Example**

In [None]:
import numpy as np

# Toy corpus: three short English sentences used by the one-hot encoding cells.
corpus = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

In [None]:
# Vocabulary of the corpus: every whitespace-separated token, lowercased.
# Punctuation is not stripped, so a token such as "dog." keeps its full stop.
unique_words = {
    token.lower()
    for sentence in corpus
    for token in sentence.split()
}
print(unique_words)

In [None]:

# Map each unique word to an integer index.
# `unique_words` is a set of strings, whose iteration order can change between
# interpreter launches (string hashing is randomised). Sorting the vocabulary
# first makes the word -> index mapping identical on every fresh run.
word_to_index = {word: i for i, word in enumerate(sorted(unique_words))}
print(word_to_index)

In [None]:
# For every sentence, build the list of one-hot vectors of its tokens:
# a zero vector as long as the vocabulary with a single 1 at the token's index.
one_hot_vectors = []
for sentence in corpus:
    token_ids = [word_to_index[token.lower()] for token in sentence.split()]
    sentence_vectors = []
    for token_id in token_ids:
        vector = np.zeros(len(unique_words))
        vector[token_id] = 1
        sentence_vectors.append(vector)
    one_hot_vectors.append(sentence_vectors)

# Show the vectors of the first sentence, one per line.
print("One-hot encoded vectors for the first sentence:")
for vector in one_hot_vectors[0]:
    print(vector)

# **Bag of words Example**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Example documents to vectorise
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# CountVectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary and produce the document-term count matrix in one step
X = vectorizer.fit_transform(sentences)

# Dense counts as a DataFrame: one row per sentence, one column per learned term
df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

print(df)


# **Term Frequency**

In [None]:
# Term frequency without normalisation yields the same raw counts as the CountVectorizer example above (stored as floats). Example below.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Example documents to vectorise
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# use_idf=False switches off the IDF weighting and norm=None switches off
# vector normalisation, leaving plain term frequencies.
vectorizer = TfidfVectorizer(
    use_idf=False,
    norm=None,
)

# Learn the vocabulary and build the document-term matrix in one step
X = vectorizer.fit_transform(sentences)

# One row per sentence, one column per learned term
df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

print(df)


In [None]:
# Term Frequency with Normalisation

from collections import Counter
import pandas as pd

# Example documents whose normalised term frequencies are computed below
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# Tokenize and compute term frequency
def compute_term_frequency(sentences):
    """Compute the length-normalised term frequency of every sentence.

    Each sentence is lowercased and split on whitespace. Punctuation attached
    to the start or end of a token is stripped, so "dog." and "dog" are counted
    as the same term; tokens made only of punctuation are dropped.

    Args:
        sentences: Iterable of strings, one per document.

    Returns:
        A list with one dict per sentence, mapping each term to
        ``count / number_of_tokens_in_that_sentence``. A sentence without any
        token yields an empty dict.
    """
    from string import punctuation  # local import: keeps the function self-contained

    tf_data = []
    for sentence in sentences:
        stripped = (token.strip(punctuation) for token in sentence.lower().split())
        # Drop tokens that were pure punctuation and are now empty.
        words = [token for token in stripped if token]
        word_count = Counter(words)
        total_words = len(words)
        # No division happens when `words` is empty because `word_count` is empty too.
        tf = {word: count / total_words for word, count in word_count.items()}
        tf_data.append(tf)
    return tf_data

# Term frequencies of the sample sentences: one dict per sentence
tf_data = compute_term_frequency(sentences)

# One row per sentence, one column per term; a term missing from a sentence
# shows up as NaN after construction and is replaced by 0.
df = pd.DataFrame(tf_data)
df = df.fillna(0)

print(df)

# **TF-IDF**

In [None]:
# Example documents to vectorise
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# TfidfVectorizer with its default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and build the TF-IDF document-term matrix in one step
X = vectorizer.fit_transform(sentences)

# One row per sentence, one column per learned term
df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

print(df)

# **Word2Vec**

In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Sample data
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# Preprocess sentences: tokenization and lowercasing
processed_sentences = [simple_preprocess(sentence) for sentence in sentences]

# Train Word2Vec model.
# Per the gensim documentation, a reproducible run needs a fixed seed AND a
# single worker thread (multi-threaded training introduces ordering jitter);
# across interpreter launches PYTHONHASHSEED must be fixed as well.
model = Word2Vec(
    sentences=processed_sentences,
    vector_size=50,  # Number of dimensions for the word vectors.
    window=3,        # Maximum distance between the current and predicted word within a sentence.
    min_count=1,     # Ignores all words with a total frequency lower than this.
    sg=0,            # 0 = Continuous Bag of Words (CBOW); use sg=1 for Skip-gram.
    seed=1,          # gensim's default seed, made explicit
    workers=1,       # single thread for deterministic training
)

# Retrieve word vectors
# model.wv.index_to_key: list of words in the model's vocabulary.
# model.wv[word]: vector for a specific word.
words = list(model.wv.index_to_key)
word_vectors = {word: model.wv[word] for word in words}

# Print word vectors
for word, vector in word_vectors.items():
    print(f"Word: {word}")
    print(f"Vector: {vector}\n")


# **FastText**

In [None]:
from gensim.models import FastText
from gensim.utils import simple_preprocess

# Sample data
sentences = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
]

# Preprocess sentences: tokenization and lowercasing
processed_sentences = [simple_preprocess(sentence) for sentence in sentences]

# Train FastText model.
# Per the gensim documentation, a reproducible run needs a fixed seed AND a
# single worker thread (multi-threaded training introduces ordering jitter);
# across interpreter launches PYTHONHASHSEED must be fixed as well.
model = FastText(
    sentences=processed_sentences,
    vector_size=50,  # Number of dimensions for the word vectors.
    window=3,        # Maximum distance between the current and predicted word within a sentence.
    min_count=1,     # Ignores all words with a total frequency lower than this.
    sg=0,            # 0 = Continuous Bag of Words (CBOW); use sg=1 for Skip-gram.
    seed=1,          # gensim's default seed, made explicit
    workers=1,       # single thread for deterministic training
)

# Retrieve word vectors
# model.wv.index_to_key: list of words in the model's vocabulary.
# model.wv[word]: vector for a specific word.
words = list(model.wv.index_to_key)
word_vectors = {word: model.wv[word] for word in words}

# Print word vectors
for word, vector in word_vectors.items():
    print(f"Word: {word}")
    print(f"Vector: {vector}\n")


# **GloVe**

In [None]:
import gensim
import numpy as np

# Load pre-trained GloVe vectors
def load_glove_vectors(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is taken as
    the word and the remaining fields as the components of its vector.
    Blank lines are skipped.

    Args:
        glove_file: Path of the GloVe text file; it is opened as UTF-8.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to a float.
    """
    word_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. at the end of the file): nothing to parse.
                continue
            word, *components = values
            word_vectors[word] = np.array(components, dtype='float32')
    return word_vectors

# Example usage
# Pre-trained GloVe vectors can be downloaded from the official GloVe page:
# https://nlp.stanford.edu/projects/glove/
# For example, glove.6B.zip contains vectors of various dimensions (50d, 100d, 200d, 300d).
glove_file = 'glove.6B.50d.txt'  # Replace with the path to your GloVe file
word = 'cat'  # Word whose vector is printed below

try:
    word_vectors = load_glove_vectors(glove_file)
except FileNotFoundError:
    # No visible cell downloads the file, so report the problem with guidance
    # instead of aborting a "Run All" with a traceback.
    word_vectors = None
    print(f"GloVe file '{glove_file}' not found. "
          "Download it from https://nlp.stanford.edu/projects/glove/ and update glove_file.")

# Print vector for a specific word (only when the vectors could be loaded)
if word_vectors is not None:
    if word in word_vectors:
        print(f"Vector for '{word}':\n{word_vectors[word]}")
    else:
        print(f"Word '{word}' not found in GloVe vectors.")


# **BERT**

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Pre-trained tokenizer and model, both loaded from the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sentence to embed
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the text; return_tensors='pt' asks for PyTorch tensors,
# which is the input format the model call below receives.
inputs = tokenizer(text, return_tensors='pt')

# Forward pass inside torch.no_grad(), i.e. without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state holds one embedding per input token;
# its shape is (batch_size, sequence_length, hidden_size).
last_hidden_state = outputs.last_hidden_state
print(f"Shape of last_hidden_state: {last_hidden_state.shape}")

# Embedding at position 0 of the first (and only) sequence: the [CLS] token,
# which is often used for classification tasks.
cls_embedding = last_hidden_state[0, 0]
print(f"Embedding for [CLS] token: {cls_embedding}")

# Print the embedding of every token.
# convert_ids_to_tokens turns the token IDs back into readable token strings.
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for token, embedding in zip(tokens, last_hidden_state[0]):
    print(f"Token: {token}, Embedding: {embedding}")


# **ELMo**

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

# Fetch the pre-trained ELMo module from TensorFlow Hub
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Sentences to embed
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "The dog barked at the cat",
]


def get_elmo_embeddings(sentences):
    """Run the module's 'default' signature on ``sentences`` and return its 'elmo' output.

    Args:
        sentences: Strings to embed; they are wrapped in ``tf.constant`` before the call.

    Returns:
        The value stored under the ``'elmo'`` key of the signature's output.
    """
    default_signature = elmo.signatures['default']
    outputs = default_signature(tf.constant(sentences))
    return outputs['elmo']


# ELMo embeddings of the example sentences
embeddings = get_elmo_embeddings(sentences)

# One NumPy array per sentence
elmo_embeddings = list(map(np.array, embeddings.numpy()))

# Show the embeddings of the first sentence
print(f"ELMo embeddings for the first sentence:")
print(elmo_embeddings[0])


# **LSA**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Sample data
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The dog barked at the cat",
    "The cat and the dog played",
    "The quick brown fox",
]

# Create a TF-IDF vectorizer (English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Apply LSA (TruncatedSVD) to the TF-IDF matrix.
# SVD reduces the matrix to n_components latent dimensions ("topics").
# TruncatedSVD uses a randomized solver by default, so random_state is fixed
# to make the output reproducible, matching the LDA example's random_state=0.
lsa = TruncatedSVD(n_components=2, random_state=0)
X_lsa = lsa.fit_transform(X)

# Print the shape of the LSA matrix
print(f"Shape of LSA matrix: {X_lsa.shape}")

# Display the transformed documents
for i, doc in enumerate(X_lsa):
    print(f"Document {i}: {doc}")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Example documents for the Latent Dirichlet Allocation (LDA) demo
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The dog barked at the cat",
    "The cat and the dog played",
    "The quick brown fox",
]

# Raw term counts with English stop words removed
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Topic model: n_components is the number of topics; random_state is fixed to 0
lda = LatentDirichletAllocation(
    n_components=2,
    random_state=0,
)
X_lda = lda.fit_transform(X)

# Shape of the resulting document-topic matrix
print(f"Shape of LDA matrix: {X_lda.shape}")

# Topic distribution of each document, one per line
for i in range(len(X_lda)):
    doc = X_lda[i]
    print(f"Document {i}: {doc}")
