# Word2Vec

In [None]:
# Code Example with Toy Dataset
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Toy dataset: two short sentences, just enough to exercise the API.
sentences = [
    "I love natural language processing.",
    "Word embeddings are powerful.",
]

# Lowercase each sentence, then split it into word/punctuation tokens.
tokenized_sentences = list(map(word_tokenize, map(str.lower, sentences)))

# Train Word2Vec; min_count=1 keeps every token, since each appears only once.
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# The trained vectors live on the model's `wv` attribute; look one up by token.
word_embeddings = model.wv
print(word_embeddings['natural'])

# GloVe

In [None]:
from glove import Corpus, Glove
from nltk.tokenize import word_tokenize

# Toy dataset: two short sentences, just enough to exercise the API.
sentences = [
    "Word embeddings capture semantic meanings.",
    "GloVe is an impactful word embedding model.",
]

# Lowercase each sentence, then split it into word/punctuation tokens.
tokenized_sentences = list(map(word_tokenize, map(str.lower, sentences)))

# Build the co-occurrence matrix from the tokenized sentences.
corpus = Corpus()
corpus.fit(tokenized_sentences, window=10)

# Fit GloVe on the co-occurrence matrix, then attach the word -> row-index
# dictionary so that vectors can be looked up by word.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

# Look up one embedding; the key is lowercase because the tokens were lowercased.
word = "glove"
embedding = glove.word_vectors[glove.dictionary[word]]
print(f"Embedding for '{word}': {embedding}")

# FastText

In [None]:
from gensim.models import FastText
from nltk.tokenize import word_tokenize

# Toy dataset: two short sentences, just enough to exercise the API.
sentences = [
    "FastText embeddings handle subword information.",
    "It is effective for various languages.",
]

# Lowercase each sentence, then split it into word/punctuation tokens.
tokenized_sentences = list(map(word_tokenize, map(str.lower, sentences)))

# Train FastText; min_count=1 keeps every token, since each appears only once.
model = FastText(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# The trained vectors live on the model's `wv` attribute; look one up by token.
word_embeddings = model.wv
print(word_embeddings['subword'])

# Comparison of Word Embedding Techniques

In [None]:
# Import necessary libraries
from gensim.models import Word2Vec
from gensim.models import FastText
from glove import Corpus, Glove
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Toy dataset
# Four short, untokenized sentences passed to each of the train_* functions below.
# NOTE: casing is mixed on purpose or by accident ("word2vec" vs "Word2Vec");
# train_word2vec and train_fasttext split on whitespace without lowercasing,
# so differently-cased spellings end up as distinct vocabulary entries.
toy_data = [
    "word embeddings are fascinating",
    "word2vec captures semantic relationships",
    "GloVe considers global context",
    "FastText extends Word2Vec with subword information"
]

# Function to train Word2Vec model
def train_word2vec(data):
    """Train a Word2Vec model on whitespace-tokenized sentences.

    Args:
        data: Iterable of sentence strings; each one is split on whitespace.

    Returns:
        The trained ``Word2Vec`` model (100-dimensional vectors, window of 5,
        no minimum-frequency cut-off).
    """
    tokenized = [text.split() for text in data]
    return Word2Vec(tokenized, vector_size=100, window=5, min_count=1, workers=4)

# Function to train GloVe model
def train_glove(data):
    """Train a GloVe model on whitespace-tokenized sentences.

    Args:
        data: Iterable of sentence strings; each one is split on whitespace,
            matching the tokenization used by the other train_* functions.

    Returns:
        The fitted ``Glove`` model with its word -> row-index dictionary
        attached, so vectors can be looked up by word.
    """
    # Corpus.fit expects an iterable of token lists. Passing raw strings would
    # make it iterate over characters and build character co-occurrences.
    tokenized = [sentence.split() for sentence in data]

    corpus = Corpus()
    corpus.fit(tokenized, window=5)

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    # Without the dictionary the model only holds an anonymous vector matrix;
    # attach the mapping so callers can resolve words to rows.
    glove.add_dictionary(corpus.dictionary)
    return glove

# Function to train FastText model
def train_fasttext(data):
    """Train a FastText model on whitespace-tokenized sentences.

    Args:
        data: Iterable of sentence strings; each one is split on whitespace.

    Returns:
        The trained ``FastText`` model (100-dimensional vectors, window of 5,
        no minimum-frequency cut-off).
    """
    tokenized = [text.split() for text in data]
    return FastText(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)

# Function to plot embeddings
def plot_embeddings(model, title):
    """Project a model's word vectors to 2-D with t-SNE and plot them.

    Accepts either a gensim-style model (vectors under ``model.wv``) or a
    GloVe-style model (``model.word_vectors`` plus a ``model.dictionary``
    mapping each word to its row index).

    Args:
        model: Trained embedding model of one of the two shapes above.
        title: Title shown on the figure.

    Raises:
        ValueError: If the model has no word dictionary, or fewer than two
            words (t-SNE cannot be fitted on a single point).
    """
    # Local imports: this function is the only user of these names.
    import inspect

    import numpy as np

    if hasattr(model, "wv"):
        labels = list(model.wv.index_to_key)
        vectors = np.asarray([model.wv[word] for word in labels])
    else:
        word_to_row = getattr(model, "dictionary", None)
        if not word_to_row:
            raise ValueError(
                "model has no word dictionary; for GloVe call "
                "add_dictionary() before plotting"
            )
        # Order labels by row index so labels and vectors stay aligned.
        labels = sorted(word_to_row, key=word_to_row.get)
        vectors = np.asarray([model.word_vectors[word_to_row[word]] for word in labels])

    n_samples = len(labels)
    if n_samples < 2:
        raise ValueError("need at least two words to compute a t-SNE projection")

    # scikit-learn requires perplexity < n_samples; a fixed 40 fails on small
    # vocabularies, so cap it while keeping 40 for larger ones.
    perplexity = min(40, n_samples - 1)

    # Newer scikit-learn renamed `n_iter` to `max_iter`; use whichever exists.
    tsne_params = inspect.signature(TSNE).parameters
    iter_kwarg = "max_iter" if "max_iter" in tsne_params else "n_iter"

    tsne_model = TSNE(
        perplexity=perplexity,
        n_components=2,
        init='pca',
        random_state=23,
        **{iter_kwarg: 2500},
    )
    new_values = tsne_model.fit_transform(vectors)

    plt.figure(figsize=(10, 8))
    # One scatter call per point so each word gets its own colour from the cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.title(title)
    plt.show()

# Train one model per technique on the shared toy dataset.
word2vec_model = train_word2vec(toy_data)
glove_model = train_glove(toy_data)
fasttext_model = train_fasttext(toy_data)

# Draw one t-SNE figure per trained model, in the same order as training.
for _trained_model, _figure_title in (
    (word2vec_model, 'Word2Vec Embeddings'),
    (glove_model, 'GloVe Embeddings'),
    (fasttext_model, 'FastText Embeddings'),
):
    plot_embeddings(_trained_model, _figure_title)