# Word2Vec

In [None]:
import gensim.downloader

#Load the pre-trained GloVe vectors ('glove-twitter-25') through gensim's downloader API.
#`model` is reused by the lookup and similarity cells that follow.
model = gensim.downloader.load('glove-twitter-25')




In [None]:
#Print the first 30 words in the vocabulary of the pre-trained GloVe model,
#together with the index each word maps to.
#index_to_key is the vocabulary as a list ordered by index, so slicing it avoids
#copying every key of key_to_index just to keep the first 30.
first_30_pairs = {key: model.key_to_index[key] for key in model.index_to_key[:30]}
print(first_30_pairs)

{'<user>': 0, '.': 1, ':': 2, 'rt': 3, ',': 4, '<repeat>': 5, '<hashtag>': 6, '<number>': 7, '<url>': 8, '!': 9, 'i': 10, 'a': 11, '"': 12, 'the': 13, '?': 14, 'you': 15, 'to': 16, '(': 17, '<allcaps>': 18, '<elong>': 19, ')': 20, 'me': 21, 'de': 22, '<smile>': 23, '！': 24, 'que': 25, 'and': 26, '。': 27, '-': 28, 'my': 29}


In [None]:
#Number of distinct keys (words/tokens) in the loaded model's vocabulary
print(len(model.key_to_index))

1193514


In [None]:
#Look up the embedding vector of a word that is in the vocabulary
#(a single 1-D float32 array, not a matrix)
model["dog"]

array([-1.2420e+00, -3.5980e-01,  5.7285e-01,  3.6675e-01,  6.0021e-01,
       -1.8898e-01,  1.2729e+00, -3.6921e-01,  8.9080e-02,  4.0339e-01,
        2.5130e-01, -2.5548e-01, -3.9209e+00, -1.1100e+00, -2.1308e-01,
       -2.3846e-01,  9.5322e-01, -5.2750e-01, -7.8049e-04, -3.5771e-01,
        5.5582e-01,  7.7869e-01,  4.6874e-01, -7.7803e-01,  7.8378e-01],
      dtype=float32)

In [None]:
#Lookups are case-sensitive and this vocabulary is lowercase, so 'DOG' is not a key.
#The KeyError is the point of this cell: catch and display it so that
#"Restart & Run All" is not interrupted here.
try:
    vector = model["DOG"]
except KeyError as err:
    print(f"KeyError: {err}")
else:
    print(vector)

KeyError: "Key 'DOG' not present"

In [None]:
#Capitalised spelling is also missing from the lowercase vocabulary.
#Catch and display the KeyError so that "Restart & Run All" keeps going.
try:
    vector = model["Dog"]
except KeyError as err:
    print(f"KeyError: {err}")
else:
    print(vector)

KeyError: "Key 'Dog' not present"

In [None]:
#Nearest neighbours of 'beautiful' in the embedding space, as (word, similarity score) pairs
model.most_similar('beautiful')

[('gorgeous', 0.933364748954773),
 ('lovely', 0.9279096722602844),
 ('amazing', 0.9218392968177795),
 ('love', 0.9173232913017273),
 ('wonderful', 0.9150214791297913),
 ('loving', 0.9093379974365234),
 ('dream', 0.9086582660675049),
 ('pretty', 0.9071912169456482),
 ('perfect', 0.9066720008850098),
 ('little', 0.906454861164093)]

In [None]:
#Nearest neighbours of a place name, as (word, similarity score) pairs
model.most_similar('toronto')

[('vancouver', 0.9642484784126282),
 ('montreal', 0.955639660358429),
 ('chicago', 0.950315535068512),
 ('portland', 0.9401024580001831),
 ('sydney', 0.937186062335968),
 ('dallas', 0.9353677034378052),
 ('atlanta', 0.9349284172058105),
 ('phoenix', 0.9331689476966858),
 ('seattle', 0.9328917860984802),
 ('melbourne', 0.9321370124816895)]

In [None]:
#Try to look up a word that is not in the vocabulary.
#The pre-trained vectors have no entry for it, so the lookup raises KeyError;
#catch and display the error so that "Restart & Run All" is not interrupted.
try:
    vector = model["xcccdgggh"]
except KeyError as err:
    print(f"KeyError: {err}")
else:
    print(vector)

KeyError: "Key 'xcccdgggh' not present"

In [None]:
#analogy task: "japan is to japanese as australia is to ?"
#most_similar() adds the `positive` vectors and subtracts the `negative` ones,
#so this query is japanese - japan + australia. The word whose relation is being
#transferred away from (japan) goes in `negative`; the other two go in `positive`.
result = model.most_similar(positive=['australia', 'japanese'], negative=['japan'])
print(result)

[('poetry', 0.8085451722145081), ('korean', 0.7509593367576599), ('animation', 0.7495404481887817), ('bible', 0.7464496493339539), ('archive', 0.7370486259460449), ('naver', 0.7355666756629944), ('mystery', 0.7336708903312683), ('kindle', 0.7309550642967224), ('公式日本語サイト', 0.7287722229957581), ('generation', 0.7250692248344421)]


In [None]:
#Similarity score between the vectors of two in-vocabulary words
#(cosine similarity, per the gensim KeyedVectors documentation)
model.similarity('japan','japanese')

0.75782835

References:
https://radimrehurek.com/gensim/models/word2vec.html

#### Two things to note while using pre-trained models:


1.   Tokens/words in this vocabulary are always lowercased. If a word is not in the vocabulary, the model throws an exception (`KeyError`).
2.   So, in the text preprocessing, we must transform all words into lowercase. We should not apply stemming in the text preprocessing, because stemmed forms (e.g. "beauti" for "beautiful") may not be in the vocabulary.


# fastText

In [None]:
import numpy as np
import gensim.models.fasttext
from gensim.test.utils import datapath

#Resolve the path of the "crime-and-punishment.bin" FastText model via gensim's test-data helper
cap_path = datapath("crime-and-punishment.bin")
#We do not need to train Facebook's FastText model here: load_facebook_vectors()
#loads only the pre-trained word embeddings from the binary file.
wv = gensim.models.fasttext.load_facebook_vectors(cap_path)

In [None]:
#Look up the embedding vector of a word that is present in the vocabulary
wv['landlady']

array([-0.06020484, -0.00170379,  0.00868763,  0.13152218,  0.05103018],
      dtype=float32)

In [None]:
#Look up the embedding vector of a word that is NOT present in the vocabulary.
#Unlike the GloVe KeyedVectors lookup, this returns a vector instead of raising KeyError
#(FastText composes it from character n-grams, per the gensim FastText documentation).
wv['xbbhdhdhdhhd']

array([0.01352959, 0.02799635, 0.04122604, 0.09964952, 0.05035212],
      dtype=float32)

References:
- https://radimrehurek.com/gensim/models/fasttext.html
- Alternative approaches to using the pre-trained FastText vectors and training the model:
https://fasttext.cc/docs/en/crawl-vectors.html#adapt-the-dimension

# Doc2Vec

In [None]:
import gensim
# Imported explicitly so this cell does not depend on an earlier cell having
# already loaded the `gensim.downloader` submodule.
import gensim.downloader
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Load pre-trained GloVe word vectors
glove_model = gensim.downloader.load('glove-twitter-25')

In [None]:
# Sample document
sample_document = "This is a sample document for demonstration purposes."

# Tokenize the sample document.
# Splitting on whitespace keeps punctuation attached to its token (here 'purposes.'),
# so such a token may have no pre-trained GloVe vector.
tokenized_doc = sample_document.lower().split()

# Create a TaggedDocument to represent sample document
tagged_document = TaggedDocument(words=tokenized_doc, tags=[0])

# Create Doc2Vec model; the vector size is taken from the GloVe model so the
# pre-trained vectors can be copied in row by row.
model = Doc2Vec(vector_size=glove_model.vector_size, min_count=1, epochs=10)

# Build vocabulary (this also allocates model.wv.vectors, one row per vocabulary word)
model.build_vocab([tagged_document])

# Seed the word vectors with the pre-trained GloVe vectors.
# Each vector is written into the row that key_to_index assigns to its word, so the
# matrix keeps its (vocabulary size, vector size) shape and stays aligned with the
# vocabulary. Words without a GloVe vector keep their initial values.
for word, row in model.wv.key_to_index.items():
    if word in glove_model:
        model.wv.vectors[row] = glove_model[word]

# Train the model: build_vocab() only initialises the weights, and infer_vector()
# needs trained weights to produce a meaningful document vector.
# Note that training also fine-tunes the seeded word vectors.
model.train([tagged_document], total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Ask the Doc2Vec model for a vector representing the tokenized sample document
inferred_vector = model.infer_vector(tokenized_doc)

# Show a heading line followed by the vector itself
print("Inferred vector for the sample document using Doc2Vec:", inferred_vector, sep="\n")

Inferred vector for the sample document using Doc2Vec:
[ 0.01327759 -0.00549514 -0.01743878 -0.00696165 -0.00723004  0.00040637
 -0.00069506  0.00412781 -0.0095418  -0.01465905  0.00910899 -0.01506651
  0.00567768 -0.01289314  0.01413239 -0.01898264  0.00191661 -0.00668624
  0.00576241 -0.0168013   0.01876643 -0.00469655  0.01018941 -0.01996415
  0.01675976]


References:
https://radimrehurek.com/gensim/models/doc2vec.html