In [5]:
# ----------------------------------------------------------
# IMPORTS
# ----------------------------------------------------------
import numpy as np
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from keras.datasets import imdb
from sklearn.decomposition import PCA
import plotly.express as px


# ----------------------------------------------------------
# LOAD INBUILT IMDB DATASET (NO CSV, NO FOLDER NEEDED)
# ----------------------------------------------------------
# Reviews come back integer-encoded; the vocabulary is capped at the
# 10,000 most frequent words.
train_split, test_split = imdb.load_data(num_words=10000)
X_train, y_train = train_split
X_test, y_test = test_split

# Word → id mapping, needed to turn encoded reviews back into text
word_index = imdb.get_word_index()

# Inverted mapping: id → word
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

# Function to convert encoded review to plain text
def decode_review(encoded_review, index_from=3, lookup=None):
    """Convert an integer-encoded review back into a space-separated string.

    Args:
        encoded_review: Iterable of integer word ids.
        index_from: Offset subtracted from every id before the lookup.
            The default of 3 skips the ids IMDB reserves for special tokens.
        lookup: Optional id → word mapping. When omitted, the module-level
            ``reverse_word_index`` is used.

    Returns:
        The decoded words joined by single spaces. Ids that have no entry
        in the mapping are rendered as "?".
    """
    # Resolve the table at call time so the default keeps using the global.
    id_to_word = reverse_word_index if lookup is None else lookup
    return " ".join(id_to_word.get(i - index_from, "?") for i in encoded_review)


# ----------------------------------------------------------
# CREATE TEXT CORPUS FOR WORD2VEC
# ----------------------------------------------------------
# Pool the training and testing reviews (50,000 total) into one array
all_reviews = np.concatenate((X_train, X_test))

# One token list per review: decode the ids back to text, then let
# simple_preprocess turn that text into a list of cleaned tokens
corpus = [simple_preprocess(decode_review(encoded)) for encoded in all_reviews]


# ----------------------------------------------------------
# TRAIN WORD2VEC MODEL
# ----------------------------------------------------------
# Hyperparameters gathered in one place
w2v_params = {
    "vector_size": 100,  # embedding dimension
    "window": 5,         # context window
    "min_count": 5,      # ignore rare words
    "workers": 4,        # number of workers
}

# Create the untrained model, then build its vocabulary from the corpus
model = gensim.models.Word2Vec(**w2v_params)
model.build_vocab(corpus)

# Train with the example count and epoch count stored on the model itself
n_examples = model.corpus_count
n_epochs = model.epochs
model.train(corpus, total_examples=n_examples, epochs=n_epochs)

print("Word2Vec Training Completed!")


# ----------------------------------------------------------
# WORD SIMILARITY TESTS
# ----------------------------------------------------------
keyed_vectors = model.wv

# Words the model considers most similar to "good"
print("\nWords similar to 'good':")
neighbours = keyed_vectors.most_similar("good")
print(neighbours)

# Ask the model which word does not belong with the others
print("\nOdd one out test:")
candidates = ["good", "great", "excellent", "king"]
print(keyed_vectors.doesnt_match(candidates))


# ----------------------------------------------------------
# EXTRACT WORD EMBEDDINGS
# ----------------------------------------------------------
word_vectors = model.wv

words = word_vectors.index_to_key               # vocabulary words
vectors = word_vectors.get_normed_vectors()     # normed embedding matrix

print("\nVocabulary Size:", len(words))
print("Embedding Matrix Shape:", vectors.shape)


# ----------------------------------------------------------
# PCA REDUCTION TO 3D FOR VISUALIZATION
# ----------------------------------------------------------
# Three components → one coordinate per axis of the 3D plot
target_dims = 3
pca = PCA(n_components=target_dims)

# Fit on the embedding matrix and project it in a single step
X_reduced = pca.fit_transform(vectors)

print("\nPCA Shape:", X_reduced.shape)


# ----------------------------------------------------------
# 3D VISUALIZATION USING PLOTLY
# ----------------------------------------------------------
# Plot only the 100 words at vocabulary positions 200-299
slice_start = 200
slice_end = 300
window = slice(slice_start, slice_end)

# Columns 0/1/2 of the reduced matrix are the three plot axes;
# each point is coloured by its own word
plot_options = {
    "x": 0,
    "y": 1,
    "z": 2,
    "color": words[window],
    "title": "3D Visualization of Word2Vec Embeddings (IMDB Dataset)",
}
fig = px.scatter_3d(X_reduced[window], **plot_options)

fig.show()

# ----------------------------------------------------------
# END OF FILE
# ----------------------------------------------------------


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Word2Vec Training Completed!

Words similar to 'good':
[('bad', 0.7934481501579285), ('great', 0.740196168422699), ('decent', 0.7232195734977722), ('cool', 0.6798044443130493), ('nice', 0.6542497277259827), ('funny', 0.6299184560775757), ('ok', 0.6217984557151794), ('fine', 0.6127150058746338), ('okay', 0.5938103795051575), ('weak', 0.5902507305145264)]

Odd one out test:
king

Vocabulary Size: 9621
Embedding Matrix Shape: (9621, 100)

PCA Shape: (9621, 3)
