<a href="https://colab.research.google.com/github/mr-nudo/intelligent-tools/blob/master/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [1]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Lambda, Average
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import random

# Set random seed

In [2]:
# Single seed shared by every random number generator seeded below
SEED = 25

# Set environment variables
# NOTE: Python reads PYTHONHASHSEED once at interpreter start-up, so assigning
# it here does not change string hashing in the already-running process; it is
# only inherited by child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

# Set seed values for NumPy, TensorFlow and the standard-library RNG
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# TODO: Preprocess

In [3]:
# Preprocess the text
def preprocess(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens."""
    return text.lower().split()


# TODO: Build Vocabulary and training data

In [6]:
# Build vocabulary and generate training data
def build_and_prepare_data(words, window_size):
    """Build the vocabulary and the CBOW (context, target) training arrays.

    Args:
        words: Sequence of word tokens in corpus order.
        window_size: Number of words taken on each side of a target word.

    Returns:
        A tuple ``(vocab, contexts, targets)``: the sorted list of unique
        words, the array of context word indices (one row per target word,
        ``2 * window_size`` indices per row), and the one-hot encoded target
        words with ``len(vocab)`` classes.

    Raises:
        ValueError: If ``words`` is too short to yield one full context window.
    """
    # Sort the unique words: the iteration order of a set of strings can differ
    # between interpreter runs, which would make the word indices irreproducible.
    vocab = sorted(set(words))
    vocab_size = len(vocab)
    word_to_index = {word: index for index, word in enumerate(vocab)}

    # Only positions with a full window on both sides become targets
    contexts = []
    targets = []
    for i in range(window_size, len(words) - window_size):
        context = words[i - window_size:i] + words[i + 1:i + window_size + 1]
        # A context is a list of words, so it has to be mapped word by word
        contexts.append([word_to_index[word] for word in context])
        targets.append(word_to_index[words[i]])

    if not contexts:
        raise ValueError(
            f"Need more than {2 * window_size} words to build a context "
            f"window of size {window_size}, got {len(words)}"
        )

    # Prepare contexts and targets for training by padding and one-hot encoding
    max_context_len = max(len(context) for context in contexts)
    contexts = pad_sequences(contexts, maxlen=max_context_len, padding='post')
    targets = to_categorical(targets, num_classes=vocab_size)

    return vocab, contexts, targets

# TODO: Build CBOW model

In [7]:
# Define CBOW model function
def build_cbow_model(vocab_size, embed_size, window_size):
    """Build and compile a CBOW model that predicts a word from its context.

    Args:
        vocab_size: Number of distinct words; used as the size of the
            embedding table and of the softmax output.
        embed_size: Dimensionality of each word vector.
        window_size: Number of context words on each side of the target, so
            the model expects ``2 * window_size`` word indices per sample.

    Returns:
        The compiled ``tf.keras.Model`` (its summary is printed as a side
        effect).
    """
    # Each sample holds the indices of the words on both sides of the target
    inputs = Input(shape=(2 * window_size,))

    # Embedding layer to convert word indices into vectors
    embeddings = Embedding(vocab_size, embed_size)(inputs)

    # Average the context word vectors along the context axis. The `Average`
    # merge layer averages a list of separate tensors and cannot be applied to
    # a single tensor, so pooling over the sequence axis is used instead.
    average = tf.keras.layers.GlobalAveragePooling1D()(embeddings)

    # Output layer with softmax for predicting the target word
    outputs = Dense(vocab_size, activation='softmax')(average)

    # Targets are one-hot encoded, hence categorical cross-entropy
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Print model summary
    model.summary()

    return model

# TODO: Set file path

In [None]:
# TODO: set correct file path
# Placeholder path to the training corpus; the cell that reads the file opens it
# as UTF-8 text, so replace this with a real path before running.
file_path = 'path_to_small_corpus.txt'

# Running the helper functions

In [None]:
# Read the whole corpus into memory as a single string
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Lowercase the text and split it on whitespace into a flat list of tokens
words = preprocess(text)

# Print the total number of tokens (this is not the vocabulary size)
print(f"Number of words: {len(words)}")

# Model parameters
# window_size is the number of words taken on each side of a target word
window_size = 2

# Prepare dataset: vocabulary, context index rows and one-hot targets
vocab, contexts, targets = build_and_prepare_data(words, window_size)

vocab_size = len(vocab)
# Print vocabulary size (number of distinct words)
print(f"Vocabulary size: {vocab_size}")

# Print lengths of contexts and targets
print(f"Length of contexts array: {len(contexts)}")
print(f"Length of targets array: {len(targets)}")

# Split the data into training and validation sets

In [None]:
# Splitting the data
# Hold out 20% of the (context, target) pairs for validation; random_state is
# fixed so the same split is produced on every run.
contexts_train, contexts_val, targets_train, targets_val = train_test_split(contexts, targets, test_size=0.2, random_state=25)

# Dimensionality of the word vectors, passed to build_cbow_model
embed_size = 2

# Train the model

In [None]:
# Create and train the model
# build_cbow_model compiles the model and prints its summary before returning it
model = build_cbow_model(vocab_size, embed_size, window_size)
# Fit for 7 epochs with the held-out pairs supplied as validation data; the
# object returned by fit() is kept in `history` for later inspection
history = model.fit(contexts_train, targets_train, validation_data=(contexts_val, targets_val), epochs=7, verbose=1)


# TODO: Visualise the Training and Validation loss

In [None]:
# Plotting the training and validation loss
#TODO

# TODO: Extract the embeddings

In [None]:
# Extract embeddings
#TODO

# TODO: Find similar words

In [None]:
def cosine_similarity(vec_a, vec_b):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec_a: First vector (any 1-D array-like of numbers).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        The cosine of the angle between the vectors as a float in [-1, 1],
        or 0.0 when either vector has zero length (the angle is undefined).
    """
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)

    # Guard against division by zero for all-zero vectors
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0

    similarity = float(np.dot(a, b) / denominator)
    return similarity

def find_similar_words(query_word, vocab, embeddings, top_n=3):
    """Find the top_n words most similar to the query_word based on the embeddings.

    Similarity is the cosine similarity between embedding rows. The query word
    itself is excluded from the ranking.

    Args:
        query_word: Word to look up.
        vocab: Sequence of words; ``vocab[i]`` is taken to be the word whose
            vector is ``embeddings[i]``.
        embeddings: 2-D array-like with one row per vocabulary word.
        top_n: Number of most similar words to print.

    Returns:
        None. The ranking is printed; a message is printed instead when
        ``query_word`` is not in ``vocab``.
    """
    words = list(vocab)
    if query_word not in words:
        print(f"'{query_word}' is not in the vocabulary.")
        return

    vectors = np.asarray(embeddings, dtype=float)
    query_index = words.index(query_word)
    query_vec = vectors[query_index]

    # Cosine similarity of the query against every row at once; rows with a
    # zero norm get a similarity of 0 instead of dividing by zero.
    dots = vectors @ query_vec
    norms = np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec)
    scores = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    similarities = [
        (word, float(score))
        for index, (word, score) in enumerate(zip(words, scores))
        if index != query_index
    ]

    # Sort based on similarity scores
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Print top similar words
    print(f"Words most similar to '{query_word}':")
    for word, similarity in similarities[:top_n]:
        print(f"{word}: {similarity:.4f}")



In [None]:
# Words whose nearest neighbours are printed below
query_words = ['poland', 'thailand', 'morocco']

# NOTE(review): `embeddings` is not assigned in any visible cell; presumably the
# "Extract embeddings" step has to define it before this cell runs — confirm.
for query_word in query_words:
    find_similar_words(query_word, vocab, embeddings)
    print("\n")


# TODO: Visualise the embeddings

In [None]:
# Create a scatter plot of the embeddings
# TODO