<a href="https://colab.research.google.com/github/mansvi77/Word-Similarity-using-Deep-Learning/blob/main/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import tensorflow as tf
import random
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Reshape, Dot
from tensorflow.keras.preprocessing.text import Tokenizer

# ==========================================
# ðŸ”§ THE FIX: Custom Skip-Gram Generator
# ==========================================
# The built-in Keras function is broken in new Python versions.
# We write our own simple version here to replace it.
def generate_skipgrams(sequence, vocabulary_size, window_size=2):
    pairs = []
    labels = []

    # Loop through every word in the sentence (this is our 'target')
    for i, target_word in enumerate(sequence):
        # Determine the range of context words (e.g., 2 words back, 2 words forward)
        start = max(0, i - window_size)
        end = min(len(sequence), i + window_size + 1)

        for j in range(start, end):
            if i == j: continue # Skip the target word itself

            context_word = sequence[j]

            # 1. POSITIVE PAIR (The words actually appeared together)
            pairs.append([target_word, context_word])
            labels.append(1) # Label 1 means "Yes, these are neighbors"

            # 2. NEGATIVE PAIR (Random Sampling)
            # We pick a random word from the dictionary to teach the model
            # what is NOT a neighbor.
            random_word = int(random.randrange(1, vocabulary_size))

            pairs.append([target_word, random_word])
            labels.append(0) # Label 0 means "No, this is random noise"

    return np.array(pairs), np.array(labels)

# ==========================================
# STEP 1: The Data (Our Tiny Library)
# ==========================================
corpus = [
    "the king is a royal man",
    "the queen is a royal woman",
    "royal men are kings",
    "royal women are queens",
    "the man is strong",
    "the woman is strong"
]

# ==========================================
# STEP 2: Preprocessing (Text -> Numbers)
# ==========================================
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

word2id = tokenizer.word_index
id2word = {v:k for k,v in word2id.items()}

vocab_size = len(word2id) + 1
embed_size = 10

print(f"Total unique words: {vocab_size}")
print(f"Word Map: {word2id}")

# ==========================================
# STEP 3: Generate Training Pairs (Using our FIX)
# ==========================================
sequences = tokenizer.texts_to_sequences(corpus)

targets = []
contexts = []
labels = []

window_size = 2

for seq in sequences:
    # We use our custom function 'generate_skipgrams' instead of the broken Keras one
    pairs, labs = generate_skipgrams(seq, vocabulary_size=vocab_size, window_size=window_size)

    for p, l in zip(pairs, labs):
        targets.append(p[0])
        contexts.append(p[1])
        labels.append(l)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(f"\nCreated {len(targets)} training pairs.")

# ==========================================
# STEP 4: Build the Neural Network
# ==========================================
input_target = Input((1,), name='target_word_input')
input_context = Input((1,), name='context_word_input')

embedding_layer = Embedding(vocab_size, embed_size, input_length=1, name='word_embedding')

target_vector = embedding_layer(input_target)
context_vector = embedding_layer(input_context)

# Dot Product checks similarity
dot_product = Dot(axes=2, name='similarity_check')([target_vector, context_vector])
dot_product = Reshape((1,), name='reshape')(dot_product)

# Sigmoid output (0 to 1)
output = Dense(1, activation='sigmoid', name='final_class')(dot_product)

model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')

# ==========================================
# STEP 5: Train
# ==========================================
print("\nTraining Model...")
model.fit(x=[targets, contexts], y=labels, batch_size=1, epochs=100, verbose=0)
print("Training Complete.")

# ==========================================
# STEP 6: Test Results
# ==========================================
learned_weights = model.get_layer('word_embedding').get_weights()[0]

def get_similarity(word1, word2):
    try:
        id1 = word2id[word1]
        id2 = word2id[word2]
        vec1 = learned_weights[id1]
        vec2 = learned_weights[id2]
        dot = np.dot(vec1, vec2)
        norm = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        return dot / norm
    except KeyError:
        return 0

print("\n--- TEST RESULTS ---")
print(f"Similarity 'king' vs 'man':    {get_similarity('king', 'man'):.4f}")
print(f"Similarity 'king' vs 'queen':  {get_similarity('king', 'queen'):.4f}")
print(f"Similarity 'king' vs 'strong': {get_similarity('king', 'strong'):.4f}")

Total unique words: 15
Word Map: {'the': 1, 'is': 2, 'royal': 3, 'a': 4, 'man': 5, 'woman': 6, 'are': 7, 'strong': 8, 'king': 9, 'queen': 10, 'men': 11, 'kings': 12, 'women': 13, 'queens': 14}

Created 152 training pairs.

Training Model...




Training Complete.

--- TEST RESULTS ---
Similarity 'king' vs 'man':    0.5066
Similarity 'king' vs 'queen':  0.6571
Similarity 'king' vs 'strong': -0.1238
