# ASSIGNMENT 7

**Train Language Embeddings: Use the provided Word2Vec notebook to train embeddings in a language other than English (your own language).**

Using indicnlp library, for natural language processing in Indian languages (Hindi in our case). We are using indicnlp as it provides better tools and resources for working with various Indian languages, including tokenization, trasliteration, and other NLP tasks.

### Import Libraries

In [2]:
import gensim
from gensim.models import Word2Vec
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
import re

## 1 Train Language Embeddings with Word2Vec (CBOW)

In [190]:
# Load and preprocess Hindi text data
with open('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindipoems.txt', 'r', encoding='utf-8') as file:
    hindi_text = file.read()

# Tokenize the text into sentences
sentences = sentence_tokenize.sentence_split(hindi_text, lang='hi')

# Tokenize each sentence into words
tokenized_sentences = [indic_tokenize.trivial_tokenize(sentence) for sentence in sentences]

# Train Word2Vec CBOW model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=0)  # sg=0 for CBOW

# Save the model
model.save("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_word2vec_cbow.model")

# Save the embeddings
word_vectors = model.wv
word_vectors.save("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_word2vec_cbow_vectors.kv")

In [191]:
# Example of using the model
print(model.wv.most_similar("प्यार"))  # Replace with any Hindi word

[('!', 0.37085211277008057), ('शिशु\nउनमें', 0.35214439034461975), ('का', 0.3512568473815918), ('मत', 0.34760239720344543), ('\nजीवन', 0.34136736392974854), ('था', 0.341358482837677), ('करता', 0.3359297513961792), ('\nमेरी', 0.33115971088409424), ('उदय', 0.3304550051689148), ('पर', 0.32900917530059814)]


## 2 Develop RNN-based Embeddings

In [69]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

**First, let's create default Keras embeddings:**

In [70]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

# Assuming you've already processed your text data and created tokenized_sentences

# Flatten the list of tokenized sentences
all_words = [word for sentence in tokenized_sentences for word in sentence]

# Create a tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_words)

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(all_words)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Create default Keras embedding model
embedding_dim = 100
keras_embedding_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim)
])

# Build the model by providing an input shape
keras_embedding_model.build((None, max_length))

# Compile the model
keras_embedding_model.compile('adam', 'mse')

# Print model summary
print(keras_embedding_model.summary())

# Get the default Keras embeddings
keras_embeddings = keras_embedding_model.layers[0].get_weights()[0]

# Save the Keras embeddings in txt format
with open("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_keras_embeddings5.txt", "w", encoding="utf-8") as f:
    for word, index in tokenizer.word_index.items():
        vector = keras_embeddings[index]
        vector_str = " ".join([str(v) for v in vector])
        f.write(f"{word} {vector_str}\n")

print("Keras embeddings saved in text format.")

None
Keras embeddings saved in text format.


### Load 

In [77]:
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = [float(x) for x in values[1:]]
            embeddings[word] = vector
    return embeddings

# Load the saved embeddings
loaded_embeddings = load_embeddings("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_keras_embeddings5.txt")

# Now you can use the loaded embeddings
print(f"Number of words in the embeddings: {len(loaded_embeddings)}")
print(f"Embedding dimension: {len(next(iter(loaded_embeddings.values())))}")

Number of words in the embeddings: 1601
Embedding dimension: 100


**Now, let's create RNN-based embeddings:**

In [71]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize and prepare input-output pairs for the RNN
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_sentences)
sequences = tokenizer.texts_to_sequences(tokenized_sentences)
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences to ensure consistent input length
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Initialize random embedding matrix (replace with actual embeddings if available)
embedding_dim = 256
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

# Define the RNN model with Bidirectional LSTM
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare input (X_train) and target (y_train) data for sequence prediction
X_train = padded_sequences[:, :-1]  # All words except the last one in each sequence
y_train = padded_sequences[:, 1:]   # All words except the first one in each sequence

# Ensure y_train has the same shape as model output
y_train = np.expand_dims(y_train, -1)

# Print model summary to verify parameters
model.summary()



In [72]:
# Train the model
history = model.fit(X_train, y_train, epochs=200, batch_size=64)

Epoch 1/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 627ms/step - accuracy: 0.4577 - loss: 7.0337
Epoch 2/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 594ms/step - accuracy: 0.8871 - loss: 4.5493
Epoch 3/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 612ms/step - accuracy: 0.8851 - loss: 2.7010
Epoch 4/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 605ms/step - accuracy: 0.8864 - loss: 1.4008
Epoch 5/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 603ms/step - accuracy: 0.8838 - loss: 1.1095
Epoch 6/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 581ms/step - accuracy: 0.8843 - loss: 1.1177
Epoch 7/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 625ms/step - accuracy: 0.8856 - loss: 1.1150
Epoch 8/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 590ms/step - accuracy: 0.8859 - loss: 1.1023
Epoch 9/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━

In [73]:
# Extract the trained embeddings
trained_embeddings = model.layers[0].get_weights()[0]

# Save the embeddings in txt format
with open("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_rnn_embeddings5.txt", "w", encoding="utf-8") as f:
    for word, index in tokenizer.word_index.items():
        vector = trained_embeddings[index]
        vector_str = " ".join([str(v) for v in vector])
        f.write(f"{word} {vector_str}\n")

print("RNN embeddings saved in text format.")

RNN embeddings saved in text format.


In [75]:
# Function to find similar words using embeddings
def find_similar_words(word, embeddings, word_index, top_n=5):
    if word not in word_index:
        return []
    
    word_vector = embeddings[word_index[word]]
    similarities = []
    
    for w, i in word_index.items():
        if w != word:
            similarity = np.dot(word_vector, embeddings[i]) / (np.linalg.norm(word_vector) * np.linalg.norm(embeddings[i]))
            similarities.append((w, similarity))
    
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]

# Demonstrate RNN embedding quality
test_word = "प्यार"  # Replace with a Hindi word from your vocabulary
similar_words = find_similar_words(test_word, rnn_embeddings, tokenizer.word_index)
print(f"Words similar to '{test_word}' using RNN embeddings:")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

Words similar to 'प्यार' using RNN embeddings:
लाना: 0.7911
आ‌ई: 0.7882
पांव: 0.7870
स्नेह: 0.7864
मूँदे: 0.7838


In [86]:
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:  # Skip empty lines
                continue
            values = line.split()
            if len(values) < 2:  # Check if there's at least a word and one vector component
                #print(f"Warning: Skipping line {line_num} due to insufficient data: {line}")
                continue
            try:
                word = values[0]
                vector = np.array([float(x) for x in values[1:]])
                embeddings[word] = vector
            except ValueError as e:
                print(f"Error on line {line_num}: {e}")
                print(f"Problematic line: {line}")
                continue
    return embeddings

# Load the saved embeddings
keras_embeddings = load_embeddings("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_keras_embeddings5.txt")
rnn_embeddings = load_embeddings("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_rnn_embeddings5.txt")

# Create word_index from the loaded embeddings
word_index = {word: i for i, word in enumerate(keras_embeddings.keys())}
index_to_word = {i: word for word, i in word_index.items()}

def find_similar_words(word, embeddings, top_n=5):
    if word not in embeddings:
        return []
    
    word_vector = embeddings[word]
    similarities = []
    
    for w, vec in embeddings.items():
        if w != word:
            similarity = 1 - cosine(word_vector, vec)
            similarities.append((w, similarity))
    
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]

# 1. Word similarity comparison
test_words = ["प्यार", "जीवन", "समय", "सुंदर", "दुनिया"]  # Replace with Hindi words from your vocabulary

print("Word Similarity Comparison:")
for word in test_words:
    print(f"\nSimilar words to '{word}':")
    print("Keras embeddings:")
    keras_similar = find_similar_words(word, keras_embeddings)
    for w, sim in keras_similar:
        print(f"  {w}: {sim:.4f}")
    
    print("RNN embeddings:")
    rnn_similar = find_similar_words(word, rnn_embeddings)
    for w, sim in rnn_similar:
        print(f"  {w}: {sim:.4f}")

Word Similarity Comparison:

Similar words to 'प्यार':
Keras embeddings:
  इन्द्रधनुष: 0.3738
  बनाना: 0.3259
  हमारी: 0.2849
  कवि: 0.2752
  कल: 0.2727
RNN embeddings:
  बजता: 0.7982
  .: 0.7964
  तरफ: 0.7921
  बिखराता: 0.7875
  उखड़: 0.7861

Similar words to 'जीवन':
Keras embeddings:
  सजीव: 0.3683
  अर्पण: 0.3386
  वितरण: 0.3153
  मंदिर: 0.3120
  भेद: 0.3018
RNN embeddings:
  सरित: 0.8102
  छूट: 0.8037
  लहर: 0.8033
  पक्‍के: 0.7998
  ’: 0.7995

Similar words to 'समय':
Keras embeddings:
  मुझमें: 0.2986
  लगा: 0.2959
  रात्रि: 0.2787
  रथ: 0.2729
  कलिकाएँ: 0.2720
RNN embeddings:
  गहरे: 0.7935
  उठा: 0.7915
  चुस्त: 0.7910
  छिपा: 0.7898
  बोली: 0.7897

Similar words to 'सुंदर':
Keras embeddings:
  नीले: 0.3294
  ज्योति: 0.3243
  उगले: 0.3232
  संतान: 0.3220
  नाचूँगा: 0.3100
RNN embeddings:
  मेहनत: 0.8117
  वक़्त: 0.8116
  फेनिल: 0.8084
  नींद: 0.8081
  लेकर: 0.8062

Similar words to 'दुनिया':
Keras embeddings:
  उन्हे: 0.3416
  जमुन: 0.2999
  इसे: 0.2940
  खड़ा: 0.2837
  निकलता:

# 3 Build a Chatbot

**3. Build a Chatbot: Create a chatbot that uses the trained embeddings and evaluate its performance against a similar chatbot that uses English embeddings. Experiment with Pre-trained Embeddings: Optionally, download and integrate pre-trained embeddings for your language and compare their impact on the chatbot’s performance versus the embeddings you trained.**

In [87]:
import numpy as np
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [88]:
import tensorflow as tf
tf.__version__

'2.18.0'

## Read, process, and tokenize data

In [89]:
import re
def clean_numbers(text):
    pattern = r"[\d-]"
    return re.sub(pattern, '', text)

In [90]:
import re

# Heal the text
text = ''
print( "Reading txt file...")
with open('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindipoems.txt',  'r', encoding='utf-8') as f:
    text = f.read()

# sentence delimiter processing
text = text.replace(",\n", " _eol_ ")
text = text.replace(",", " _comma_  ")
text = text.replace(":", " _comma_  ")
text = text.replace(";", " _comma_  ")

text = text.replace("?\n", ". ")
text = text.replace("!\n", ". ")
text = text.replace(".\n", ". ")
text = text.replace("?", ".")
text = text.replace("!", ".")

text = text.replace('"',"")

# i leave apostrophes in place, spawning separate words
#text = text.replace("’","")

# absorb tabs
text = text.replace("\t", "")
text = text.replace("  ", "")

# remove numbes
text = clean_numbers(text)

# absorb soace
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
text = _RE_COMBINE_WHITESPACE.sub(" ", text).strip()
print('done!')

Reading txt file...
done!


In [91]:
text = text.lower()
text = text.replace('i ', 'I ')

In [92]:
len(text)

27139

In [93]:
text[0:2000]

'poems by harivansh raI bachchan source<https _comma_//hindionlinejankari.com/harivanshraibachchanpoems/> & <https _comma_//hindikavita.com/hindipoetryharivanshraibachchan.php> •• अग्निपथ कविता •• वृक्ष हों भले खड़े _eol_ हों घने हों बड़े _eol_ एक पत्र छाँह भी _eol_ माँग मत _comma_ माँग मत _comma_ माँग मत _eol_ अग्निपथ अग्निपथ अग्निपथ। तू न थकेगा कभी _comma_ तू न रुकेगा कभी _eol_ तू न मुड़ेगा कभी _eol_ कर शपथ _comma_ कर शपथ _comma_ कर शपथ _eol_ अग्निपथ अग्निपथ अग्निपथ। यह महान दृश्य है _eol_ चल रहा मनुष्य है _eol_ अश्रु श्वेत रक्त से _eol_ लथपथ लथपथ लथपथ _eol_ अग्निपथ अग्निपथ अग्निपथ। ••• नीड़ का निर्माण ••• नीड़ का निर्माण फिरफिर _eol_ नेह का आह्णान फिरफिर। वह उठी आँधी कि नभ में छा गया सहसा अँधेरा _eol_ धूलि धूसर बादलों ने भूमि को इस भाँति घेरा _eol_ रातसा दिन हो गया _comma_ फिर रात आ\u200cई और काली _eol_ लग रहा था अब न होगा इस निशा का फिर सवेरा _eol_ रात के उत्पातभय से भीत जनजन _comma_ भीत कणकण किंतु प्राची से उषा की मोहिनी मुस्कान फिरफिर नीड़ का निर्माण फिरफिर _eol_ नेह का आह्णान फि

In [94]:
training_data = text.split('.')
training_data[0:8]

['poems by harivansh raI bachchan source<https _comma_//hindionlinejankari',
 'com/harivanshraibachchanpoems/> & <https _comma_//hindikavita',
 'com/hindipoetryharivanshraibachchan',
 'php> •• अग्निपथ कविता •• वृक्ष हों भले खड़े _eol_ हों घने हों बड़े _eol_ एक पत्र छाँह भी _eol_ माँग मत _comma_ माँग मत _comma_ माँग मत _eol_ अग्निपथ अग्निपथ अग्निपथ। तू न थकेगा कभी _comma_ तू न रुकेगा कभी _eol_ तू न मुड़ेगा कभी _eol_ कर शपथ _comma_ कर शपथ _comma_ कर शपथ _eol_ अग्निपथ अग्निपथ अग्निपथ। यह महान दृश्य है _eol_ चल रहा मनुष्य है _eol_ अश्रु श्वेत रक्त से _eol_ लथपथ लथपथ लथपथ _eol_ अग्निपथ अग्निपथ अग्निपथ। ••• नीड़ का निर्माण ••• नीड़ का निर्माण फिरफिर _eol_ नेह का आह्णान फिरफिर। वह उठी आँधी कि नभ में छा गया सहसा अँधेरा _eol_ धूलि धूसर बादलों ने भूमि को इस भाँति घेरा _eol_ रातसा दिन हो गया _comma_ फिर रात आ\u200cई और काली _eol_ लग रहा था अब न होगा इस निशा का फिर सवेरा _eol_ रात के उत्पातभय से भीत जनजन _comma_ भीत कणकण किंतु प्राची से उषा की मोहिनी मुस्कान फिरफिर नीड़ का निर्माण फिरफिर _eol_ नेह

In [95]:
for i in range(len(training_data)):
    training_data[i] = training_data[i].strip()
training_data[0:8]

['poems by harivansh raI bachchan source<https _comma_//hindionlinejankari',
 'com/harivanshraibachchanpoems/> & <https _comma_//hindikavita',
 'com/hindipoetryharivanshraibachchan',
 'php> •• अग्निपथ कविता •• वृक्ष हों भले खड़े _eol_ हों घने हों बड़े _eol_ एक पत्र छाँह भी _eol_ माँग मत _comma_ माँग मत _comma_ माँग मत _eol_ अग्निपथ अग्निपथ अग्निपथ। तू न थकेगा कभी _comma_ तू न रुकेगा कभी _eol_ तू न मुड़ेगा कभी _eol_ कर शपथ _comma_ कर शपथ _comma_ कर शपथ _eol_ अग्निपथ अग्निपथ अग्निपथ। यह महान दृश्य है _eol_ चल रहा मनुष्य है _eol_ अश्रु श्वेत रक्त से _eol_ लथपथ लथपथ लथपथ _eol_ अग्निपथ अग्निपथ अग्निपथ। ••• नीड़ का निर्माण ••• नीड़ का निर्माण फिरफिर _eol_ नेह का आह्णान फिरफिर। वह उठी आँधी कि नभ में छा गया सहसा अँधेरा _eol_ धूलि धूसर बादलों ने भूमि को इस भाँति घेरा _eol_ रातसा दिन हो गया _comma_ फिर रात आ\u200cई और काली _eol_ लग रहा था अब न होगा इस निशा का फिर सवेरा _eol_ रात के उत्पातभय से भीत जनजन _comma_ भीत कणकण किंतु प्राची से उषा की मोहिनी मुस्कान फिरफिर नीड़ का निर्माण फिरफिर _eol_ नेह

In [96]:
tokenizer = Tokenizer()

# Set the maximum sequence length for padding
max_sequence_length = 200

# Fit the tokenizer on the training data
tokenizer.fit_on_texts(training_data)
 
# Convert the training data into sequences of tokens
sequences = tokenizer.texts_to_sequences(training_data)
 
# Pad the sequences to have the same length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

In [97]:
print(sequences[0])

[596, 597, 598, 599, 600, 601, 355, 2, 602]


In [98]:
print(padded_sequences[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0 596 597 598 599 600 601 355
   2 602]


In [99]:
len(padded_sequences)

110

## Training Data

In [100]:
# Prepare the input and output data for training
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

# Get the total number of unique words in the training data
vocab_size = len(tokenizer.word_index) + 1

In [101]:
print(padded_sequences[0], len(padded_sequences[0]))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0 596 597 598 599 600 601 355
   2 602] 200


In [102]:
print(X[0], len(X[0]))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0 596 597 598 599 600 601 355
   2] 199


In [103]:
print(y[0])

602


## Complete training data

In [104]:
X = []
y = []
#for s in sequences:
print(sequences[0])
for s in [sequences[0]]:
    #seq_array = s.split()
    words_in_s = [s[0]]
    for w in s[1:]:
        X.append([v for v in words_in_s])
        y.append(w)
        words_in_s.append(w)

for i in range(len(X)):
    print(X[i], y[i])

[596, 597, 598, 599, 600, 601, 355, 2, 602]
[596] 597
[596, 597] 598
[596, 597, 598] 599
[596, 597, 598, 599] 600
[596, 597, 598, 599, 600] 601
[596, 597, 598, 599, 600, 601] 355
[596, 597, 598, 599, 600, 601, 355] 2
[596, 597, 598, 599, 600, 601, 355, 2] 602


In [105]:
junk = [str(i) for i in range(10)]
from tqdm import tqdm
for s in tqdm(junk):
    print(s)

100%|████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9788.34it/s]

0
1
2
3
4
5
6
7
8
9





In [106]:
from tqdm import tqdm

X = []
y = []
for s in tqdm(sequences):
    if 0 < len(s):
        words_in_s = [s[0]]
        for w in s[1:]:
            X.append([v for v in words_in_s])
            y.append(w)
            words_in_s.append(w)

len(X), len(y)

100%|██████████████████████████████████████████████████████████████████████████████| 110/110 [00:00<00:00, 1964.74it/s]


(5390, 5390)

In [107]:
print(X[100], y[100])

[606, 10, 110, 218, 10, 607, 134, 608, 609, 1, 134, 610, 134, 611, 1, 22, 612, 357, 24, 1, 219, 39, 2, 219, 39, 2, 219, 39, 1, 110, 110, 275, 27, 13, 613, 72, 2, 27, 13, 276, 72, 1, 27, 13, 614, 72, 1, 12, 277, 2, 12, 277, 2, 12, 277, 1, 110, 110, 275, 19, 358, 615, 3, 1, 73, 97, 616, 3, 1, 359, 617, 360, 7, 1, 278, 278, 278, 1, 110, 110, 275, 361, 161, 5, 135, 361, 161, 5] 135


In [108]:
print(X[101], y[101])

[606, 10, 110, 218, 10, 607, 134, 608, 609, 1, 134, 610, 134, 611, 1, 22, 612, 357, 24, 1, 219, 39, 2, 219, 39, 2, 219, 39, 1, 110, 110, 275, 27, 13, 613, 72, 2, 27, 13, 276, 72, 1, 27, 13, 614, 72, 1, 12, 277, 2, 12, 277, 2, 12, 277, 1, 110, 110, 275, 19, 358, 615, 3, 1, 73, 97, 616, 3, 1, 359, 617, 360, 7, 1, 278, 278, 278, 1, 110, 110, 275, 361, 161, 5, 135, 361, 161, 5, 135] 111


In [109]:
padded_X = pad_sequences(X, maxlen=max_sequence_length)
padded_X[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [110]:
y2 = np.array(y)
y2.shape

(5390,)

## Load embedding for both Keras Enbedding and RNN Embeddings

## 1.Keras

### 2.1 Load Embeddings

In [111]:
import os
import numpy as np

glove_dir = "D:/6106_Neural_Modeling_Method/Assignments/Assignment7/"

embeddings_index = {} #initialize dictionary
f = open(os.path.join(glove_dir, 'D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_keras_embeddings5.txt'), encoding='utf8')
try:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
except:
    print(line)
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 1601 word vectors.


In [112]:
vocab_size

1630

In [114]:
embedding_dim = 100

vocabulary_size = vocab_size
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < vocabulary_size:
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [115]:
embedding_matrix.shape

(1630, 100)

In [116]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.04580086,  0.01601617,  0.02020026, ...,  0.00669035,
         0.04210837,  0.0457218 ],
       [-0.02822907,  0.0343956 , -0.04777978, ...,  0.02365724,
         0.02021687, -0.04505454],
       [ 0.00803049, -0.04102447,  0.00462417, ...,  0.02595557,
        -0.00771595, -0.00576849]])

### 2.2 Build model architecture

In [117]:
num_tokens = vocab_size

In [118]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import tensorflow as tf

# Define your parameters (ensure these are correctly initialized)
num_tokens = 1630  # Example value for the vocabulary size
embedding_dim = 100  # Example dimension for the embedding
embedding_matrix = tf.random.normal((num_tokens, embedding_dim))  # Example random embedding matrix
vocab_size = 1630  # The number of unique tokens in your vocabulary
max_sequence_length = 100  # The input sequence length

# Build the model
model = Sequential()
model.add(Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    input_length=max_sequence_length,
    trainable=False
))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [119]:
# Display the model summary
model.build(input_shape=(None, max_sequence_length))  # Explicitly define the input shape
model.summary()

### 2.3 Train the model for Keras 

In [120]:
%%time
model.fit(padded_X, y2, epochs=1, verbose=1)

[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 109ms/step - accuracy: 0.0373 - loss: 7.0097
CPU times: total: 1min 7s
Wall time: 21.1 s


<keras.src.callbacks.history.History at 0x29e58081ba0>

In [121]:
model.fit(padded_X, y2, epochs=150, verbose=1)

Epoch 1/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 109ms/step - accuracy: 0.0738 - loss: 6.0947
Epoch 2/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 111ms/step - accuracy: 0.0787 - loss: 5.8254
Epoch 3/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 104ms/step - accuracy: 0.0936 - loss: 5.4349
Epoch 4/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 87ms/step - accuracy: 0.1258 - loss: 5.0081
Epoch 5/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - accuracy: 0.1843 - loss: 4.6115
Epoch 6/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - accuracy: 0.2271 - loss: 4.2153
Epoch 7/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 64ms/step - accuracy: 0.2690 - loss: 3.8271
Epoch 8/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 65ms/step - accuracy: 0.3196 - loss: 3.4358
Epoch 9/150


<keras.src.callbacks.history.History at 0x29e2c69eb60>

In [122]:
import json
 
# Save only the model architecture
model_json = model.to_json()
with open('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_architecture_keras.json', 'w') as json_file:
    json_file.write(model_json)
print("Model architecture saved successfully!")
 
# Save the model weights
model.save_weights('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_weights_keras.weights.h5')
print("Model weights saved successfully!")

Model architecture saved successfully!
Model weights saved successfully!


In [156]:
from tensorflow.keras.models import model_from_json
# Load the model architecture
with open('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_architecture_keras.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
print("Model architecture loaded successfully!")

# Assuming 'model' is your already defined model with the correct architecture
model.load_weights('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_weights_keras.weights.h5')
 
print("Weights loaded successfully!")

Model architecture loaded successfully!
Weights loaded successfully!


## Generate Text for Keras

In [158]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
# Function to generate text based on a prompt
def generate_text(model, tokenizer, prompt, max_sequence_length, next_words=20):
    generated_text = prompt
    for _ in range(next_words):
        # Tokenize and pad the current text
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
 
        # Predict the next word
        predicted = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted, axis=-1)[0]
 
        # Convert the predicted index to word
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
 
        # Add the predicted word to the generated text
        generated_text += " " + output_word
 
        # Stop if a period (end of sentence) is generated
        if output_word == '.':
            break
 
    # Replace 'comma' with ',' in the generated sentence
    generated_text = generated_text.replace(' comma ', ', ')
 
    return generated_text
 
# Set the maximum sequence length (should match your model's input configuration)
max_sequence_length = 100  # Adjust to match your model's configuration
 
# Define the prompt just once
prompt = "वृक्ष हों भले खड़े" # कर शपथ, कर शपथ, कर शपथ,
 
# Generate at least 5 different sentences for the prompt
print(f"\nGenerating 5 sentences for the prompt: '{prompt}'\n")
for i in range(5):
    generated_sentence = generate_text(model, tokenizer, prompt, max_sequence_length)
    print(f"Generated sentence {i + 1}: {generated_sentence}\n")


Generating 5 sentences for the prompt: 'वृक्ष हों भले खड़े'

Generated sentence 1: वृक्ष हों भले खड़े eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत eol

Generated sentence 2: वृक्ष हों भले खड़े eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत eol

Generated sentence 3: वृक्ष हों भले खड़े eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत eol

Generated sentence 4: वृक्ष हों भले खड़े eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत eol

Generated sentence 5: वृक्ष हों भले खड़े eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत eol



## 2.RNN 

### 2.1 Load Embeddings

In [142]:
import os
import numpy as np

glove_dir = "D:/6106_Neural_Modeling_Method/Assignments/Assignment7/"

embeddings_index = {}  # initialize dictionary
with open(os.path.join(glove_dir, 'hindi_rnn_embeddings5.txt'), encoding='utf8') as f:
    for line in f:
        parts = line.strip().split(' ', 1)  # Split only at the first space
        if len(parts) == 2:
            word, vector_str = parts
            try:
                vector = np.fromstring(vector_str, sep=' ', dtype='float32')
                embeddings_index[word] = vector
            except ValueError:
                print(f"Error processing line: {line[:50]}...")  # Print first 50 chars of problematic line

print('Found %s word vectors.' % len(embeddings_index))


Found 1562 word vectors.


In [143]:
vocab_size

1630

In [144]:
embedding_dim = 256

vocabulary_size = vocab_size
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < vocabulary_size:
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [145]:
embedding_matrix.shape

(1630, 256)

In [146]:
embedding_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29197177, 0.14049494, 0.89266354, ..., 0.09321079, 0.074166  ,
        0.14163157],
       [0.82780212, 0.64313257, 0.2325514 , ..., 0.95406705, 0.89009041,
        0.8139115 ],
       [0.21626054, 0.00721073, 0.74732375, ..., 0.30306888, 0.0416267 ,
        0.1544755 ]])

### 2.2 Build model architecture

In [147]:
num_tokens = vocab_size

In [148]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import tensorflow as tf

# Define your parameters (ensure these are correctly initialized)
num_tokens = 1630  # Example value for the vocabulary size
embedding_dim = 256  # Example dimension for the embedding
embedding_matrix = tf.random.normal((num_tokens, embedding_dim))  # Example random embedding matrix
vocab_size = 1630  # The number of unique tokens in your vocabulary
max_sequence_length = 100  # The input sequence length

# Build the model
model = Sequential()
model.add(Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    input_length=max_sequence_length,
    trainable=False
))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [149]:
# Display the model summary
model.build(input_shape=(None, max_sequence_length))  # Explicitly define the input shape
model.summary()

### 2.3 Train the model for RNN

In [150]:
%%time
model.fit(padded_X, y2, epochs=1, verbose=1)

[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.0446 - loss: 6.9850
CPU times: total: 58.9 s
Wall time: 13.9 s


<keras.src.callbacks.history.History at 0x29e59aa7610>

In [151]:
model.fit(padded_X, y2, epochs=150, verbose=1)

Epoch 1/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 75ms/step - accuracy: 0.0753 - loss: 5.9937
Epoch 2/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 73ms/step - accuracy: 0.1061 - loss: 5.4948
Epoch 3/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 81ms/step - accuracy: 0.1505 - loss: 4.9998
Epoch 4/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 74ms/step - accuracy: 0.2213 - loss: 4.4266
Epoch 5/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 77ms/step - accuracy: 0.2846 - loss: 3.9935
Epoch 6/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 72ms/step - accuracy: 0.3454 - loss: 3.4895
Epoch 7/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 72ms/step - accuracy: 0.3980 - loss: 3.0700
Epoch 8/150
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 76ms/step - accuracy: 0.4518 - loss: 2.7653
Epoch 9/150
[1m

<keras.src.callbacks.history.History at 0x29e811a2f80>

In [152]:
import json
 
# Save only the model architecture
model_json = model.to_json()
with open('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_architecture_RNN.json', 'w') as json_file:
    json_file.write(model_json)
print("Model architecture saved successfully!")
 
# Save the model weights
model.save_weights('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_weights_RNN.weights.h5')
print("Model weights saved successfully!")

Model architecture saved successfully!
Model weights saved successfully!


In [153]:
from tensorflow.keras.models import model_from_json
# Load the model architecture
with open('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_architecture_RNN.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
print("Model architecture loaded successfully!")

# Assuming 'model' is your already defined model with the correct architecture
model.load_weights('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_weights_RNN.weights.h5')
 
print("Weights loaded successfully!")

Model architecture loaded successfully!
Weights loaded successfully!


## Generate Text for RNN

In [155]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
# Function to generate text based on a prompt
def generate_text(model, tokenizer, prompt, max_sequence_length, next_words=20):
    generated_text = prompt
    for _ in range(next_words):
        # Tokenize and pad the current text
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
 
        # Predict the next word
        predicted = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted, axis=-1)[0]
 
        # Convert the predicted index to word
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
 
        # Add the predicted word to the generated text
        generated_text += " " + output_word
 
        # Stop if a period (end of sentence) is generated
        if output_word == '.':
            break
 
    # Replace 'comma' with ',' in the generated sentence
    generated_text = generated_text.replace(' comma ', ', ')
 
    return generated_text
 
# Set the maximum sequence length (should match your model's input configuration)
max_sequence_length = 100  # Adjust to match your model's configuration
 
# Define the prompt just once
prompt = "वृक्ष हों भले खड़े" # कर शपथ, कर शपथ, कर शपथ,
 
# Generate at least 5 different sentences for the prompt
print(f"\nGenerating 5 sentences for the prompt: '{prompt}'\n")
for i in range(5):
    generated_sentence = generate_text(model, tokenizer, prompt, max_sequence_length)
    print(f"Generated sentence {i + 1}: {generated_sentence}\n")


Generating 5 sentences for the prompt: 'वृक्ष हों भले खड़े'

Generated sentence 1: वृक्ष हों भले खड़े था eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत

Generated sentence 2: वृक्ष हों भले खड़े था eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत

Generated sentence 3: वृक्ष हों भले खड़े था eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत

Generated sentence 4: वृक्ष हों भले खड़े था eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत

Generated sentence 5: वृक्ष हों भले खड़े था eol हों घने हों बड़े eol एक पत्र छाँह भी eol माँग मत, माँग मत, माँग मत



# 4 Compare Embedding Quality

### Load the saved embeddings and models:

In [178]:
import json
import numpy as np
import gzip
from tensorflow.keras.models import model_from_json

# Load RNN model and weights
with open('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_architecture_rnn.json', 'r') as json_file:
    loaded_model_json = json_file.read()
rnn_model = model_from_json(loaded_model_json)
rnn_model.load_weights('D:/6106_Neural_Modeling_Method/Assignments/Assignment7/rnn/model_weights_rnn.weights.h5')

# Load RNN embeddings
rnn_embeddings = {}
with open("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_rnn_embeddings5.txt", "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue
        values = line.split()
        if len(values) < 2:
            continue
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype='float32')
            rnn_embeddings[word] = vector
        except ValueError as e:
            print(f"Error on line {line_num}: {e}")

# Load Word2Vec embeddings
word2vec_embeddings = {}
with open("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/hindi_keras_embeddings5.txt", "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue
        values = line.split()
        if len(values) < 2:
            continue
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype='float32')
            word2vec_embeddings[word] = vector
        except ValueError as e:
            print(f"Error on line {line_num}: {e}")

# Specify target words if you only need embeddings for specific words
target_words = set(rnn_embeddings.keys()).union(word2vec_embeddings.keys())

# Limit the number of lines to load from the large pre-trained embeddings file
max_lines = 100000  # Set a limit to prevent loading all data if not needed

# Load pre-trained Hindi embeddings (gzip compressed)
pretrained_embeddings = {}
with gzip.open("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/cc.hi.300.vec.gz", "rt", encoding="utf-8") as f:
    for line_num, line in enumerate(f, 1):
        if line_num > max_lines:
            break
        line = line.strip()
        if not line:
            continue
        values = line.split()
        word = values[0]
        if word not in target_words:  # Only load target words if specified
            continue
        try:
            vector = np.asarray(values[1:], dtype='float32')
            pretrained_embeddings[word] = vector
        except ValueError as e:
            print(f"Error on line {line_num}: {e}")

print(f"Loaded {len(rnn_embeddings)} RNN embeddings")
print(f"Loaded {len(word2vec_embeddings)} Word2Vec embeddings")
print(f"Loaded {len(pretrained_embeddings)} pre-trained embeddings (limited to {max_lines} lines)")


Loaded 1562 RNN embeddings
Loaded 1601 Word2Vec embeddings
Loaded 1454 pre-trained embeddings (limited to 100000 lines)


### Implement evaluation metric, Prepare evaluation data, Evaluate and compare embeddings

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_word_similarity(embeddings, word_pairs):
    similarities = []
    for word1, word2 in word_pairs:
        if word1 in embeddings and word2 in embeddings:
            vec1 = embeddings[word1].reshape(1, -1)
            vec2 = embeddings[word2].reshape(1, -1)
            similarity = cosine_similarity(vec1, vec2)[0][0]
            similarities.append(similarity)
    return np.mean(similarities) if similarities else np.nan

def evaluate_analogy_task(embeddings, analogies):
    correct = 0
    total = 0
    for a, b, c, d in analogies:
        if a in embeddings and b in embeddings and c in embeddings and d in embeddings:
            a_vec, b_vec, c_vec, d_vec = (embeddings[w] for w in (a, b, c, d))
            result = b_vec - a_vec + c_vec
            
            max_similarity = -1
            best_word = None
            for word, vec in embeddings.items():
                if word not in [a, b, c]:
                    similarity = cosine_similarity(result.reshape(1, -1), vec.reshape(1, -1))[0][0]
                    if similarity > max_similarity:
                        max_similarity = similarity
                        best_word = word
            
            if best_word == d:
                correct += 1
            total += 1
    
    return correct / total if total > 0 else 0

def evaluate_chatbot_performance(embeddings, test_questions, test_answers):
    correct = 0
    total = len(test_questions)
    
    for question, correct_answer in zip(test_questions, test_answers):
        question_words = question.split()
        answer_words = correct_answer.split()
        
        question_embedding = np.mean([embeddings.get(word, np.zeros(embeddings[list(embeddings.keys())[0]].shape)) for word in question_words], axis=0)
        answer_embedding = np.mean([embeddings.get(word, np.zeros(embeddings[list(embeddings.keys())[0]].shape)) for word in answer_words], axis=0)
        
        similarity = cosine_similarity(question_embedding.reshape(1, -1), answer_embedding.reshape(1, -1))[0][0]
        
        if similarity > 0.5:  # Adjust this threshold as needed
            correct += 1
    
    return correct / total

# Assuming you have already loaded your embeddings
embedding_types = {
    "RNN": rnn_embeddings,
    "Word2Vec": word2vec_embeddings,
    "Pre-trained": pretrained_embeddings
}

# Define your word pairs, analogies, test questions, and test answers
word_pairs = [
    ("राजा", "रानी"),  # king, queen
    ("मुंबई", "शहर"),  # Mumbai, city
    ("खाना", "भोजन"),  # food, meal
    ("सूरज", "चांद")   # sun, moon
]

analogies = [
    ("राजा", "रानी", "लड़का", "लड़की"),  # king, queen, boy, girl
    ("भारत", "दिल्ली", "फ्रांस", "पेरिस"),  # India, Delhi, France, Paris
    ("गरम", "ठंडा", "दिन", "रात"),  # hot, cold, day, night
]

test_questions = [
    "भारत की राजधानी क्या है?",  # What is the capital of India?
    "सूरज किस दिशा में उगता है?",  # In which direction does the sun rise?
    "पानी का रासायनिक सूत्र क्या है?",  # What is the chemical formula for water?
]

test_answers = [
    "दिल्ली",  # Delhi
    "पूरब",  # East
    "H2O",
]

print("Word Similarity Evaluation:")
for name, embeddings in embedding_types.items():
    similarity_score = evaluate_word_similarity(embeddings, word_pairs)
    print(f"{name} Embeddings: {similarity_score:.4f}")

print("\nAnalogy Task Evaluation:")
for name, embeddings in embedding_types.items():
    analogy_score = evaluate_analogy_task(embeddings, analogies)
    print(f"{name} Embeddings: {analogy_score:.4f}")

print("\nChatbot Performance Evaluation:")
for name, embeddings in embedding_types.items():
    accuracy = evaluate_chatbot_performance(embeddings, test_questions, test_answers)
    print(f"{name} Embeddings: {accuracy:.4f}")

## 5. Train and Compare a Classic DNN Model

## Train a shallow DNN for embeddings:

In [196]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Assuming 'sentences' and 'sequences' are defined and processed as in your setup
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1

# Convert sentences to sequences
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences)

# Prepare the input and target sequences for next word prediction
input_sequences = []
target_words = []
for seq in sequences:
    for i in range(1, len(seq)):
        n_gram_seq = seq[:i+1]
        input_sequences.append(n_gram_seq[:-1])
        target_words.append(n_gram_seq[-1])

# Pad the input sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen=padded_sequences.shape[1])
target_words = np.array(target_words)

# Define the improved DNN model
embedding_dim = 300
model = Sequential([
    Embedding(total_words, embedding_dim, input_length=padded_sequences.shape[1]),
    GlobalAveragePooling1D(),
    Dense(128),
    LeakyReLU(alpha=0.1),
    Dropout(0.4),
    Dense(64),
    LeakyReLU(alpha=0.1),
    Dropout(0.3),
    Dense(32),
    LeakyReLU(alpha=0.1),
    Dense(total_words, activation='softmax')
])

# Use a learning rate scheduler and early stopping for optimization
optimizer = Adam(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Learning rate reduction and early stopping callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    padded_input_sequences, target_words,
    epochs=50, batch_size=64, validation_split=0.2,
    callbacks=[reduce_lr, early_stop]
)

# Extract DNN embeddings
dnn_embeddings = model.layers[0].get_weights()[0]
dnn_word_index = tokenizer.word_index

# Save DNN embeddings in text format
with open("D:/6106_Neural_Modeling_Method/Assignments/Assignment7/dnn_embeddings.txt", "w", encoding="utf-8") as f:
    for word, idx in dnn_word_index.items():
        vector = dnn_embeddings[idx]
        vector_str = " ".join([str(v) for v in vector])
        f.write(f"{word} {vector_str}\n")

print("DNN embeddings saved in text format.")

 

Epoch 1/50




[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.0082 - loss: 7.2449 - val_accuracy: 0.0271 - val_loss: 7.1915 - learning_rate: 0.0010
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0241 - loss: 6.5474 - val_accuracy: 0.0209 - val_loss: 7.4678 - learning_rate: 0.0010
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0207 - loss: 6.4805 - val_accuracy: 0.0271 - val_loss: 7.7528 - learning_rate: 0.0010
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.0202 - loss: 6.4502 - val_accuracy: 0.0271 - val_loss: 7.8555 - learning_rate: 0.0010
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.0220 - loss: 6.4276 - val_accuracy: 0.0271 - val_loss: 7.9813 - learning_rate: 5.0000e-04
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/st

**we successfully generated language embeddings using three distinct methods: RNN-based embeddings, Keras's Word2Vec-based embeddings, and DNN-based embeddings. Each approach was applied to Hindi text data, allowing us to explore the unique qualities and effectiveness of different embedding techniques.
We trained a chatbot using the RNN-based and Keras embeddings, both of which demonstrated satisfactory performance in understanding and responding to Hindi inputs. The RNN embeddings, with their ability to capture sequential dependencies, contributed to enhanced context comprehension, while Keras embeddings provided a reliable baseline performance.**
 