<a href="https://colab.research.google.com/github/nikhilRajput-prog/Deep-Learning-Lab-File/blob/main/Deep_Learning_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import re
import time


In [None]:
# Load the poem corpus from the working directory. The preview printed below
# shows a single `text` column (presumably one poem per row -- verify against the CSV).
df = pd.read_csv("poem.csv")
print(df.head())
print("\nTotal rows:", df.shape[0])


                                                text
0  O my Luve's like a red, red rose\nThatâ€™s newly...
1  The rose is red,\nThe violet's blue,\nSugar is...
2  How do I love thee? Let me count the ways.\nI ...
3  Had I the heavens' embroidered cloths,\nEnwrou...
4  I.\n    Enough! we're tired, my heart and I.\n...

Total rows: 100


In [None]:
# Tokenise the corpus: lowercase each poem, strip every character that is not
# a-z or whitespace (so apostrophes, digits and punctuation vanish and e.g.
# "luve's" becomes "luves"), then split on whitespace.
words = [
    token
    for poem in df['text'].astype(str)
    for token in re.sub(r'[^a-z\s]', '', poem.lower()).split()
]

# Sorted unique tokens give a deterministic word <-> index mapping.
vocab = sorted(set(words))
vocab_size = len(vocab)

word2idx = dict(zip(vocab, range(vocab_size)))
idx2word = dict(enumerate(vocab))

print("Vocabulary size:", vocab_size)
print("Sample words:", vocab[:10])


Vocabulary size: 5439
Sample words: ['a', 'abase', 'abased', 'abbeystones', 'abeyance', 'abide', 'abode', 'abodes', 'about', 'above']


In [None]:
# Translate the token stream into vocabulary indices, keeping corpus order.
encoded_words = list(map(word2idx.__getitem__, words))

print("First 20 encoded words:")
print(encoded_words[:20])


First 20 encoded words:
[3167, 3054, 2775, 2664, 0, 3748, 3748, 3897, 4726, 3109, 4407, 2351, 2473, 3167, 3054, 2775, 2664, 4727, 2896, 4726]


In [None]:
# Context length: each training example is SEQ_LEN consecutive word indices,
# and its target is the index of the word that immediately follows.
SEQ_LEN = 5

# One window per start position; windows run across poem boundaries because
# `encoded_words` is a single flat stream.
X = torch.tensor(
    [encoded_words[start:start + SEQ_LEN] for start in range(len(encoded_words) - SEQ_LEN)]
)
# The target of the window starting at `start` is element `start + SEQ_LEN`,
# i.e. the stream with its first SEQ_LEN items dropped.
y = torch.tensor(encoded_words[SEQ_LEN:])

print("Input shape:", X.shape)
print("Target shape:", y.shape)


Input shape: torch.Size([24671, 5])
Target shape: torch.Size([24671])


In [None]:
class SimpleRNN_Numpy:
    """Forward-only vanilla RNN written with plain NumPy.

    Weights are drawn from a standard normal scaled by 0.01 (using NumPy's
    global random state); both bias vectors start at zero. No training
    logic is implemented here.
    """

    def __init__(self, vocab_size, hidden_size):
        """Create the parameters.

        Args:
            vocab_size: Size of each input vector and of each output vector.
            hidden_size: Number of hidden units.
        """
        scale = 0.01
        # Draw order (Wxh, Whh, Why) matters for reproducibility under a fixed seed.
        self.Wxh = scale * np.random.randn(hidden_size, vocab_size)   # input  -> hidden
        self.Whh = scale * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden
        self.Why = scale * np.random.randn(vocab_size, hidden_size)   # hidden -> output
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))

    def forward(self, inputs):
        """Run the recurrence over a sequence, starting from a zero hidden state.

        Args:
            inputs: Iterable of array-likes exposing ``reshape``; each one is
                reshaped to a ``(vocab_size, 1)`` column before use.

        Returns:
            A list with one ``(vocab_size, 1)`` array of unnormalised scores
            per input step (empty list for an empty sequence).
        """
        hidden_dim = self.Whh.shape[0]
        state = np.zeros((hidden_dim, 1))
        scores_per_step = []
        for step_input in inputs:
            column = step_input.reshape(-1, 1)
            pre_activation = self.Wxh @ column + self.Whh @ state + self.bh
            state = np.tanh(pre_activation)
            scores_per_step.append(self.Why @ state + self.by)
        return scores_per_step

# Instantiate the NumPy reference RNN with a 32-unit hidden state.
rnn_np = SimpleRNN_Numpy(vocab_size=vocab_size, hidden_size=32)
print("NumPy RNN initialized")


NumPy RNN initialized


In [None]:
# One-hot encode every window: X_onehot[i, t, X[i, t]] = 1, zeros elsewhere.
# A single vectorised scatter along the vocabulary axis replaces the nested
# Python loop (one tensor write per (window, step) pair) and yields the same
# float32 tensor.
# NOTE: this tensor is dense -- for the shape printed below
# (24671 x 5 x 5439 float32) it occupies about 2.7 GB of RAM.
X_onehot = torch.zeros(X.size(0), SEQ_LEN, vocab_size)
if X.numel() > 0:  # an empty X has no index rows to scatter
    X_onehot.scatter_(2, X.unsqueeze(-1), 1.0)

print("One-hot input shape:", X_onehot.shape)


One-hot input shape: torch.Size([24671, 5, 5439])


In [None]:
class OneHotRNN(nn.Module):
    """Next-word classifier: a vanilla RNN over one-hot inputs plus a linear head."""

    def __init__(self, vocab_size, hidden_size):
        """Build the layers.

        Args:
            vocab_size: Width of each one-hot input vector and number of output classes.
            hidden_size: Number of hidden units in the RNN.
        """
        super().__init__()
        self.rnn = nn.RNN(input_size=vocab_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits computed from the last timestep.

        Args:
            x: Batch-first input, indexed as ``(batch, time, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with unnormalised scores.
        """
        hidden_states, _final_state = self.rnn(x)
        # Only the hidden state after the final step feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# One-hot model with 128 hidden units, trained with cross-entropy and Adam.
# NOTE: `criterion` is reused when the embedding model is trained, and the
# name `optimizer` is rebound to the embedding model's optimizer further
# down -- re-run this cell before re-running the one-hot training loop.
onehot_model = OneHotRNN(vocab_size=vocab_size, hidden_size=128)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=onehot_model.parameters(), lr=0.003)

print(onehot_model)


OneHotRNN(
  (rnn): RNN(5439, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=5439, bias=True)
)


In [None]:
# Train the one-hot model for 10 full-batch steps (the whole X_onehot tensor
# goes through the network on every step) and time the loop.
# `time` is already imported in the notebook's import cell, so it is not
# re-imported here. perf_counter() is monotonic, so the measured duration
# cannot be distorted by system clock adjustments the way time.time() can.
# NOTE: `optimizer` must still be the one created for `onehot_model`; the name
# is rebound for the embedding model later in the notebook.
start_time = time.perf_counter()

for epoch in range(10):
    optimizer.zero_grad()
    outputs = onehot_model(X_onehot)
    # `loss_onehot` ends up holding the loss computed before the final step.
    loss_onehot = criterion(outputs, y)
    loss_onehot.backward()
    optimizer.step()

onehot_time = time.perf_counter() - start_time


In [None]:
# Report the last recorded training loss and the elapsed training time (seconds).
print("Final One-Hot Loss:", float(loss_onehot))
print("One-Hot Training Time:", onehot_time)


Final One-Hot Loss: 6.76938009262085
One-Hot Training Time: 130.3256962299347


In [None]:
def generate_onehot(start_word, length=15):
    """Greedily generate text with the one-hot RNN.

    Starting from ``start_word``, repeatedly feeds the most recent (at most
    ``SEQ_LEN``) generated words to ``onehot_model`` and appends the
    highest-scoring next word.

    Args:
        start_word: Seed word; must be a key of ``word2idx``.
        length: Number of words to generate after the seed.

    Returns:
        The seed followed by the generated words, joined by single spaces.

    Raises:
        KeyError: If ``start_word`` is not in the vocabulary (when ``length > 0``).
    """
    generated = [start_word]
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(length):
            context = generated[-SEQ_LEN:]
            # Feed exactly len(context) timesteps. A fixed SEQ_LEN window
            # filled from the left would leave all-zero trailing steps while
            # the context is short, and the model reads its prediction from
            # the LAST timestep -- i.e. from padding rather than the prompt.
            inp = torch.zeros(1, len(context), vocab_size)
            for step, word in enumerate(context):
                inp[0, step, word2idx[word]] = 1
            logits = onehot_model(inp)
            generated.append(idx2word[logits.argmax().item()])
    return " ".join(generated)

print(generate_onehot("love"))


love the the the the the the the the the the the the the the the


In [None]:
class EmbeddingRNN(nn.Module):
    """Next-word classifier: embedding lookup, vanilla RNN, then a linear head."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """Build the layers.

        Args:
            vocab_size: Number of distinct word indices and of output classes.
            embed_size: Dimension of each learned word vector.
            hidden_size: Number of hidden units in the RNN.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits computed from the last timestep.

        Args:
            x: Integer word indices, indexed as ``(batch, time)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with unnormalised scores.
        """
        embedded = self.embedding(x)
        hidden_states, _final_state = self.rnn(embedded)
        # Classify from the hidden state after the final step only.
        return self.fc(hidden_states[:, -1, :])

# Embedding model: 100-dimensional word vectors feeding a 128-unit RNN.
# NOTE: this rebinds the global `optimizer`, so from here on it updates
# `embed_model` rather than the one-hot model.
embed_model = EmbeddingRNN(vocab_size=vocab_size, embed_size=100, hidden_size=128)
optimizer = torch.optim.Adam(params=embed_model.parameters(), lr=0.003)

print(embed_model)


EmbeddingRNN(
  (embedding): Embedding(5439, 100)
  (rnn): RNN(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=5439, bias=True)
)


In [None]:
# Train the embedding model for 10 full-batch steps on the index tensor X and
# time the loop. perf_counter() is monotonic, so the measured duration cannot
# be distorted by system clock adjustments the way time.time() can.
# Relies on `criterion` from the one-hot setup cell and on `optimizer` being
# the one created for `embed_model`.
start_time = time.perf_counter()

for epoch in range(10):
    optimizer.zero_grad()
    outputs = embed_model(X)
    # `loss_embed` ends up holding the loss computed before the final step.
    loss_embed = criterion(outputs, y)
    loss_embed.backward()
    optimizer.step()

embed_time = time.perf_counter() - start_time

print("Final Embedding Loss:", loss_embed.item())
print("Embedding Training Time:", embed_time)


Final Embedding Loss: 6.803170204162598
Embedding Training Time: 47.6439049243927


In [None]:
# Side-by-side view of the last recorded training loss of each model.
print("Final One-Hot Loss:", float(loss_onehot))
print("Final Embedding Loss:", float(loss_embed))


Final One-Hot Loss: 6.76938009262085
Final Embedding Loss: 6.803170204162598


In [None]:
# Summarise both runs. The verdict compares the last recorded *training*
# losses only; a tie is reported in favour of the one-hot model.
print("\n========= COMPARISON SUMMARY =========")
print(f"One-Hot Encoding  -> Loss: {loss_onehot.item():.4f}, Time: {onehot_time:.2f}s")
print(f"Word Embeddings   -> Loss: {loss_embed.item():.4f}, Time: {embed_time:.2f}s")

print(
    "Embedding model performs better based on loss."
    if loss_embed.item() < loss_onehot.item()
    else "One-Hot model performs better based on loss."
)



One-Hot Encoding  -> Loss: 6.7694, Time: 130.33s
Word Embeddings   -> Loss: 6.8032, Time: 47.64s
One-Hot model performs better based on loss.


In [None]:
def generate_embedding(start_word, length=15):
    """Greedily generate text with the embedding RNN.

    Starting from ``start_word``, repeatedly feeds the indices of the most
    recent (at most ``SEQ_LEN``) generated words to ``embed_model`` and
    appends the highest-scoring next word.

    Args:
        start_word: Seed word; must be a key of ``word2idx``.
        length: Number of words to generate after the seed.

    Returns:
        The seed followed by the generated words, joined by single spaces.

    Raises:
        KeyError: If ``start_word`` is not in the vocabulary (when ``length > 0``).
    """
    generated = [start_word]
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for _ in range(length):
            context = generated[-SEQ_LEN:]
            # Shape (1, len(context)): a batch of one index sequence.
            seq = torch.tensor([word2idx[word] for word in context]).unsqueeze(0)
            logits = embed_model(seq)
            generated.append(idx2word[logits.argmax().item()])
    return " ".join(generated)

print(generate_embedding("love"))


love same and the and and and and and and and and and and and and
