In [21]:
# Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import re
from collections import Counter
import requests
print("✅ complete")

✅ complete


In [22]:
import re
import requests
from collections import Counter

def preprocess_text(text):
    """Lowercase ``text``, strip punctuation, and split it into word tokens.

    Args:
        text: Raw input string.

    Returns:
        A list of lowercase tokens, produced by splitting on whitespace after
        removing underscores and every character that is neither a word
        character nor whitespace. An empty or all-punctuation string yields
        an empty list.
    """
    text = text.lower()
    # `\w` also matches "_", so a plain `[^\w\s]` filter keeps underscores.
    # Plain-text sources that mark emphasis as `_word_` would then produce
    # separate tokens such as "_very_", so underscores are removed explicitly.
    text = re.sub(r"[^\w\s]|_", "", text)
    return text.split()

def build_vocab(word_list, vocab_size=5000):
    """Map the most frequent words to integer ids, reserving 0 for ``<UNK>``.

    Args:
        word_list: Iterable of word tokens.
        vocab_size: Upper bound on the number of entries, counting ``<UNK>``.

    Returns:
        A dict from word to id. The ``vocab_size - 1`` most frequent words
        receive ids 1, 2, ... in descending frequency order, and ``"<UNK>"``
        is mapped to 0.
    """
    frequencies = Counter(word_list)
    vocab = {}
    # Ids start at 1 because 0 is reserved for the unknown-word token.
    for rank, (token, _count) in enumerate(frequencies.most_common(vocab_size - 1), start=1):
        vocab[token] = rank
    vocab["<UNK>"] = 0
    return vocab

def encode_words(word_list, vocab):
    """Translate each word into its vocabulary id.

    Args:
        word_list: Iterable of word tokens.
        vocab: Dict from word to id; must contain ``"<UNK>"`` whenever
            ``word_list`` is non-empty.

    Returns:
        A list of ids, one per input word. Words missing from ``vocab`` get
        the id stored under ``"<UNK>"``.
    """
    ids = []
    for token in word_list:
        # Fall back to the unknown-word id for out-of-vocabulary tokens.
        ids.append(vocab.get(token, vocab["<UNK>"]))
    return ids

# --- Download and process text ---
url = "https://www.gutenberg.org/files/11/11-0.txt"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently tokenizing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Decode explicitly instead of relying on header-based guessing.
# NOTE(review): assumes the file is UTF-8 (Gutenberg's `-0.txt` naming) — confirm.
response.encoding = "utf-8"
raw_text = response.text

words = preprocess_text(raw_text)             # corpus as a flat list of tokens
vocab = build_vocab(words)                    # word -> id, with "<UNK>" at id 0
encoded = encode_words(words, vocab)          # corpus as a flat list of ids
idx2word = {i: w for w, i in vocab.items()}   # inverse mapping: id -> word

In [23]:
# --- Dataset Class ---
class LanguageModelDataset(Dataset):
    """Sliding-window dataset for next-token prediction.

    Item ``i`` is the pair ``(x, y)``, where ``x`` holds the ``seq_len``
    consecutive token ids starting at position ``i`` and ``y`` is the id that
    immediately follows that window.

    Args:
        data: Indexable, sliceable sequence of integer token ids.
        seq_len: Number of context tokens in each input window.
    """

    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        # A corpus too short for one window plus its target has no examples.
        # Clamp at zero, because len() raises ValueError on a negative result.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for window ``idx`` as ``torch.long`` tensors.

        Negative indices count from the end, like a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            # Without this, a negative idx would slice a wrong or empty window.
            idx += size
        if not 0 <= idx < size:
            raise IndexError("LanguageModelDataset index out of range")
        stop = idx + self.seq_len
        # Token ids feed nn.Embedding and CrossEntropyLoss, so pin the integer
        # dtype instead of letting it be inferred from the container.
        x = torch.as_tensor(self.data[idx:stop], dtype=torch.long)
        y = torch.as_tensor(self.data[stop], dtype=torch.long)
        return x, y

# Context length: each example is `seq_len` token ids plus the id that follows.
seq_len = 5
dataset = LanguageModelDataset(data=encoded, seq_len=seq_len)
# Shuffle every epoch and drop the final short batch so all batches hold 64 items.
loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    drop_last=True,
)
print("✅ complete")

✅ complete


In [24]:
# --- Model Definition ---
class NeuralLM(nn.Module):
    """GRU language model that scores the token following a context window.

    Pipeline: token ids -> embedding -> batch-first GRU -> linear projection
    of the last time step onto the vocabulary.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super().__init__()
        # Attribute names define the state_dict keys used by saved checkpoints.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for the token after each sequence.

        Args:
            x: Integer token ids, expected as ``(batch, seq_len)``.

        Returns:
            Tensor of logits with one row per sequence and ``vocab_size`` columns.
        """
        embedded = self.embedding(x)
        hidden_states, _final_hidden = self.rnn(embedded)
        # Only the GRU output at the final time step feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Seed PyTorch's global RNG before any weights are created so that parameter
# initialisation and the shuffled batch order repeat on Restart & Run All.
torch.manual_seed(42)

# One embedding row and one output logit per vocabulary entry.
vocab_size = len(vocab)
model = NeuralLM(vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
# Expects raw logits and integer class targets (softmax is applied internally).
criterion = nn.CrossEntropyLoss()
print("✅ complete")

✅ complete


In [25]:
# --- Training Loop ---
for epoch in range(100):
    # predict_next_word() switches the model to eval mode. Re-enter training
    # mode explicitly so re-running this cell never trains in eval mode.
    model.train()
    # Sum of the per-batch mean losses for this epoch (not averaged over
    # batches), so the printed value scales with the number of batches.
    total_loss = 0
    for inputs, targets in loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
    # One checkpoint file per epoch, written to the working directory.
    torch.save(model.state_dict(), f"neural_lm_epoch{epoch+1}.pt")
print("✅ complete")

Epoch 1, Loss: 2409.8725
Epoch 2, Loss: 1919.3764
Epoch 3, Loss: 1516.2150
Epoch 4, Loss: 1153.0828
Epoch 5, Loss: 884.2304
Epoch 6, Loss: 696.0562
Epoch 7, Loss: 551.7852
Epoch 8, Loss: 448.5031
Epoch 9, Loss: 372.3228
Epoch 10, Loss: 324.1809
Epoch 11, Loss: 291.6503
Epoch 12, Loss: 261.1475
Epoch 13, Loss: 239.8613
Epoch 14, Loss: 230.3989
Epoch 15, Loss: 224.7477
Epoch 16, Loss: 228.2571
Epoch 17, Loss: 214.5289
Epoch 18, Loss: 200.2079
Epoch 19, Loss: 200.6325
Epoch 20, Loss: 208.5522
Epoch 21, Loss: 202.3596
Epoch 22, Loss: 207.9450
Epoch 23, Loss: 198.6125
Epoch 24, Loss: 193.0460
Epoch 25, Loss: 206.5504
Epoch 26, Loss: 201.2641
Epoch 27, Loss: 195.6051
Epoch 28, Loss: 199.8329
Epoch 29, Loss: 201.3827
Epoch 30, Loss: 194.5481
Epoch 31, Loss: 210.2118
Epoch 32, Loss: 194.4947
Epoch 33, Loss: 194.0844
Epoch 34, Loss: 206.6296
Epoch 35, Loss: 208.2340
Epoch 36, Loss: 208.5478
Epoch 37, Loss: 207.9523
Epoch 38, Loss: 202.8148
Epoch 39, Loss: 206.0009
Epoch 40, Loss: 194.9866
Epoch

In [26]:
# --- Next Word Prediction ---
def predict_next_word(seed_text):
    """Return the model's most likely next word after ``seed_text``.

    Relies on the notebook globals ``model``, ``vocab``, ``idx2word`` and
    ``seq_len``, and leaves ``model`` in eval mode.

    Args:
        seed_text: Prompt string. It is preprocessed like the training text,
            and only its last ``seq_len`` tokens are used.

    Returns:
        The vocabulary word whose logit is highest; this can be ``"<UNK>"``.
    """
    model.eval()
    context_ids = encode_words(preprocess_text(seed_text)[-seq_len:], vocab)
    # Left-pad short prompts to the window length the model was trained on.
    # The pad id is looked up rather than hard-coded as 0, so it stays in sync
    # with whatever id the vocabulary assigns to "<UNK>".
    missing = seq_len - len(context_ids)
    if missing > 0:
        context_ids = [vocab["<UNK>"]] * missing + context_ids
    # Build the batch of one on the model's own device with an explicit
    # integer dtype, so this still works after `model.to(...)`.
    device = next(model.parameters()).device
    input_tensor = torch.tensor([context_ids], dtype=torch.long, device=device)
    with torch.no_grad():
        logits = model(input_tensor)
    next_word_id = torch.argmax(logits, dim=-1).item()
    return idx2word[next_word_id]

# Name the prompt once so the echoed input and the model input cannot diverge.
prompt = "she was not a bit"
print("Input:", prompt)
print("Next word prediction:", predict_next_word(prompt))
print("✅ complete")

Input: she was not a bit
Next word prediction: hurt
✅ complete


In [27]:
# Same prompt again: the prediction is an argmax, so it should not change.
prompt = "she was not a bit"
print("Input:", prompt)
print("Next word prediction:", predict_next_word(prompt))

Input: she was not a bit
Next word prediction: hurt


In [28]:
# Mixed-case prompt; the predictor lowercases its input before encoding.
prompt = "Alice fell down the rabbit"
print("Input:", prompt)
print("Next word prediction:", predict_next_word(prompt))

Input: Alice fell down the rabbit
Next word prediction: then


In [29]:
# Free-form prompt; any word missing from the vocabulary is encoded as <UNK>.
prompt = "I am going to attack"
print("Input:", prompt)
print("Next word prediction:", predict_next_word(prompt))

Input: I am going to attack
Next word prediction: the
