In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/corpus/Ulysses.txt
/kaggle/input/corpus/Pride_and_Prejudice.txt


In [1]:
import re
import string
import sys

class Tokenizer:
    def __init__(self, corpus):
        self.corpus = corpus
        self.preprocess_corpus()
        self.sen_tokenize = re.compile(r'(?<=[.!?]")[\s]|(?<=[.!?])[\s]')
        self.word_and_punct_tokenize = re.compile(r'''(?:[A-Z]\.)+|\w+(?:-\w+)*|\w+(?:'\w+)?|\.\.\.|(?:Mr|Mrs|Dr|Ms)\.|\w+|[^\w\s]|'\w+''')
        self.num_tokenize = re.compile(r'\b\d+(\.\d+)?\b')
        self.mail_tokenize = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        self.url_tokenize = re.compile(r'(http[s]?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.hash_tokenize = re.compile(r'#[\w\-]+')
        self.mention_tokenize = re.compile(r'@[\w\-]+')
        self.money_tokenize = re.compile(r'\b\d+(\.\d+)?\s?\$|\$\s?\d+(\.\d+)?\b')
        self.percent_tokenize = re.compile(r'\b\d+(\.\d+)?\%')
        self.age_tokenize = re.compile(r'\b\d{1,3}(?:\s|-)?(?:year(?:s)?\s?-?\s?old)\b')
        self.time_tokenize = re.compile(r'\b\d{1,2}:\d{2}\s?(?:AM|PM)?\b|\b(?:morning|afternoon|evening|night)\b', flags=re.IGNORECASE)
        self.date_tokenize = re.compile(r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{2,4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2,4}|\d{1,2}\s(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{2,4})\b')

    def preprocess_corpus(self):
        self.corpus = self.corpus.lower()
        self.corpus = re.sub(r'\s+', ' ', self.corpus)
        alphanumeric_tokenize = re.compile(r'(\d+|\D+)')
        self.corpus = ' '.join([' '.join(alphanumeric_tokenize.findall(word)) if alphanumeric_tokenize.match(word) and '.' not in word else word for word in self.corpus.split()])
        self.corpus = self.corpus.replace("_", "")
        self.corpus = re.sub(r'\b(mr|mrs|ms|dr|prof|rev|rd|st|gen|rep|sen|sr|jr)\.', r'\1', self.corpus)

    def replace_entities(self, text):
        text = re.sub(self.url_tokenize, '<URL>', text)
        text = re.sub(self.mail_tokenize, '<MAILID>', text)
        text = re.sub(self.date_tokenize, '<DATE>', text)
        text = re.sub(self.time_tokenize, '<TIME>', text)
        text = re.sub(self.age_tokenize, '<AGE>', text)
        text = re.sub(self.percent_tokenize, '<PERCENTAGE>', text)
        text = re.sub(self.money_tokenize, '<CURRENCY>', text)
        text = re.sub(self.num_tokenize, '<NUM>', text)
        text = re.sub(self.mention_tokenize, '<MENTION>', text)
        text = re.sub(self.hash_tokenize, '<HASHTAG>', text)
        return text
    
    def tokenize_sentence(self, text):
        sentences = re.split(self.sen_tokenize, text)
        sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
        return sentences

    def tokenize_word(self, sentence):
        words_and_punctuations = []
        
        # Keep special tokens and words, but remove standalone punctuations
        tokens = re.findall(r'<URL>|<MAILID>|<DATE>|<TIME>|<AGE>|<PERCENTAGE>|<CURRENCY>|<NUM>|<MENTION>|<HASHTAG>|' + self.word_and_punct_tokenize.pattern, sentence)
        
        for token in tokens:
            # Remove pure punctuation tokens (excluding predefined tags)
            if token.isalnum() or token.startswith("<") and token.endswith(">"):
                words_and_punctuations.append(token)
        
        return words_and_punctuations

    def tokenize(self):
        self.corpus = self.replace_entities(self.corpus)
        sentences = self.tokenize_sentence(self.corpus)
        tokenized_corpus = [self.tokenize_word(sentence) for sentence in sentences]
        return tokenized_corpus

def tokenize_sentences(text):
    # sentences = re.split(
    #     r'(?<=[.!?])"? | (?<=[.!?])\s+(?=[A-Z])', text.strip())

    patterns = {
        "percentage": r"\b\d+(\.\d+)?%",  # Match percentages like 45%
        "url": r"(https?://\S+|www\.\S+)",  # Match URLs
        # Match emails
        "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
        "hashtag": r"#\w+",  # Match hashtags
        "mention": r"@\w+",  # Match mentions
        "age": r"\b\d{1,3}\s?(years old|yo|yrs)\b",  # Match age patterns
        # Match times like 12:30 PM
        "time": r"\b\d{1,2}:\d{2} ?(AM|PM|am|pm)?\b",
        "number": r"\b\d+(\.\d+)?\b",  # Match any number
        "date": r"\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b"  # Match dates like 15th October
    }

    # Placeholders for substitution
    placeholders = {
        "url": "<URL>",
        "email": "<EMAIL>",
        "hashtag": "<HASHTAG>",
        "mention": "<MENTION>",
        "percentage": "<PERCENTAGE>",
        "age": "<AGE>",
        "time": "<TIME>",
        "number": "<NUMBER>",
        "date": "<DATE>"
    }

    for key, pattern in patterns.items():
        text = re.sub(pattern, placeholders[key], text)
    
    
    # Split text into tokens, keeping punctuation separate
    tokens = re.findall(r'\b\w+\b|[.,!?;(){}\[\]":\'/-]', text)  # Match words and individual punctuation
    temp_placeholders=['URL', 'EMAIL', 'HASHTAG', 'MENTION','PERCENTAGE','AGE','TIME','NUMBER', 'DATE']
    
    # Clean up extra spaces around the tokens
    tokens = [token if token not in temp_placeholders else '<'+token+'>' for token in tokens]
    
    return tokens

def tokenize_text(corpus):
        # Remove quotation marks before splitting into sentences
        corpus = re.sub(r'[\'"]', '', corpus)
        corpus = re.sub(r'[\'_]', '', corpus)
        corpus = re.sub(r'[\'-]', '', corpus)
        

        # Split into sentences considering punctuation marks and the case of the next letter
        sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', corpus.strip())
        
        tokenized_sentences = [tokenize_sentences(sentence) for sentence in sentences]
        return tokenized_sentences
    
    
def read_corpus(file_path: str):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    with open("output.txt", 'w', encoding='utf-8') as file:
        file.write(text)

    return tokenize_text(text)


def main():
    # sentences = read_corpus("./pride_and_prejudice.txt")
    # for i in range(30):
    #     print (sentences[i])
    # print()
    print("Welcome to the Toknizer:")
    while True:
        text = input("\nEnter text to tokenize (or 'quit' to exit): ")
        if text.lower() == 'quit':
            print("Thank you for using tokenizer bye!")
            break

        tokens = tokenize_text(text)
        print("\nTokenized text:")
        print(tokens)

if __name__ == "__main__":
    main()


Welcome to the Toknizer:
Thank you for using tokenizer bye!


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import os
# from tokenizer import Tokenizer

def tokenize_corpus(corpus_path, n):
    with open(corpus_path, "r", encoding="utf-8") as file:
        corpus = file.read()

    tokenizer = Tokenizer(corpus)
    tokenized_sentences = tokenizer.tokenize()

    # Add <s> and </s> tokens
    final_tokenized = []
    for sen in tokenized_sentences:
        final_tokenized.append(["<s>"] * (n-1) + sen + ["</s>"])

    # Split dataset into train (80%), val (10%), test (10%)
    train_sentences, temp_sentences = train_test_split(final_tokenized, test_size=0.2, random_state=42)
    val_sentences, test_sentences = train_test_split(temp_sentences, test_size=0.5, random_state=42)

    # Build Vocabulary (Add an <UNK> token for unseen words)
    vocab = {word: idx for idx, word in enumerate(set(word for sentence in train_sentences for word in sentence))}
    vocab["<UNK>"] = len(vocab)  # Add <UNK> token

    return train_sentences, val_sentences, test_sentences, vocab




class NGramDataset(Dataset):
    """ Dataset for N-Gram Language Model """
    def __init__(self, tokenized_corpus, n, vocab):
        self.n = n
        self.vocab = vocab
        self.data = []

        for sentence in tokenized_corpus:
            for i in range(len(sentence) - n):
                context = sentence[i:i + n - 1]
                target = sentence[i + n - 1]
                self.data.append((context, target, sentence))  # Store sentence here!

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target, sentence = self.data[idx]
        context_tensor = torch.tensor([self.vocab.get(word, self.vocab["<UNK>"]) for word in context], dtype=torch.long)
        target_tensor = torch.tensor(self.vocab.get(target, self.vocab["<UNK>"]), dtype=torch.long)
        return context_tensor, target_tensor, " ".join(sentence)  # Return full sentence as string



class FFNNLanguageModel(nn.Module):
    """ Feed Forward Neural Network Language Model """
    def __init__(self, vocab_size, embed_size, hidden_size, n,dropout_rate):
        super(FFNNLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc1 = nn.Linear((n-1) * embed_size, hidden_size)
        self.relu = nn.GELU()# TRY Gelu
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc1(x)
        x = self.relu(x)

        x = self.layer_norm(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)


def calculate_nll(
    model: torch.nn.Module,
    loader: torch.utils.data.DataLoader,
    device: torch.device,
) -> tuple[list[float], list[str]]:
    model.eval()

    sentence_perplexities, sentences = [], []
    with torch.no_grad():
        for context, target, batch_sentences in loader:
            context, target = context.to(device), target.to(device)
            # each of these are (batch_size, item)
            output = model(context)
            if output.size(0) != target.size(0):
                # Adjust output to match target size
                if output.size(0) < target.size(0):
                    target = target[: output.size(0)]
                else:
                    output = output[: target.size(0)]

            loss = torch.nn.NLLLoss(reduction="none")(output, target)
            num_sentences = len(loss)
            # append the perplexities one by one
            sentence_perplexities.extend(loss.cpu().numpy().tolist())
            sentences.extend(batch_sentences[:num_sentences])
    return sentence_perplexities, sentences

import numpy as np
import torch
from torch.utils.data import DataLoader
from typing import List, Tuple

def set_perplexity(
    model: torch.nn.Module,
    loader: DataLoader,
    device: torch.device,
    file_name: str,
) -> None:
    """
    Computes and saves the perplexity of sentences in the given DataLoader.

    Args:
    - model (torch.nn.Module): Trained model.
    - loader (DataLoader): DataLoader containing the dataset.
    - device (torch.device): CPU/GPU device.
    - file_name (str): Output file name to save perplexity scores.
    """
    nll_losses, sentences = calculate_nll(model, loader, device)
    assert len(nll_losses) == len(
        sentences
    ), "[set_perplexity] nll losses should be same length as sentences"

    sentence_perplexity = defaultdict(list)  
    
    for sentence, nll_loss in zip(sentences, nll_losses):
        sentence_perplexity[sentence].append(nll_loss)
        
    with open(file_name, "w") as f:
        for sentence in sentence_perplexity:
            perplexity = np.mean([np.exp(s) for s in sentence_perplexity[sentence]])
            f.write(f"{sentence}\t\t\t\t{(perplexity)}\n")

        average_perplexity = np.exp(sum(nll_losses) / len(nll_losses))
        f.write(f"\naverage perplexity: {average_perplexity}\n")
        print(f"Average perplexity of {file_name}: ", average_perplexity)


import torch
import torch.nn as nn
import torch.optim as optim

def train(
    model: torch.nn.Module,
    train_loader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    device: torch.device,
    epoch: int,
) -> float:
    """
    Train the model for one epoch.

    Args:
    - model (torch.nn.Module): The FFNN language model.
    - train_loader (DataLoader): DataLoader for training dataset.
    - optimizer (torch.optim.Optimizer): Optimizer for training.
    - criterion (nn.Module): Loss function (nn.NLLLoss).
    - device (torch.device): Device to train on (CPU/GPU).

    Returns:
    - float: Average training loss per sample.
    """
    num_items = len(train_loader.dataset)
    assert num_items > 0, "[train] Training data must be present"
    
    model.train()
    total_loss = 0

    for e in range(epoch):
        total_loss=0 
        for context, target,_ in train_loader:  # Expecting dataset to return (context, target, sentence)
            context, target =/kaggle/input/corpus context.to(device), target.to(device)
            optimizer.zero_grad()

            # Forward pass
            output = model(context)

            # Ensure output and target sizes match
            if output.size(0) != target.size(0):
                if output.size(0) < target.size(0):
                    target = target[: output.size(0)]
                else:
                    output = output[: target.size(0)]

            # Compute loss
            loss = criterion(output, target)
            # Backpropagation
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {e} Loss: ",total_loss / num_items)

    return model

def run_experiment(corpus_path, n, model_type="FFNN"):
    # Tokenize, build vocab, etc.
    train_sentences, val_sentences, test_sentences, vocab = tokenize_corpus(corpus_path, n)

    # Create Datasets
    train_dataset = NGramDataset(train_sentences, n, vocab)
    val_dataset = NGramDataset(val_sentences, n, vocab)
    test_dataset = NGramDataset(test_sentences, n, vocab)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, drop_last=False)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, drop_last=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last=False)
    
    # Initialize model
    vocab_size = len(vocab)
    embed_size = (300)
    hidden_size = (n*80)

    model = FFNNLanguageModel(vocab_size, embed_size, hidden_size, n,0.6)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

     # Set up optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.NLLLoss()

    # Train the model
    print(f"\nTraining FFNN Model for {n}-grams on {corpus_path}...\n")
    epoch=5
    # if(n==5):
    #     epoch=4
    model = train(model, train_loader, optimizer, criterion, device,epoch)

    # Save the trained model
    model_save_path = f"trained_model_{n}_{os.path.basename(corpus_path)}.pt"
    torch.save({
        'model_state_dict': model.state_dict(),
        'vocab': vocab,  # Save the vocab so we can reuse it later
        'n': n
    }, model_save_path)
    print(f"Model saved to {model_save_path}")
    
    # Compute Sentence-Level Perplexity
    criterion = nn.CrossEntropyLoss(reduction='none')
    # Compute Perplexity for Train, Validation, and Test sets & Save to files
    train_file = f"{n}_{corpus_path.split('/')[-1]}_train_perplexity.txt"
    val_file = f"{n}_{corpus_path.split('/')[-1]}_val_perplexity.txt"
    test_file = f"{n}_{corpus_path.split('/')[-1]}_test_perplexity.txt"

    print("\nCalculating and saving Train Perplexity...")
    set_perplexity(model, train_loader, device, train_file)

    print("\nCalculating and saving Validation Perplexity...")
    set_perplexity(model, val_loader, device, val_file)

    print("\nCalculating and saving Test Perplexity...")
    set_perplexity(model, test_loader, device, test_file)

    print(f"Perplexity scores saved in: {train_file}, {val_file}, {test_file}")


    # If you also want to see the old approach (token-level) by summing everything:
    # train_ppl = compute_perplexity(model, train_loader, criterion, vocab, "train_output.txt")
    # test_ppl = compute_perplexity(model, test_loader, criterion, vocab, "test_output.txt")
    # print(f"Token-Level Train PPL: {train_ppl:.4f}, Token-Level Test PPL: {test_ppl:.4f}")

    return model_save_path



if __name__ == "__main__":
    # Corpus Paths
    pride_corpus = "iNLP/Pride_and_Prejudice.txt"
    ulysses_corpus = "iNLP/Ulysses.txt"

    saved_models = []  # Store the saved model paths

    # Run for n=3 and n=5
    for n in [3,5]:
        saved_models.append(run_experiment(pride_corpus, n))
        saved_models.append(run_experiment(ulysses_corpus, n))

    print("\nSaved models:", saved_models)


Training FFNN Model for 3-grams on iNLP/Pride_and_Prejudice.txt...

Epoch 0 Loss:  0.1907750821006137
Epoch 1 Loss:  0.17088116370503403
Epoch 2 Loss:  0.16388738821552107
Epoch 3 Loss:  0.15874147350930187
Epoch 4 Loss:  0.1545520540613739
Model saved to trained_model_3_Pride_and_Prejudice.txt.pt

Calculating and saving Train Perplexity...
Average perplexity of 3_Pride_and_Prejudice.txt_train_perplexity.txt:  80.16537281333902

Calculating and saving Validation Perplexity...
Average perplexity of 3_Pride_and_Prejudice.txt_val_perplexity.txt:  207.3287804036289

Calculating and saving Test Perplexity...
Average perplexity of 3_Pride_and_Prejudice.txt_test_perplexity.txt:  219.7100929023118
Perplexity scores saved in: 3_Pride_and_Prejudice.txt_train_perplexity.txt, 3_Pride_and_Prejudice.txt_val_perplexity.txt, 3_Pride_and_Prejudice.txt_test_perplexity.txt

Training FFNN Model for 3-grams on iNLP/Ulysses.txt...

Epoch 0 Loss:  0.23269224667824212
Epoch 1 Loss:  0.21672130941680667


KeyboardInterrupt: 

In [25]:
import torch
import torch.nn as nn
import sys
import os
import numpy as np
#from FFNN_language_model import FFNNLanguageModel  # Import FFNN model class

def load_model(model_path, device):
    """
    Loads a pre-trained FFNN language model from the given path.

    Args:
    - model_path (str): Path to the trained model file.
    - device (torch.device): Device to load the model on (CPU/GPU).

    Returns:
    - model (torch.nn.Module): Loaded FFNN model.
    - vocab (dict): Vocabulary used in training.
    - n (int): N-gram size used in training.
    """
    checkpoint = torch.load(model_path, map_location=device)

    vocab = checkpoint['vocab']
    n = checkpoint['n']
    vocab_size = len(vocab)
    embed_size = 300
    hidden_size = 256

    # Initialize the model
    model = FFNNLanguageModel(vocab_size, embed_size, hidden_size, n, dropout_rate=0.5)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    return model, vocab, n

def predict_next_words(model, vocab, n, input_text, k, device):
    """
    Predicts the top K most probable next words given an input text.

    Args:
    - model (torch.nn.Module): Loaded FFNN model.
    - vocab (dict): Vocabulary mapping words to indices.
    - n (int): N-gram size.
    - input_text (str): Input context for word prediction.
    - k (int): Number of top probable words to return.
    - device (torch.device): Device for model inference.

    Returns:
    - List of tuples (word, probability) representing top-k predictions.
    """
    # Tokenize input text
    words = input_text.strip().split()

    if len(words) < n - 1:
        print(f"[ERROR] Input must have at least {n-1} words.")
        return []

    # Extract last (n-1) words for context
    context_words = words[-(n-1):]

    # Convert words to indices
    context_indices = [vocab.get(word, vocab["<UNK>"]) for word in context_words]
    context_tensor = torch.tensor(context_indices, dtype=torch.long, device=device).unsqueeze(0)

    # Get model predictions
    with torch.no_grad():
        output = model(context_tensor)  # Shape: (1, vocab_size)
        probabilities = torch.exp(output).squeeze(0).cpu().numpy()  # Convert log-probs to normal probs

    # Get the top K words
    top_k_indices = np.argsort(probabilities)[-k:][::-1]  # Sort in descending order
    index_to_word = {idx: word for word, idx in vocab.items()}
    top_k_words = [(index_to_word[idx], probabilities[idx]) for idx in top_k_indices]

    return top_k_words

if __name__ == "__main__":
   

    lm_type = "-f"  # FFNN model
    corpus_path = "/kaggle/input/corpus/Ulysses.txt"  # Path to corpus
    k = 5  # Number of top predicted words


    if lm_type != "-f":
        print("[ERROR] Currently, only FFNN models are supported.")
        sys.exit(1)

    # Find model file corresponding to the corpus
    model_path = f"trained_model_5_{os.path.basename(corpus_path)}.pt"
    if not os.path.exists(model_path):
        print(f"[ERROR] Model file {model_path} not found.")
        sys.exit(1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load the trained model
    model, vocab, n = load_model(model_path, device)

    # User input loop
    while True:
        input_text = input("\nEnter input sentence (or type 'exit' to quit): ").strip()
        if input_text.lower() == "exit":
            break

        top_k_words = predict_next_words(model, vocab, n, input_text, k, device)
        
        if top_k_words:
            print("\nTop predictions:")
            for word, prob in top_k_words:
                print(f"  {word}: {prob:.4f}")


  checkpoint = torch.load(model_path, map_location=device)



Enter input sentence (or type 'exit' to quit):  He is a very good 



Top predictions:
  job: 0.0431
  old: 0.0388
  day: 0.0307
  idea: 0.0277
  for: 0.0270



Enter input sentence (or type 'exit' to quit):  I am going to



Top predictions:
  be: 0.0376
  throw: 0.0126
  see: 0.0084
  make: 0.0079
  catch: 0.0064



Enter input sentence (or type 'exit' to quit):  he is a nice



Top predictions:
  man: 0.0371
  fellow: 0.0202
  young: 0.0139
  old: 0.0117
  little: 0.0085



Enter input sentence (or type 'exit' to quit):  She is a nice



Top predictions:
  man: 0.0529
  fellow: 0.0200
  song: 0.0150
  young: 0.0131
  day: 0.0093



Enter input sentence (or type 'exit' to quit):  It is time to



Top predictions:
  be: 0.0502
  the: 0.0334
  hell: 0.0244
  get: 0.0243
  see: 0.0111



Enter input sentence (or type 'exit' to quit):  I am going to see



Top predictions:
  the: 0.2257
  them: 0.0555
  you: 0.0410
  her: 0.0355
  me: 0.0343



Enter input sentence (or type 'exit' to quit):  exit


In [8]:
import os
import gc
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import autocast, GradScaler

# Optimize GPU usage and reduce fragmentation
torch.backends.cudnn.benchmark = True  
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ---------------------------
# Data Preparation
# ---------------------------
MAX_SEQ_LENGTH = 128  # Maximum tokens per sentence

def tokenize_corpus(corpus_path):
    """
    Reads the corpus, tokenizes it, and splits long sentences.
    If a sentence is longer than MAX_SEQ_LENGTH, it is split into multiple segments.
    Each segment gets a start token "<s>" and an end token "</s>".
    """
    with open(corpus_path, "r", encoding="utf-8") as file:
        corpus = file.read()
    
    # Use your Tokenizer here; if not available, you can use a simple split.
    try:
        tokenizer = Tokenizer(corpus)  # Assumed to be defined elsewhere
        tokenized_sentences = tokenizer.tokenize()
    except Exception:
        tokenized_sentences = [line.split() for line in corpus.splitlines() if line.strip()]
    
    final_tokenized = []
    for sentence in tokenized_sentences:
        if len(sentence) > MAX_SEQ_LENGTH:
            for i in range(0, len(sentence), MAX_SEQ_LENGTH):
                sub_sentence = sentence[i:i + MAX_SEQ_LENGTH]
                final_tokenized.append(["<s>"] + sub_sentence + ["</s>"])
        else:
            final_tokenized.append(["<s>"] + sentence + ["</s>"])
    
    # Split into train (80%), validation (10%), test (10%)
    train_sentences, temp_sentences = train_test_split(final_tokenized, test_size=0.2, random_state=42)
    val_sentences, test_sentences = train_test_split(temp_sentences, test_size=0.5, random_state=42)
    
    # ---------------------------
    # Build Vocabulary with explicit <PAD> token at index 0.
    # ---------------------------
    vocab_set = set(word for sentence in train_sentences for word in sentence)
    vocab = {"<PAD>": 0}  # Reserve index 0 for padding.
    for word in vocab_set:
        if word != "<PAD>":
            vocab[word] = len(vocab)
    # Add <UNK> token if not already in vocab
    if "<UNK>" not in vocab:
        vocab["<UNK>"] = len(vocab)
    
    return train_sentences, val_sentences, test_sentences, vocab

# ---------------------------
# Dataset Definition
# ---------------------------
class SentenceDataset(Dataset):
    """
    Dataset that returns entire sentences as sequences of token IDs.
    For each sentence, input is sentence[:-1] and target is sentence[1:].
    """
    def __init__(self, tokenized_corpus, vocab):
        self.vocab = vocab
        self.sentences = tokenized_corpus  # Each sentence is a list of tokens
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        input_tokens = [self.vocab.get(word, self.vocab["<UNK>"]) for word in sentence[:-1]]
        target_tokens = [self.vocab.get(word, self.vocab["<UNK>"]) for word in sentence[1:]]
        return (torch.tensor(input_tokens, dtype=torch.long),
                torch.tensor(target_tokens, dtype=torch.long),
                " ".join(sentence))

# ---------------------------
# Collate Function for Padding
# ---------------------------
def collate_fn(batch):
    """
    Pads sequences in a batch to the same length.
    Each item in the batch is (input_tensor, target_tensor, sentence_string).
    """
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    sentences = [item[2] for item in batch]
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, sentences

# ---------------------------
# Vanilla RNN Language Model Definition
# ---------------------------
class RNNLanguageModel(nn.Module):
    """ RNN-based language model for full-sentence prediction. """
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout_rate=0.5):
        super(RNNLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.relu = nn.Tanh()  # Using GELU; you can experiment with other activations.
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x, hidden=None):
        embedded = self.embedding(x)  # (batch_size, seq_length, embed_size)
        output, hidden = self.rnn(embedded, hidden)  # (batch_size, seq_length, hidden_size)
        output = self.layer_norm(output)
        output = self.dropout(output)
        logits = self.fc(output)  # (batch_size, seq_length, vocab_size)
        batch_size, seq_len, _ = logits.size()
        logits = logits.contiguous().view(batch_size * seq_len, -1)
        log_probs = self.log_softmax(logits)
        return log_probs, hidden

# ---------------------------
# Training Function (Using AMP)
# ---------------------------
def train(model, train_loader, optimizer, criterion, device, epochs):
    """ Trains the RNN/LSTM model using mixed precision (AMP) and ignores padding tokens. """
    model.train()
    scaler = GradScaler()

    for epoch in range(epochs):
        epoch_loss, total_samples = 0, 0
    
        for inputs, targets, _ in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
    
            with autocast():
                log_probs, _ = model(inputs)
                flat_targets = targets.contiguous().view(-1)
                # With reduction="none", loss is a vector of losses per token
                loss = criterion(log_probs, flat_targets)
                mask = flat_targets != 0  # Assuming 0 is the <PAD> index
                masked_loss = loss[mask]   # Now we can index since loss is not a scalar
                if masked_loss.numel() > 0:
                    loss = masked_loss.mean()
                else:
                    loss = torch.tensor(0.0, device=device)
    
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
    
            epoch_loss += loss.item() * mask.sum().item()
            total_samples += mask.sum().item()
    
        avg_loss = epoch_loss / total_samples if total_samples > 0 else float('inf')
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")
    
    return model



# ---------------------------
# Perplexity Calculation Functions
# ---------------------------
def calculate_nll(model, loader, device) -> tuple[list[float], list[str]]:
    """
    Compute the negative log-likelihood (NLL) loss for each sentence.
    For each sentence, flatten predictions and targets and compute mean loss.
    """
    model.eval()
    sentence_losses = []
    sentences_out = []
    
    with torch.no_grad():
        for inputs, targets, sentences in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            log_probs, _ = model(inputs)  # (batch_size * seq_length, vocab_size)
            flat_targets = targets.contiguous().view(-1)
            loss = nn.NLLLoss(reduction="none")(log_probs, flat_targets)
            batch_size, seq_len = inputs.size(0), inputs.size(1)
            loss = loss.view(batch_size, seq_len)
            for i in range(batch_size):
                # Create mask for non-padding tokens (padding is 0, reserved for <PAD>)
                mask = targets[i] != 0
                if mask.sum().item() > 0:
                    mean_loss = loss[i][mask].mean().item()
                else:
                    mean_loss = float('inf')
                sentence_losses.append(mean_loss)
            sentences_out.extend(sentences)
    return sentence_losses, sentences_out

def set_perplexity(model, loader, device, file_name):
    """
    Computes and saves the perplexity of sentences in the given DataLoader.
    Duplicate sentences are merged by averaging their losses.
    """
    nll_losses, sentences = calculate_nll(model, loader, device)
    from collections import defaultdict
    sentence_loss_dict = defaultdict(list)
    for sentence, loss in zip(sentences, nll_losses):
        sentence_loss_dict[sentence].append(loss)
    with open(file_name, "w") as f:
        f.write(f"{'Sentence':<80}{'Perplexity'}\n")
        f.write("=" * 100 + "\n")
        for sentence, losses in sentence_loss_dict.items():
            avg_loss = np.mean(losses)
            ppl = np.exp(avg_loss)
            f.write(f"{sentence:<80}{ppl:.5f}\n")
        overall_ppl = np.exp(np.mean([np.mean(losses) for losses in sentence_loss_dict.values()]))
        f.write("=" * 100 + "\n")
        f.write(f"{'Average Perplexity':<80}{overall_ppl:.5f}\n")
        print(f"Average perplexity of {file_name}: {overall_ppl:.5f}")

# ---------------------------
# Run Experiment Function
# ---------------------------
def run_experiment(corpus_path, model_type="RNN"):
    """
    Runs the full training and evaluation pipeline.
    """
    # Tokenize and prepare data
    train_sentences, val_sentences, test_sentences, vocab = tokenize_corpus(corpus_path)
    
    train_dataset = SentenceDataset(train_sentences, vocab)
    val_dataset = SentenceDataset(val_sentences, vocab)
    test_dataset = SentenceDataset(test_sentences, vocab)
    
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=False, collate_fn=collate_fn, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, drop_last=False, collate_fn=collate_fn, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, drop_last=False, collate_fn=collate_fn, pin_memory=True)
    
    vocab_size = len(vocab)
    embed_size = 300
    hidden_size = 512
    
    # Create RNN Model
    model = RNNLanguageModel(vocab_size, embed_size, hidden_size, num_layers=1, dropout_rate=0.3)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.NLLLoss(ignore_index=0,reduction="none")  # 0 is the index of <PAD> token in the vocab
    
    print(f"\nTraining {model_type} Model on {corpus_path}...\n")
    gc.collect()
    torch.cuda.empty_cache()
    model = train(model, train_loader, optimizer, criterion, device, epochs=5)
    
    model_save_path = f"trained_model_{os.path.basename(corpus_path)}_{model_type}.pt"
    torch.save({'model_state_dict': model.state_dict(), 'vocab': vocab}, model_save_path)
    print(f"Model saved to {model_save_path}")
    
    train_file = f"{os.path.basename(corpus_path)}_train_perplexity_{model_type}.txt"
    val_file = f"{os.path.basename(corpus_path)}_val_perplexity_{model_type}.txt"
    test_file = f"{os.path.basename(corpus_path)}_test_perplexity_{model_type}.txt"
    
    print("\nCalculating and saving Train Perplexity...")
    set_perplexity(model, train_loader, device, train_file)
    print("\nCalculating and saving Validation Perplexity...")
    set_perplexity(model, val_loader, device, val_file)
    print("\nCalculating and saving Test Perplexity...")
    set_perplexity(model, test_loader, device, test_file)
    print(f"Perplexity scores saved in: {train_file}, {val_file}, {test_file}")
    
    return model_save_path

# ---------------------------
# Main
# ---------------------------
if __name__ == "__main__":
    pride_corpus = "/kaggle/input/corpus/Pride_and_Prejudice.txt"
    ulysses_corpus = "/kaggle/input/corpus/Ulysses.txt"
    
    saved_models = []
    for corpus in [pride_corpus, ulysses_corpus]:
        saved_models.append(run_experiment(corpus, model_type="RNN"))
    
    print("\nSaved models:", saved_models)



Training RNN Model on /kaggle/input/corpus/Pride_and_Prejudice.txt...



  scaler = GradScaler()
  with autocast():


Epoch 1 - Average Loss: 5.8174
Epoch 2 - Average Loss: 5.0246
Epoch 3 - Average Loss: 4.5974
Epoch 4 - Average Loss: 4.2508
Epoch 5 - Average Loss: 3.9306
Model saved to trained_model_Pride_and_Prejudice.txt_RNN.pt

Calculating and saving Train Perplexity...
Average perplexity of Pride_and_Prejudice.txt_train_perplexity_RNN.txt: 23.73036

Calculating and saving Validation Perplexity...
Average perplexity of Pride_and_Prejudice.txt_val_perplexity_RNN.txt: 183.07515

Calculating and saving Test Perplexity...
Average perplexity of Pride_and_Prejudice.txt_test_perplexity_RNN.txt: 188.38155
Perplexity scores saved in: Pride_and_Prejudice.txt_train_perplexity_RNN.txt, Pride_and_Prejudice.txt_val_perplexity_RNN.txt, Pride_and_Prejudice.txt_test_perplexity_RNN.txt

Training RNN Model on /kaggle/input/corpus/Ulysses.txt...

Epoch 1 - Average Loss: 7.0944
Epoch 2 - Average Loss: 6.4351
Epoch 3 - Average Loss: 6.0795
Epoch 4 - Average Loss: 5.7297
Epoch 5 - Average Loss: 5.3746
Model saved to tra

In [15]:
import torch
import torch.nn as nn

def load_model(model_path, device):
    """
    Loads a pre-trained RNN language model from a checkpoint.
    
    Args:
      model_path (str): Path to the trained model file.
      device (torch.device): Device to load the model on (CPU/GPU).
      
    Returns:
      model (torch.nn.Module): Loaded RNN model.
      vocab (dict): Vocabulary used in training.
    """
    checkpoint = torch.load(model_path, map_location=device)
    
    vocab = checkpoint['vocab']
    vocab_size = len(vocab)
    embed_size = 300    # Ensure this matches the training config
    hidden_size = 512   # Ensure this matches the training config
    num_layers = 1      # As used during training
    dropout_rate = 0.3  # As used during training

    # Initialize the model
    model = RNNLanguageModel(vocab_size, embed_size, hidden_size, num_layers, dropout_rate)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()  # Set to evaluation mode

    return model, vocab


def predict_next_words(model, vocab, input_sentence, k, device):
    """
    Given an input sentence, returns the top k predicted next words along with their probabilities.
    
    Args:
      model (torch.nn.Module): The trained RNN language model.
      vocab (dict): Vocabulary mapping words to indices.
      input_sentence (str): The input sentence (context) as a string.
      k (int): Number of top predicted words to return.
      device (torch.device): The device on which to run the inference.
      
    Returns:
      List[Tuple[str, float]]: A list of tuples where each tuple is (word, probability).
    """
    # Tokenize the input sentence (using simple whitespace splitting)
    words = input_sentence.strip().split()
    if not words:
        print("[ERROR] Input sentence is empty.")
        return []
    
    # Convert words to indices (using <UNK> for out-of-vocabulary words)
    indices = [vocab.get(word, vocab["<UNK>"]) for word in words]
    
    # Create input tensor with shape (1, seq_length)
    input_tensor = torch.tensor(indices, dtype=torch.long, device=device).unsqueeze(0)
    
    with torch.no_grad():
        # Get model output; our model returns flattened log_probs, so we reshape it
        log_probs, _ = model(input_tensor)
    
    # Determine the sequence length from the input tensor
    seq_length = input_tensor.size(1)
    vocab_size = len(vocab)
    
    # Reshape log_probs from (batch_size*seq_length, vocab_size) to (batch_size, seq_length, vocab_size)
    log_probs = log_probs.view(1, seq_length, vocab_size)
    
    # We want to use the prediction for the last time step (i.e. next word prediction)
    last_log_probs = log_probs[:, -1, :]  # Shape: (1, vocab_size)
    probabilities = torch.exp(last_log_probs).squeeze(0)  # Shape: (vocab_size,)
    
    # Get the top k predictions, sorted in descending order
    topk = torch.topk(probabilities, k, largest=True, sorted=True)
    topk_indices = topk.indices.cpu().numpy()
    topk_probs = topk.values.cpu().numpy()
    
    # Create an inverse mapping from index to word
    index_to_word = {idx: word for word, idx in vocab.items()}
    
    predictions = [(index_to_word.get(int(idx), "<UNK>"), float(prob)) for idx, prob in zip(topk_indices, topk_probs)]
    
    return predictions

# Example usage:
if __name__ == "__main__":
    # Assume model and vocab are already loaded, for example:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, vocab = load_model("trained_model_Pride_and_Prejudice.txt_RNN.pt", device)  # Your loading function
    # For demonstration, we'll assume they are defined.
    
    input_sentence = input("Enter an input sentence: ").strip()
    k = 5
    print("RNN Next-Word Generator")
    while True:
        input_text = input("\nEnter input sentence (or type 'exit' to quit): ").strip()
        if input_text.lower() == "exit":
            break
        preds = predict_next_words(model, vocab, input_text, k, device)
        print("\nTop predictions:")
        for word, prob in preds:
            print(f"  {word}: {prob:.4f}")


  checkpoint = torch.load(model_path, map_location=device)


Enter an input sentence:  He is a 


RNN Next-Word Generator



Enter input sentence (or type 'exit' to quit):  I am



Top predictions:
  entitled: 0.0356
  convinced: 0.0297
  determined: 0.0259
  glad: 0.0236
  gone: 0.0221



Enter input sentence (or type 'exit' to quit):  He is



Top predictions:
  exquisite: 0.1613
  right: 0.0183
  blessed: 0.0144
  recovered: 0.0142
  mercenary: 0.0132



Enter input sentence (or type 'exit' to quit):  He is a



Top predictions:
  joke: 0.1265
  son: 0.0870
  wide: 0.0624
  most: 0.0458
  stubbornness: 0.0275



Enter input sentence (or type 'exit' to quit):  He is the most



Top predictions:
  amusing: 0.1953
  unforgiving: 0.1466
  flattering: 0.0367
  remarkable: 0.0210
  elevated: 0.0202



Enter input sentence (or type 'exit' to quit):  She is a very good



Top predictions:
  fun: 0.1202
  girl: 0.1104
  creature: 0.0336
  distance: 0.0327
  sort: 0.0287



Enter input sentence (or type 'exit' to quit):  He is a very good



Top predictions:
  fun: 0.1202
  girl: 0.1104
  creature: 0.0336
  distance: 0.0327
  sort: 0.0287



Enter input sentence (or type 'exit' to quit):  We are a nice



Top predictions:
  long: 0.1220
  which: 0.0475
  aspect: 0.0365
  and: 0.0361
  of: 0.0334



Enter input sentence (or type 'exit' to quit):  I am going to



Top predictions:
  be: 0.0867
  make: 0.0623
  introduce: 0.0601
  sit: 0.0434
  hope: 0.0309



Enter input sentence (or type 'exit' to quit):  Why are you



Top predictions:
  must: 0.1006
  have: 0.0348
  pleased: 0.0314
  know: 0.0288
  are: 0.0266



Enter input sentence (or type 'exit' to quit):  He is the most



Top predictions:
  amusing: 0.1953
  unforgiving: 0.1466
  flattering: 0.0367
  remarkable: 0.0210
  elevated: 0.0202



Enter input sentence (or type 'exit' to quit):  he is the



Top predictions:
  most: 0.2013
  handsomest: 0.0938
  very: 0.0296
  same: 0.0267
  son: 0.0223



Enter input sentence (or type 'exit' to quit):  she is the



Top predictions:
  most: 0.0434
  youngest: 0.0404
  greatest: 0.0248
  handsomest: 0.0239
  same: 0.0220



Enter input sentence (or type 'exit' to quit):  he is the handsomest young



Top predictions:
  man: 0.6015
  woman: 0.1477
  ladies: 0.0795
  men: 0.0451
  lady: 0.0374



Enter input sentence (or type 'exit' to quit):  he is the handsomest young man that ever was



Top predictions:
  not: 0.1076
  a: 0.0729
  the: 0.0557
  over: 0.0487
  so: 0.0383



Enter input sentence (or type 'exit' to quit):  exit


In [16]:
import os
import gc
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.cuda.amp import autocast, GradScaler

# Uncomment or define your Tokenizer class as needed.
# from tokenizer import Tokenizer

# For better GPU memory management
torch.backends.cudnn.benchmark = True
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ---------------------------
# Data Preparation
# ---------------------------
MAX_SEQ_LENGTH = 128  # Maximum tokens per sentence

def tokenize_corpus(corpus_path):
    """
    Reads the corpus, tokenizes it, and ensures that no sentence exceeds MAX_SEQ_LENGTH.
    If a sentence is longer, it is split into multiple sentences of length MAX_SEQ_LENGTH.
    Start ("<s>") and end ("</s>") tokens are added to each segment.
    """
    with open(corpus_path, "r", encoding="utf-8") as file:
        corpus = file.read()
    
    tokenizer = Tokenizer(corpus)  # Assuming Tokenizer is defined elsewhere
    tokenized_sentences = tokenizer.tokenize()

    final_tokenized = []
    
    for sentence in tokenized_sentences:
        if len(sentence) > MAX_SEQ_LENGTH:
            # Split into multiple sentences of length MAX_SEQ_LENGTH
            for i in range(0, len(sentence), MAX_SEQ_LENGTH):
                sub_sentence = sentence[i:i + MAX_SEQ_LENGTH]  # Take 128 tokens at a time
                final_tokenized.append(["<s>"] + sub_sentence + ["</s>"])
        else:
            # Keep sentence as is if within length limit
            final_tokenized.append(["<s>"] + sentence + ["</s>"])
    
    # Split dataset: 80% Train, 10% Validation, 10% Test
    train_sentences, temp_sentences = train_test_split(final_tokenized, test_size=0.2, random_state=42)
    val_sentences, test_sentences = train_test_split(temp_sentences, test_size=0.5, random_state=42)
    
    # Build vocabulary from training sentences; add <UNK> token
    vocab = {word: idx for idx, word in enumerate(set(word for sentence in train_sentences for word in sentence))}
    vocab["<UNK>"] = len(vocab)
    
    return train_sentences, val_sentences, test_sentences, vocab

# ---------------------------
# Dataset Definition (Full Sentence)
# ---------------------------
class SentenceDataset(Dataset):
    """
    Dataset that returns entire sentences as sequences of token IDs.
    For each sentence, the input is sentence[:-1] and the target is sentence[1:].
    """
    def __init__(self, tokenized_corpus, vocab):
        self.vocab = vocab
        self.sentences = tokenized_corpus  # Each sentence is a list of tokens
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        input_tokens = [self.vocab.get(word, self.vocab["<UNK>"]) for word in sentence[:-1]]
        target_tokens = [self.vocab.get(word, self.vocab["<UNK>"]) for word in sentence[1:]]
        return (torch.tensor(input_tokens, dtype=torch.long),
                torch.tensor(target_tokens, dtype=torch.long),
                " ".join(sentence))

# ---------------------------
# Collate Function for Padding
# ---------------------------
def collate_fn(batch):
    """
    Pads sequences in a batch.
    Each item in the batch is (input_tensor, target_tensor, sentence_string).
    """
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    sentences = [item[2] for item in batch]
    
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded, sentences

# ---------------------------
# Vanilla LSTM Language Model Definition
# ---------------------------
# ---------------------------
# Vanilla LSTM Language Model Definition
# ---------------------------
class LSTMLanguageModel(nn.Module):
    """
    Vanilla LSTM Language Model.
    
    The model embeds input tokens, processes the sequence with an LSTM,
    and uses the final hidden state to predict the next word.
    
    Args:
      vocab_size (int): Size of the vocabulary.
      embed_size (int): Dimensionality of word embeddings.
      hidden_size (int): Size of the LSTM hidden state.
      num_layers (int): Number of LSTM layers.
      dropout_rate (float): Dropout rate.
    """
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, dropout_rate=0.5):
        super(LSTMLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, x, hidden=None):
        """
        Args:
          x (Tensor): Input tensor of shape (batch_size, seq_length)
          hidden (tuple): Optional initial (hidden, cell) state.
          
        Returns:
          log_probs (Tensor): Log-probabilities with shape (batch_size*seq_length, vocab_size)
          hidden (tuple): Final (hidden, cell) state.
        """
        batch_size = x.size(0)  # Get batch size dynamically

        # Initialize hidden and cell states if not provided
        if hidden is None:
            h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)
            hidden = (h0, c0)

        embedded = self.embedding(x)  # (batch_size, seq_length, embed_size)
        output, hidden = self.lstm(embedded, hidden)  # (batch_size, seq_length, hidden_size)
        output = self.layer_norm(output)
        output = self.dropout(output)
        logits = self.fc(output)  # (batch_size, seq_length, vocab_size)
        
        batch_size, seq_len, _ = logits.size()
        logits = logits.contiguous().view(batch_size * seq_len, -1)  # Flatten for loss computation
        log_probs = self.log_softmax(logits)

        return log_probs, hidden


# ---------------------------
# Training Function (Using AMP)
# ---------------------------
def train(model, train_loader, optimizer, criterion, device, epochs):
    """ Trains the RNN/LSTM model using mixed precision (AMP) and ignores padding tokens. """
    model.train()
    scaler = GradScaler()

    for epoch in range(epochs):
        epoch_loss, total_samples = 0, 0
    
        for inputs, targets, _ in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
    
            with autocast():
                log_probs, _ = model(inputs)
                flat_targets = targets.contiguous().view(-1)
                # With reduction="none", loss is a vector of losses per token
                loss = criterion(log_probs, flat_targets)
                mask = flat_targets != 0  # Assuming 0 is the <PAD> index
                masked_loss = loss[mask]   # Now we can index since loss is not a scalar
                if masked_loss.numel() > 0:
                    loss = masked_loss.mean()
                else:
                    loss = torch.tensor(0.0, device=device)
    
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
    
            epoch_loss += loss.item() * mask.sum().item()
            total_samples += mask.sum().item()
    
        avg_loss = epoch_loss / total_samples if total_samples > 0 else float('inf')
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")
    
    return model

# ---------------------------
# Perplexity Calculation Functions
# ---------------------------
def calculate_nll(model, loader, device):
    """
    Compute the negative log-likelihood (NLL) loss for each sentence.
    For each sentence, flatten predictions and targets and compute mean loss.
    """
    model.eval()
    sentence_losses = []
    sentences_out = []
    
    with torch.no_grad():
        for inputs, targets, sentences in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            log_probs, _ = model(inputs)  # (batch_size * seq_len, vocab_size)
            flat_targets = targets.contiguous().view(-1)
            loss = nn.NLLLoss(reduction="none")(log_probs, flat_targets)
            # Reshape loss to (batch_size, seq_len)
            batch_size, seq_len = inputs.size(0), inputs.size(1)
            loss = loss.view(batch_size, seq_len)
            for i in range(batch_size):
                # Create mask for non-padding tokens (assuming padding value 0)
                mask = targets[i] != 0
                if mask.sum().item() > 0:
                    mean_loss = loss[i][mask].mean().item()
                else:
                    mean_loss = float('inf')
                sentence_losses.append(mean_loss)
            sentences_out.extend(sentences)
    return sentence_losses, sentences_out

def set_perplexity(model, loader, device, file_name):
    """
    Computes and saves sentence-level perplexity.
    Duplicate sentences are merged by averaging their losses.
    """
    nll_losses, sentences = calculate_nll(model, loader, device)
    # Merge duplicate sentences
    loss_dict = defaultdict(list)
    for sent, loss in zip(sentences, nll_losses):
        loss_dict[sent].append(loss)
    
    with open(file_name, "w") as f:
        f.write(f"{'Sentence':<80}{'Perplexity'}\n")
        f.write("=" * 100 + "\n")
        for sent, losses in loss_dict.items():
            avg_loss = np.mean(losses)
            ppl = np.exp(avg_loss)
            f.write(f"{sent:<80}{ppl:.5f}\n")
        overall_ppl = np.exp(np.mean([np.mean(losses) for losses in loss_dict.values()]))
        f.write("=" * 100 + "\n")
        f.write(f"{'Average Perplexity':<80}{overall_ppl:.5f}\n")
        print(f"Average perplexity of {file_name}: {overall_ppl:.5f}")

# ---------------------------
# Run Experiment (Training and Evaluation)
# ---------------------------
def run_experiment(corpus_path, model_type="LSTM"):
    # Tokenize and prepare data
    train_sentences, val_sentences, test_sentences, vocab = tokenize_corpus(corpus_path)
    train_dataset = SentenceDataset(train_sentences, vocab)
    val_dataset = SentenceDataset(val_sentences, vocab)
    test_dataset = SentenceDataset(test_sentences, vocab)
    
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=False, collate_fn=collate_fn, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, drop_last=False, collate_fn=collate_fn, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, drop_last=False, collate_fn=collate_fn, pin_memory=True)
    
    vocab_size = len(vocab)
    embed_size = 300
    hidden_size = 256
    
    # Create LSTM model instance
    model = LSTMLanguageModel(vocab_size, embed_size, hidden_size, num_layers=1, dropout_rate=0.3)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.NLLLoss(ignore_index=0,reduction="none")  # 0 is the index of <PAD> token in the vocab
    
    print(f"\nTraining {model_type} Model on {corpus_path}...\n")
    gc.collect()
    torch.cuda.empty_cache()
    model = train(model, train_loader, optimizer, criterion, device, epochs=5)
    
    model_save_path = f"trained_model_{os.path.basename(corpus_path)}_{model_type}.pt"
    torch.save({'model_state_dict': model.state_dict(), 'vocab': vocab}, model_save_path)
    print(f"Model saved to {model_save_path}")
    
    # Calculate perplexity for each set
    train_file = f"{os.path.basename(corpus_path)}_train_perplexity_{model_type}.txt"
    val_file = f"{os.path.basename(corpus_path)}_val_perplexity_{model_type}.txt"
    test_file = f"{os.path.basename(corpus_path)}_test_perplexity_{model_type}.txt"
    
    print("\nCalculating and saving Train Perplexity...")
    set_perplexity(model, train_loader, device, train_file)
    print("\nCalculating and saving Validation Perplexity...")
    set_perplexity(model, val_loader, device, val_file)
    print("\nCalculating and saving Test Perplexity...")
    set_perplexity(model, test_loader, device, test_file)
    print(f"Perplexity scores saved in: {train_file}, {val_file}, {test_file}")
    
    return model_save_path



# ---------------------------
# Main
# ---------------------------
if __name__ == "__main__":
    pride_corpus = "/kaggle/input/corpus/Pride_and_Prejudice.txt"
    ulysses_corpus = "/kaggle/input/corpus/Ulysses.txt"
    
    saved_models = []
    for corpus in [pride_corpus,ulysses_corpus]:
        saved_models.append(run_experiment(corpus, model_type="LSTM"))

    print("\nSaved models:", saved_models)


Training LSTM Model on /kaggle/input/corpus/Pride_and_Prejudice.txt...



  scaler = GradScaler()
  with autocast():


Epoch 1 - Average Loss: 5.7155
Epoch 2 - Average Loss: 4.9206
Epoch 3 - Average Loss: 4.4996
Epoch 4 - Average Loss: 4.1361
Epoch 5 - Average Loss: 3.7850
Model saved to trained_model_Pride_and_Prejudice.txt_LSTM.pt

Calculating and saving Train Perplexity...
Average perplexity of Pride_and_Prejudice.txt_train_perplexity_LSTM.txt: 19.25903

Calculating and saving Validation Perplexity...
Average perplexity of Pride_and_Prejudice.txt_val_perplexity_LSTM.txt: 155.25994

Calculating and saving Test Perplexity...
Average perplexity of Pride_and_Prejudice.txt_test_perplexity_LSTM.txt: 161.31459
Perplexity scores saved in: Pride_and_Prejudice.txt_train_perplexity_LSTM.txt, Pride_and_Prejudice.txt_val_perplexity_LSTM.txt, Pride_and_Prejudice.txt_test_perplexity_LSTM.txt

Training LSTM Model on /kaggle/input/corpus/Ulysses.txt...

Epoch 1 - Average Loss: 6.9025
Epoch 2 - Average Loss: 6.2322
Epoch 3 - Average Loss: 5.7683
Epoch 4 - Average Loss: 5.3127
Epoch 5 - Average Loss: 4.8855
Model save

In [17]:
import torch
import torch.nn as nn
import sys
import os
import numpy as np


# ---------------------------
# Function to Load the Pre-trained LSTM Model
# ---------------------------
def load_model(model_path, device):
    """
    Loads a pre-trained LSTM language model from a checkpoint.
    
    Args:
      model_path (str): Path to the trained model file.
      device (torch.device): Device to load the model on.
      
    Returns:
      model (torch.nn.Module): Loaded LSTM model.
      vocab (dict): Vocabulary used in training.
    """
    checkpoint = torch.load(model_path, map_location=device)
    vocab = checkpoint['vocab']
    vocab_size = len(vocab)
    embed_size = 300   # Must match training configuration
    hidden_size = 256   # Must match training configuration
    num_layers = 1      # As used during training
    dropout_rate = 0.3  # As used during training

    model = LSTMLanguageModel(vocab_size, embed_size, hidden_size, num_layers, dropout_rate)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()
    return model, vocab

# ---------------------------
# Next-Word Prediction Function for LSTM
# ---------------------------
def predict_next_words(model, vocab, input_text, k, device):
    """
    Given an input sentence, predicts the top k most probable next words.
    
    The function:
      1. Tokenizes the input sentence (whitespace splitting).
      2. Converts words to indices.
      3. Creates an input tensor of shape (1, seq_length).
      4. Passes the tensor through the model.
      5. Reshapes the model output to (1, seq_length, vocab_size).
      6. Extracts the probabilities for the last time step.
      7. Returns the top k words sorted by probability.
    
    Args:
      model (torch.nn.Module): The trained LSTM model.
      vocab (dict): Vocabulary mapping words to indices.
      input_text (str): The input sentence.
      k (int): Number of top predictions to return.
      device (torch.device): Device for inference.
      
    Returns:
      List[Tuple[str, float]]: List of tuples containing (word, probability).
    """
    # Tokenize input sentence
    words = input_text.strip().split()
    if not words:
        print("[ERROR] Input sentence cannot be empty.")
        return []
    
    # Convert words to indices; use <UNK> if not found
    context_indices = [vocab.get(word, vocab["<UNK>"]) for word in words]
    # Create input tensor (batch_size=1)
    context_tensor = torch.tensor(context_indices, dtype=torch.long, device=device).unsqueeze(0)
    print(context_indices)
    with torch.no_grad():
        log_probs, _ = model(context_tensor)
    
    # Our model returns log_probs in flattened form: shape = (batch_size * seq_length, vocab_size)
    # We need to reshape to get predictions for the final time step.
    batch_size = context_tensor.size(0)
    seq_len = context_tensor.size(1)
    vocab_size = len(vocab)
    log_probs = log_probs.view(batch_size, seq_len, vocab_size)
    
    # Use the final time step (last token in the input sequence) for prediction.
    last_log_probs = log_probs[:, -1, :]  # Shape: (1, vocab_size)
    probabilities = torch.exp(last_log_probs).squeeze(0)  # Shape: (vocab_size,)
    
    # Get top k indices sorted in descending order.
    topk = torch.topk(probabilities, k, largest=True, sorted=True)
    topk_indices = topk.indices.cpu().numpy().flatten()
    topk_probs = topk.values.cpu().numpy().flatten()
    
    # Create an inverse mapping: index -> word
    index_to_word = {idx: word for word, idx in vocab.items()}
    
    predictions = [(index_to_word.get(int(idx), "<UNK>"), float(prob)) for idx, prob in zip(topk_indices, topk_probs)]
    return predictions

# ---------------------------
# Main Script for Generation
# ---------------------------
if __name__ == "__main__":
    # Hardcoded parameters:
    # Using LSTM model, corpus path is "Pride_and_Prejudice.txt", and k = 5.
    model_path = "trained_model_Pride_and_Prejudice.txt_LSTM.pt"  # Ensure this checkpoint exists.
    corpus_path = "/kaggle/input/corpus/Pride_and_Prejudice.txt"  # Hardcoded corpus path.
    k = 5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load the pre-trained LSTM model and vocabulary.
    model, vocab = load_model(model_path, device)
    
    print("LSTM Next-Word Generator")
    while True:
        input_text = input("\nEnter input sentence (or type 'exit' to quit): ").strip()
        if input_text.lower() == "exit":
            break
        
        predictions = predict_next_words(model, vocab, input_text, k, device)
        
        if predictions:
            print("\nTop predictions:")
            for word, prob in predictions:
                print(f"  {word}: {prob:.4f}")


  checkpoint = torch.load(model_path, map_location=device)


LSTM Next-Word Generator



Enter input sentence (or type 'exit' to quit):  He went to


[5959, 2825, 4799]

Top predictions:
  the: 0.0921
  bed: 0.0904
  see: 0.0435
  be: 0.0398
  her: 0.0391



Enter input sentence (or type 'exit' to quit):  He went to the


[5959, 2825, 4799, 4764]

Top predictions:
  library: 0.1075
  rest: 0.0621
  house: 0.0432
  girls: 0.0366
  family: 0.0216



Enter input sentence (or type 'exit' to quit):  She is a very good


[5959, 1244, 4965, 4528, 1210]

Top predictions:
  notion: 0.3046
  match: 0.1051
  luck: 0.0692
  man: 0.0166
  sort: 0.0155



Enter input sentence (or type 'exit' to quit):  She is a good


[5959, 1244, 4965, 1210]

Top predictions:
  luck: 0.1773
  deal: 0.1761
  match: 0.0413
  cook: 0.0368
  journey: 0.0327



Enter input sentence (or type 'exit' to quit):  He is a good


[5959, 1244, 4965, 1210]

Top predictions:
  luck: 0.1773
  deal: 0.1761
  match: 0.0413
  cook: 0.0368
  journey: 0.0327



Enter input sentence (or type 'exit' to quit):  He is going to


[5959, 1244, 1176, 4799]

Top predictions:
  be: 0.3092
  the: 0.0398
  gretna: 0.0278
  a: 0.0265
  find: 0.0195



Enter input sentence (or type 'exit' to quit):  He is the


[5959, 1244, 4764]

Top predictions:
  most: 0.1124
  greatest: 0.0956
  son: 0.0709
  case: 0.0547
  matter: 0.0216



Enter input sentence (or type 'exit' to quit):  She is the


[5959, 1244, 4764]

Top predictions:
  most: 0.1124
  greatest: 0.0956
  son: 0.0709
  case: 0.0547
  matter: 0.0216



Enter input sentence (or type 'exit' to quit):  He is going to be a


[5959, 1244, 1176, 4799, 2019, 4965]

Top predictions:
  most: 0.1072
  little: 0.0591
  great: 0.0514
  man: 0.0497
  very: 0.0415



Enter input sentence (or type 'exit' to quit):  He is going to be a very


[5959, 1244, 1176, 4799, 2019, 4965, 4528]

Top predictions:
  agreeable: 0.1968
  different: 0.1271
  great: 0.0355
  handsome: 0.0259
  sensible: 0.0256



Enter input sentence (or type 'exit' to quit):  He is going to be a big


[5959, 1244, 1176, 4799, 2019, 4965, 5959]

Top predictions:
  </s>: 0.1149
  and: 0.1047
  woman: 0.0713
  man: 0.0577
  agreeable: 0.0339



Enter input sentence (or type 'exit' to quit):  I am going to


[5959, 1997, 1176, 4799]

Top predictions:
  be: 0.2162
  the: 0.1241
  gretna: 0.0380
  a: 0.0352
  have: 0.0315



Enter input sentence (or type 'exit' to quit):  I am feeling


[5959, 1997, 2323]

Top predictions:
  that: 0.1606
  </s>: 0.1460
  a: 0.0753
  the: 0.0521
  in: 0.0495



Enter input sentence (or type 'exit' to quit):  I feel sad


[5959, 3035, 2748]

Top predictions:
  business: 0.1187
  omen: 0.0993
  premises: 0.0147
  setting: 0.0058
  depended: 0.0058



Enter input sentence (or type 'exit' to quit):  I a feeling very


[5959, 4965, 2323, 4528]

Top predictions:
  large: 0.0392
  different: 0.0385
  pleasing: 0.0299
  great: 0.0288
  woman: 0.0284



Enter input sentence (or type 'exit' to quit):  It is difficult for me to


[5959, 1244, 4949, 433, 5146, 4799]

Top predictions:
  follow: 0.0456
  do: 0.0411
  accept: 0.0345
  pass: 0.0300
  pay: 0.0263



Enter input sentence (or type 'exit' to quit):  he is the


[472, 1244, 4764]

Top predictions:
  most: 0.1286
  case: 0.1101
  greatest: 0.0921
  son: 0.0735
  matter: 0.0299



Enter input sentence (or type 'exit' to quit):  he is the son


[472, 1244, 4764, 1615]

Top predictions:
  of: 0.7905
  </s>: 0.0915
  in: 0.0145
  he: 0.0120
  and: 0.0102



Enter input sentence (or type 'exit' to quit):  he is the handsomest young


[472, 1244, 4764, 2491, 3700]

Top predictions:
  man: 0.8379
  lady: 0.0944
  woman: 0.0121
  men: 0.0108
  ladies: 0.0077



Enter input sentence (or type 'exit' to quit):  Her carriage remained at the


[5959, 2067, 4431, 455, 4764]

Top predictions:
  door: 0.2386
  same: 0.0964
  instrument: 0.0344
  parsonage: 0.0339
  inn: 0.0255



Enter input sentence (or type 'exit' to quit):  Her carriage


[5959, 2067]

Top predictions:
  and: 0.1070
  </s>: 0.0604
  by: 0.0434
  the: 0.0423
  drove: 0.0366



Enter input sentence (or type 'exit' to quit):  You have no


[5959, 1112, 1820]

Top predictions:
  objection: 0.0638
  charms: 0.0407
  reason: 0.0382
  more: 0.0301
  compassion: 0.0269



Enter input sentence (or type 'exit' to quit):  Can you possibly


[5959, 3205, 1809]

Top predictions:
  guess: 0.0403
  and: 0.0348
  </s>: 0.0331
  get: 0.0207
  wonder: 0.0168



Enter input sentence (or type 'exit' to quit):  exit


In [9]:
!zip -r my_files.zip /kaggle/working/

updating: kaggle/working/ (stored 0%)
updating: kaggle/working/trained_model_5_Pride_and_Prejudice.txt.pt (deflated 8%)
updating: kaggle/working/3_Ulysses.txt_val_perplexity.txt (deflated 58%)
updating: kaggle/working/trained_model_5_Ulysses.txt.pt (deflated 7%)
updating: kaggle/working/3_Pride_and_Prejudice.txt_test_perplexity.txt (deflated 61%)
updating: kaggle/working/.virtual_documents/ (stored 0%)
updating: kaggle/working/trained_model_3_Pride_and_Prejudice.txt.pt (deflated 8%)
updating: kaggle/working/5_Ulysses.txt_train_perplexity.txt (deflated 61%)
updating: kaggle/working/trained_model_3_Ulysses.txt.pt (deflated 8%)
updating: kaggle/working/3_Ulysses.txt_train_perplexity.txt (deflated 58%)
updating: kaggle/working/3_Pride_and_Prejudice.txt_train_perplexity.txt (deflated 63%)
updating: kaggle/working/3_Pride_and_Prejudice.txt_val_perplexity.txt (deflated 61%)
updating: kaggle/working/5_Pride_and_Prejudice.txt_test_perplexity.txt (deflated 63%)
updating: kaggle/working/5_Ulysses

In [14]:
# List files in the working directory to confirm
import shutil

# Compress the .pt file
shutil.make_archive("/kaggle/working/model_backup", 'zip', "/kaggle/working", "trained_model_3_Pride_and_Prejudice.txt.pt")


'/kaggle/working/model_backup.zip'