In [1]:
#@title Import Required Libraries
import re
from gensim.models import KeyedVectors
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset , Dataset
import random
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.nn.utils.rnn import pad_sequence , pack_padded_sequence, pad_packed_sequence


nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
def preprocess_text(text):
    """
    Preprocess the text by:
    - Tokenizing sentences
    - Removing special characters and digits
    - Converting to lowercase
    """
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    for sentence in sentences:
        sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
        sentence = sentence.lower()
        words = word_tokenize(sentence)
        if words:
            cleaned_sentences.append(words)

    return cleaned_sentences

def load_data(file_path):
    """
    Loads text from a file and returns the cleaned sentences.
    """
    with open(file_path, 'r') as f:
        text = f.read()  # Read the entire text
    return preprocess_text(text)

# Load and preprocess the data
file_path = '/kaggle/input/training/Auguste_Maquet.txt'
sentences = load_data(file_path)


In [4]:
def build_vocab(sentences, glove_dictionary):
    vocab = {'<UNK>': 0 ,'<PAD>':1}
    for sent in sentences:
        for word in sent:
            if word in glove_dictionary:
                if word not in vocab:
                    vocab[word] = len(vocab)
    return vocab

In [5]:
glove_dict = {}
with open('/kaggle/input/gloves/glove.6B.300d.txt', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        word_embedding = [float(x) for x in values[1:]]
        glove_dict[word] = word_embedding

In [6]:
random.shuffle(sentences)
train_split = int(0.7 * len(sentences))
test_split = int(0.9 * len(sentences))

train_sentences = sentences[:train_split]
test_sentences = sentences[train_split:test_split]
val_sentences = sentences[test_split:]

print(f"Train sentences: {len(train_sentences)}")
print(f"Test sentences: {len(test_sentences)}")
print(f"Validation sentences: {len(val_sentences)}")


Train sentences: 24785
Test sentences: 7082
Validation sentences: 3541


In [7]:
vocab = build_vocab(train_sentences, glove_dict)

In [8]:
index_to_word = {index: word for word, index in vocab.items()}
index = 204
word = index_to_word.get(index, '<UNK>')
print(f"The word with index {index} is '{word}'")

The word with index 204 is 'guarantees'


In [9]:
embedding_matrix = [glove_dict[word] if word in glove_dict else [0]*300 for word in vocab]


In [10]:
class TextDataset(Dataset):
    def __init__(self, sentences, vocab, embedding_matrix):
        self.sentences = sentences
        self.vocab = vocab
        self.embedding_matrix = embedding_matrix

        # Filter out empty sequences and sequences with zero length
        self.data = self.create_sequences()

    def create_sequences(self):
        sequences = []
        for sent in self.sentences:
            if len(sent) > 1:  # Ensure the sentence has more than one word
                input_seq = [self.vocab.get(word, self.vocab['<UNK>']) for word in sent[:-1]]
                output_seq = [self.vocab.get(word, self.vocab['<UNK>']) for word in sent[1:]]
                if len(input_seq) > 0 and len(output_seq) > 0:  # Check if sequences are non-empty
                    sequences.append((input_seq, output_seq))
        return sequences

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq, output_seq = self.data[idx]
        input_embeds = torch.tensor(input_seq, dtype=torch.long)
        output_seq = torch.tensor(output_seq, dtype=torch.long)
        return input_embeds, output_seq

train_dataset = TextDataset(train_sentences, vocab, embedding_matrix)
val_dataset = TextDataset(val_sentences, vocab, embedding_matrix)
test_dataset = TextDataset(test_sentences, vocab, embedding_matrix)

In [11]:
print(len(train_dataset))
print(type(train_dataset[0]))
print(train_dataset[1])


24273
<class 'tuple'>
(tensor([21, 22, 23,  3, 24, 13, 25, 26, 11, 27]), tensor([22, 23,  3, 24, 13, 25, 26, 11, 27, 28]))


In [12]:
def collate_fn(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    sequences, targets = zip(*batch)
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=vocab['<PAD>'])
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=vocab['<PAD>'])
    lengths = [len(seq) for seq in sequences]
    return sequences_padded, targets_padded,torch.tensor(lengths)
BATCH_SIZE=64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [13]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTMModel(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_layers=2, dropout=0.5):
        super(LSTMModel, self).__init__()

        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=True)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            dropout=dropout, batch_first=True)

        self.fc = nn.Linear(hidden_dim, vocab_size)

        self.dropout = nn.Dropout(dropout)

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, input_seq, lengths, hidden_state=None):
        # Embed the input sequence
        embedded = self.dropout(self.embedding(input_seq))

        packed_input = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)

        if hidden_state is None:
            h_0 = torch.zeros(self.num_layers, input_seq.size(0), self.hidden_dim).to(input_seq.device)
            c_0 = torch.zeros(self.num_layers, input_seq.size(0), self.hidden_dim).to(input_seq.device)
            hidden_state = (h_0, c_0)

        packed_output, hidden_state = self.lstm(packed_input, hidden_state)
        lstm_out, _ = pad_packed_sequence(packed_output, batch_first=True)
        output = self.fc(lstm_out)
        return output, hidden_state

    def init_hidden(self, batch_size, device):
        h_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        c_0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        return h_0, c_0

In [19]:
embedding_matrix=np.array(embedding_matrix)
model = LSTMModel(embedding_matrix, hidden_dim=300, num_layers=2, dropout=0.5).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])


In [20]:
def train_model(model, train_loader, val_loader, num_epochs=10, patience=3):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
    vocab_size = len(vocab)

    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for input_seq, target_seq, lengths in train_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)

            # Initialize hidden state at the start of each batch
            hidden = model.init_hidden(input_seq.size(0), device)

            optimizer.zero_grad()

            # Forward pass
            output, hidden = model(input_seq, lengths, hidden)

            # Reshape output and target for the loss calculation
            loss = loss_fn(output.view(-1, vocab_size), target_seq.view(-1))

            # Backpropagation and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for input_seq, target_seq, lengths in val_loader:
                input_seq, target_seq = input_seq.to(device), target_seq.to(device)

                # No need to pass hidden state between batches during evaluation
                output, _ = model(input_seq, lengths)

                # Calculate validation loss
                loss = loss_fn(output.view(-1, vocab_size), target_seq.view(-1))
                val_loss += loss.item()

        # Calculate average validation loss
        val_loss /= len(val_loader)
        print(f"Epoch {epoch + 1}, Validation Loss: {val_loss:.4f}")

        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"Model saved with validation loss: {best_val_loss:.4f}")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping!")
                break

In [21]:
train_model(model, train_loader, val_loader, num_epochs=10, patience=3)

Epoch 1, Loss: 6.741740704837598
Epoch 1, Validation Loss: 6.4021
Model saved with validation loss: 6.4021
Epoch 2, Loss: 6.367736385997973
Epoch 2, Validation Loss: 6.0732
Model saved with validation loss: 6.0732
Epoch 3, Loss: 6.100526918862995
Epoch 3, Validation Loss: 5.8513
Model saved with validation loss: 5.8513
Epoch 4, Loss: 5.899937745144492
Epoch 4, Validation Loss: 5.6865
Model saved with validation loss: 5.6865
Epoch 5, Loss: 5.7370334248793755
Epoch 5, Validation Loss: 5.5412
Model saved with validation loss: 5.5412
Epoch 6, Loss: 5.596185021651419
Epoch 6, Validation Loss: 5.4194
Model saved with validation loss: 5.4194
Epoch 7, Loss: 5.475631191855983
Epoch 7, Validation Loss: 5.3262
Model saved with validation loss: 5.3262
Epoch 8, Loss: 5.376115893062792
Epoch 8, Validation Loss: 5.2557
Model saved with validation loss: 5.2557
Epoch 9, Loss: 5.288211617971721
Epoch 9, Validation Loss: 5.1999
Model saved with validation loss: 5.1999
Epoch 10, Loss: 5.210024907714442
Ep

In [22]:
model.load_state_dict(torch.load('best_model.pth'))

  model.load_state_dict(torch.load('best_model.pth'))


<All keys matched successfully>

In [30]:
def calculate_perplexity(model, data_loader, vocab, device):
    """
    Calculate the perplexity of the model on the provided data_loader.

    Args:
        model (nn.Module): The trained LSTM model.
        data_loader (DataLoader): DataLoader for the dataset (test or validation).
        vocab (dict): Vocabulary dictionary mapping words to indices.
        device (torch.device): The device to run the computations on.

    Returns:
        float: The calculated perplexity.
    """
    model.eval()
    data_loss = 0
    vocab_size=len(vocab)
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])

    with torch.no_grad():
        for input_seq, target_seq, lengths in data_loader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)

            # No need to pass hidden state between batches during evaluation
            output, _ = model(input_seq, lengths)

            # Calculate validation loss
            loss = loss_fn(output.view(-1, vocab_size), target_seq.view(-1))
            data_loss += loss.item()

    # Calculate average validation loss
    data_loss /= len(data_loader)
    perplexity=torch.exp(torch.tensor(data_loss))
    return perplexity,data_loss


In [31]:
# Assuming you have a trained model, test_loader, vocab, and device
test_perplexity,test_loss = calculate_perplexity(model, test_loader, vocab, device)
print(f"Test Perplexity: {test_perplexity:.4f}")
print(f"Test Loss: {test_loss:.4f}")

validation_perplexity,validation_loss = calculate_perplexity(model, val_loader, vocab, device)
print(f"Validation Perplexity: {validation_perplexity:.4f}")
print(f"Validation Loss: {validation_loss :.4f}")

train_perplexity,train_loss = calculate_perplexity(model, train_loader, vocab, device)
print(f"Train Perplexity: {train_perplexity:.4f}")
print(f"Train Loss: {train_loss :.4f}")

Test Perplexity: 167.2379
Test Loss: 5.1194
Validation Perplexity: 162.6948
Validation Loss: 5.0919
Train Perplexity: 113.0193
Train Loss: 4.7276


In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def calculate_perplexity_and_save(model, data_loader, vocab, device, file_name):
    """
    Calculate the perplexity of the model on the provided data_loader and save it to a file.

    Args:
        model (nn.Module): The trained LSTM model.
        data_loader (DataLoader): DataLoader for the dataset (test or validation).
        vocab (dict): Vocabulary dictionary mapping words to indices.
        device (torch.device): The device to run the computations on.
        file_name (str): The file name to save the batch perplexities.
    """
    model.eval()
    data_loss = 0
    vocab_size = len(vocab)
    loss_fn = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
    batch_losses = []

    with torch.no_grad():
        for batch_idx, (input_seq, target_seq, lengths) in enumerate(data_loader):
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)

            # No need to pass hidden state between batches during evaluation
            output, _ = model(input_seq, lengths)

            # Calculate loss for each batch
            loss = loss_fn(output.view(-1, vocab_size), target_seq.view(-1))
            batch_loss = loss.item()
            batch_losses.append(batch_loss)

            # Save cumulative loss for average calculation
            data_loss += batch_loss

            # Calculate perplexity for the batch
            batch_perplexity = torch.exp(torch.tensor(batch_loss)).item()

            # Save the perplexity score for this batch to the file
            with open(file_name, 'a') as f:
                f.write(f'Batch-{batch_idx + 1}\t{batch_perplexity}\n')

    # Calculate the average loss and perplexity
    avg_loss = data_loss / len(data_loader)
    avg_perplexity = torch.exp(torch.tensor(avg_loss)).item()

    # Append the average perplexity to the file
    with open(file_name, 'a') as f:
        f.write(f'Average\t{avg_perplexity}\n')

    return avg_perplexity, avg_loss


In [36]:
BATCH_SIZE_FILE=1
train_loader_file = DataLoader(train_dataset, batch_size=BATCH_SIZE_FILE, shuffle=True, collate_fn=collate_fn)
val_loader_file = DataLoader(val_dataset, batch_size=BATCH_SIZE_FILE, shuffle=False, collate_fn=collate_fn)
test_loader_file = DataLoader(test_dataset, batch_size=BATCH_SIZE_FILE, shuffle=False, collate_fn=collate_fn)

In [37]:
calculate_perplexity_and_save(model, train_loader_file, vocab, device, '2021101075-LM2-train-perplexity.txt')
calculate_perplexity_and_save(model, test_loader_file, vocab, device, '2021101075-LM2-test-perplexity.txt')
calculate_perplexity_and_save(model, val_loader_file, vocab, device, '2021101075-LM2-val-perplexity.txt')


(162.69479370117188, 5.091875908591531)