In [11]:
from collections import Counter
import torch
import torch.nn as nn
from torch.optim import Adam
import random
import numpy as np
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
!nvidia-smi

In [12]:
#Hyperparameters
embed_size = 100#size for each word embedding
hidden_size=256
num_layers=1
num_epochs=2
batch_size=1000
dropout_rate=0.5
sequence_length=1#no of words in a sequence
word_frequency=15
learning_rate=0.01

seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)


In [13]:
#Data Preprocessing

#read the file
#convert the words in integers
def read_file(file):
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

# Function to build vocabulary
def build_vocab(text, frequency=20):
    words = text.split()
    word_count = Counter(words)

    word_dict = {'<unk>': 0}
    current_index = 1

    for word, freq in word_count.items():
        if freq > frequency:
            word_dict[word] = current_index
            current_index += 1
    return word_dict


def tokenize(text, word_dict):
    words = text.split()
    int_text = [word_dict.get(word, word_dict['<unk>']) for word in words]
    return int_text


def create_sequences_and_labels(int_text, sequence_length):
    sequences = []
    labels = []

    for i in range(0, len(int_text) - sequence_length-1):
        # Select a sequence of integers as the input
        sequence = int_text[i:i + sequence_length]
        sequences.append(sequence)
        
        # Select the next integer (word) as the label
        label_sequence = int_text[i+1:i+sequence_length+1]
        labels.append(label_sequence)

    return sequences, labels


In [14]:
#Data Preprocessing

#train data

train_text=read_file('wiki2.train.txt')
train_vocab = build_vocab(train_text)
vocab_size=len(train_vocab)+1
train_int_text = tokenize(train_text, train_vocab)
input_sequences,labels=create_sequences_and_labels(train_int_text,sequence_length)

#validation data
valid_text=read_file('wiki2.valid.txt')
valid_int_text = tokenize(valid_text, train_vocab) # Use the same word_dict as the training data
input_sequences_val,labels_val=create_sequences_and_labels(valid_int_text,sequence_length)



#validation data
test_text=read_file('wiki2.test.txt')
test_int_text = tokenize(test_text, train_vocab)
input_sequences_test,labels_test=create_sequences_and_labels(test_int_text,sequence_length)













In [15]:
from torch.utils.data import DataLoader, TensorDataset

# Convert sequences and labels into PyTorch tensors
train_data_tensor = torch.tensor(input_sequences)
train_labels_tensor = torch.tensor(labels)
print(train_data_tensor.size())
print(train_labels_tensor.size())

# Create a dataset and data loader
dataset_train = TensorDataset(train_data_tensor, train_labels_tensor)
data_loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)


#validation data
val_data_tensor = torch.tensor(input_sequences_val)
val_labels_tensor = torch.tensor(labels_val)

dataset_valid= TensorDataset(val_data_tensor, val_labels_tensor)
data_loader_valid = DataLoader(dataset_valid, batch_size=batch_size, shuffle=False)

# Create a dataset and data loader for test
test_data_tensor = torch.tensor(input_sequences_test)
test_labels_tensor = torch.tensor(labels_test)

datsset_test= TensorDataset(test_data_tensor, test_labels_tensor)
data_loader_test = DataLoader(datsset_test, batch_size=batch_size, shuffle=False)


torch.Size([2051908, 1])
torch.Size([2051908, 1])


In [16]:
#RNN Architecture

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout_rate):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.RNN=nn.RNN(embed_size,hidden_size,batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, vocab_size)
        
    def forward(self,x,prev_state):
        embed = self.embedding(x)
        out,state=self.RNN(embed,prev_state)
        out = self.dropout(out)
        out = self.fc(out)
        return out,state
    
    def init_state(self, batch_size):
        return torch.zeros(1,batch_size, self.hidden_size).to(device) #no of layers=1
    
        

In [17]:
#Instantiating the model

model = RNNModel(vocab_size, embed_size, hidden_size,1, dropout_rate).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(),lr=learning_rate)


In [18]:
def train(model, train_loader, epochs, batch_size):
        model.train()
        train_perplexities = []
        val_perplexities = []
    
        for epoch in range(epochs):
            total_loss = 0
            total_nll = 0  # Total negative log likelihood for the training set
            total_words = 0  # Total number of words (excluding padding)
    
            for batch, (x, y) in enumerate(train_loader):
                x, y = x.to(device), y.to(device)  # Move data to the device
                batch_size = x.size(0)
                state_h = model.init_state(batch_size).to(device) 
                optimizer.zero_grad()
    
                y_pred, state_h = model(x, state_h)
                loss = criterion(y_pred.transpose(1, 2), y)
                total_loss += loss.item()* batch_size
    
                loss.backward()
                optimizer.step()
                state_h = state_h.detach()
    
            avg_train_loss = total_loss / len(train_loader.dataset)
            train_perplexity = math.exp(avg_train_loss)
            train_perplexities.append(train_perplexity)
    
            print(f'Epoch {epoch} Train Perplexity: {train_perplexity}')
            # Validation loop
            model.eval()
            with torch.no_grad():
                total_val_loss = 0
    
            for batch, (x, y) in enumerate(data_loader_valid):
                x, y = x.to(device), y.to(device) 
                batch_size = x.size(0)
                state_h = model.init_state(batch_size)
    
                y_pred, state_h = model(x, state_h)
                loss = criterion(y_pred.transpose(1, 2), y)
                total_val_loss += loss.item() * batch_size  # Multiply by batch_size to get total loss
    
            # Calculate average validation loss and perplexity
            avg_val_loss = total_val_loss / len(data_loader_valid.dataset)
            val_perplexity = math.exp(avg_val_loss)
            val_perplexities.append(val_perplexity)
    
            print(f"Epoch {epoch} - "
              f"Train Loss: {avg_train_loss:.3f}, "
              f"Train Perplexity: {train_perplexity:.3f}, "
              f"Val Loss: {avg_val_loss:.3f}, "
              f"Val Perplexity: {val_perplexity:.3f}")
        
        return train_perplexities, val_perplexities
    


In [19]:
#Train the model
import time

#Train the model
import time


start_time = time.time()
train_perplexities, val_perplexities =train(model,data_loader_train, num_epochs, batch_size) 
end_time = time.time() # Define data_loader and batch_size appropriately
total_time = end_time - start_time
print(f"Training completed in {total_time:.2f} seconds")

torch.save(model.state_dict(), 'model_weights.pth')







Epoch 0 Train Perplexity: 289.638907397432
Epoch 0 - Train Loss: 5.669, Train Perplexity: 289.639, Val Loss: 5.192, Val Perplexity: 179.834
Epoch 1 Train Perplexity: 171.90149898499723
Epoch 1 - Train Loss: 5.147, Train Perplexity: 171.901, Val Loss: 5.134, Val Perplexity: 169.733
Training completed in 299.54 seconds


In [None]:
import matplotlib.pyplot as plt

# Plotting
epochs = range(1, num_epochs + 1)
plt.plot(epochs, train_perplexities, 'bo-', label='Training Perplexity')
plt.plot(epochs, val_perplexities, 'ro-', label='Validation Perplexity')
plt.title('Training and Validation Perplexity')
plt.xlabel('Epochs')
plt.ylabel('Perplexity')
plt.legend()
plt.show()