First lets start by importing the libraries we need

In [5]:
import numpy as np#for numerical operations on matrices
import pandas as pd#for data manipulation
import torch#a framework for training nural networks
import torch.nn as nn # provides modules and functions for working with NNs
import torch.nn.functional as F # same as above
from sklearn.model_selection import train_test_split#data processing
from sklearn.metrics import classification_report#model evaluation
import transformers#a librariy from hugging face to work with pretrained transfomer models
from transformers import AutoModelForMaskedLM, BertTokenizerFast#loads pre_trained transformer models dynamically,a tokenizer optimized for bert models respctively.

# specify GPU
device = torch.device("cuda")

Next we will need to load our corpus

In [8]:
df = pd.read_csv("/content/1661-0.txt", delimiter="\t")
df.head()

Unnamed: 0,"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle"
0,This eBook is for the use of anyone anywhere a...
1,almost no restrictions whatsoever. You may co...
2,re-use it under the terms of the Project Guten...
3,with this eBook or online at www.gutenberg.net
4,Title: The Adventures of Sherlock Holmes


now that we loaded our text data we need to tokanize our data to convert our text into numerical representation making it easier for our model to handle

In [9]:
MODEL_NAME = "bert-base-uncased"  # You can try "distilbert-base-uncased" for a smaller model
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

now that we initialize our tokanizer we can tokenize our text data



In [11]:
# Convert DataFrame to a single text string since our text data is loaded into a dataframe should have read the file in as text from the start
text = " ".join(df.astype(str).values.flatten())

# Tokenize the text
tokens = tokenizer.tokenize(text)

# Split tokens into chunks of the specified size
chunk_size = 512
chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

# Convert tokens to input IDs
input_ids = [tokenizer.convert_tokens_to_ids(chunk) for chunk in chunks]

now that tokanizing is done we can start preparing our training sequence(data)

In [12]:
#preparing training data
def create_sequences(input_ids, seq_length=10):
    sequences = []
    labels = []

    for chunk in input_ids:
        for i in range(len(chunk) - seq_length):
            sequences.append(chunk[i:i + seq_length])
            labels.append(chunk[i + seq_length])  # Next word

    return torch.tensor(sequences), torch.tensor(labels)

# Create sequences for each chunk separately
X, y = create_sequences(input_ids, seq_length=10)


now we can split our data into testing and training data

In [13]:
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

now that we have split our data we need to convert them to a pytorch dataset since tensordataset allows for more optimized training

In [14]:
train_data = torch.utils.data.TensorDataset(X_train, y_train)
val_data = torch.utils.data.TensorDataset(X_val, y_val)

In [15]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=16, shuffle=False)
# creating data loaders to efficiently batch, shuffle (for training) and feeding them to the moded during training and validaiton

now we need to define our model using a pre_defined bert model (AutoModelForMaskedLM) to generate word embeddings

In [16]:
class NextWordPredictor(nn.Module):
    def __init__(self, model_name):
        super(NextWordPredictor, self).__init__()
        self.bert = AutoModelForMaskedLM.from_pretrained(model_name)#loading the pre_trained bert model for maskde language modeling
        self.fc = nn.Linear(self.bert.config.hidden_size, tokenizer.vocab_size)#FC Layer
    #Forward Prop
    def forward(self, input_ids):
        outputs = self.bert(input_ids).logits#pass input through bert model to get the hidden states
        logits = outputs[:, -1, :]  # Takes the last tokens hidden state
        return logits

now that we have defined our model we need to initialize it then define the loss and optimizer

In [17]:
#initializing the model
model = NextWordPredictor(MODEL_NAME).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

In [18]:
#defining the loss fuction
criterion = nn.CrossEntropyLoss()
#defining the optimizer to update the weights
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

we used AdamW(Adam with weight decay) which is an improved version of the adam optimiztion which helps avoid overfitting and helps with penalizing large weights

now we are defining the training phase and the process of updating the weights we will use this later to train our model

In [19]:
def train_model(model, train_loader, val_loader, epochs=3):
    model.train()#setting the model to trainig
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:#iterating through batches from the training data
            inputs, targets = batch#getting the input data and actual next word which is the targets
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()#clearing precvious gradients and reseting them to zero
            outputs = model(inputs)#pass inputs through the model for output predictions
            loss = criterion(outputs, targets)#calc loss
            loss.backward()#calc grad through back prop
            optimizer.step()#update weights using grads
            total_loss += loss.item()#add this batchs loss to total_loss

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

now train the model using the above func

In [20]:
train_model(model, train_loader, val_loader, epochs=3)

Epoch 1, Loss: 4.015423148525932
Epoch 2, Loss: 3.203996466418911
Epoch 3, Loss: 2.579227076961815


now we define the prediction fucntion to predict the next word of a given sentence

In [21]:
def predict_next_word(text, model, tokenizer, seq_length=10):
    model.eval()  # Set the model to evaluation mode
    tokens = tokenizer.tokenize(text.lower())[-seq_length:]  # Tokenize the input text
    input_ids = tokenizer.convert_tokens_to_ids(tokens)  # Convert tokens to IDs
    input_tensor = torch.tensor([input_ids]).to(device)  # Convert to tensor and move to device

    with torch.no_grad():  # Disable gradient calculation during prediction
        logits = model(input_tensor)  # Get logits for next word prediction
        predicted_id = torch.argmax(logits, dim=-1).item()  # Get the token with highest probability

    return tokenizer.decode(predicted_id)  # Decode the predicted token ID back to a word

This function predicts the next word in a sequence by passing the input text through the trained model and selecting the most likely word based on the output logits.

this is the tests for our model

In [None]:
print(predict_next_word("i was aware of", model, tokenizer))

that


In [None]:
print(predict_next_word("i want to thank your", model, tokenizer))

majesty


In [None]:
print(predict_next_word("i did not gain very", model, tokenizer))

much


In [None]:
print(predict_next_word("i would", model, tokenizer))

have


In [None]:
print(predict_next_word("You are not", model, tokenizer))

what


In [None]:
print(predict_next_word("I drank a", model, tokenizer))

little


In [None]:
print(predict_next_word("we had", model, tokenizer))

been


In [None]:
print(predict_next_word("I am", model, tokenizer))

a


In [None]:
print(predict_next_word("we do", model, tokenizer))

not


In [None]:
print(predict_next_word("I have seldom heard him say some bad things about a ", model, tokenizer))

woman


In [None]:
print(predict_next_word("I would like to breaak his ", model, tokenizer))

soul


In [None]:
print(predict_next_word("i want to fullfil my duty as a ", model, tokenizer))

king


In [None]:
print(predict_next_word("break his legs and throw him out of the ", model, tokenizer))

way


In [30]:
print(predict_next_word("I could not ", model, tokenizer))

be
