In [None]:
!python -m pip install -U skorch



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torchtext
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import collections
from collections import Counter
from torch.utils.data import TensorDataset,DataLoader
from skorch import NeuralNetClassifier
from skorch.callbacks import EpochScoring
import warnings
warnings.filterwarnings('ignore')



In [None]:
import zipfile

# Unpack the dataset archive. The context manager closes the archive even
# when extraction raises part-way through.
with zipfile.ZipFile('/content/penn_tree.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# Read both splits with header=None so the first line is treated as data;
# the text then lives in the single column labelled 0.
train_text_data, test_text_data = (
    pd.read_table(path, header=None)
    for path in ('/content/ptbdataset/ptb.train.txt',
                 '/content/ptbdataset/ptb.test.txt')
)

# Plain Python lists of raw lines, one entry per row of each frame.
train = train_text_data[0].tolist()
test = test_text_data[0].tolist()

train_text_data.head(), test_text_data.head()

(                                                   0
 0   aer banknote berlitz calloway centrust cluett...
 1   pierre <unk> N years old will join the board ...
 2   mr. <unk> is chairman of <unk> n.v. the dutch...
 3   rudolph <unk> N years old and former chairman...
 4   a form of asbestos once used to make kent cig...,
                                                    0
 0                        no it was n't black monday 
 1   but while the new york stock exchange did n't...
 2   some circuit breakers installed after the oct...
 3   the N stock specialist firms on the big board...
 4   big investment banks refused to step up to th...)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Turn one line of raw text into a list of lemmatised tokens.

    The text is lowercased, every character that is neither a word
    character nor whitespace is deleted, the result is split with NLTK's
    ``word_tokenize``, and each token is lemmatised with the module-level
    ``lemmatizer``. Stop-word removal and stemming are not applied.

    NOTE(review): the punctuation filter also strips the angle brackets of
    ``<unk>`` and the apostrophe of ``n't``, so those tokens come out as
    ``unk`` and ``nt``.

    Args:
        text: A single string (one line of the corpus or a prompt).

    Returns:
        list[str]: The lemmatised tokens in their original order.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return [lemmatizer.lemmatize(word) for word in word_tokenize(without_punctuation)]


# Tokenise every line of both splits.
train_tokenize = list(map(preprocess_text, train))
test_tokenize = list(map(preprocess_text, test))

In [None]:
train_tokenize[0],test_tokenize[0]

(['aer',
  'banknote',
  'berlitz',
  'calloway',
  'centrust',
  'cluett',
  'fromstein',
  'gitano',
  'guterman',
  'hydroquebec',
  'ipo',
  'kia',
  'memotec',
  'mlx',
  'nahb',
  'punt',
  'rake',
  'regatta',
  'rubens',
  'sim',
  'snackfood',
  'ssangyong',
  'swapo',
  'wachter'],
 ['no', 'it', 'wa', 'nt', 'black', 'monday'])

In [None]:
# Flatten the per-line token lists into one long list per split.
all_train_tokens = [tok for line_tokens in train_tokenize for tok in line_tokens]
all_test_tokens = [tok for line_tokens in test_tokenize for tok in line_tokens]

# NOTE(review): the vocabulary is counted over BOTH splits, so test-set words
# are known to the model's vocabulary before evaluation.
all_tokens = all_train_tokens + all_test_tokens

# Rank words by frequency: the most frequent word gets index 1, the next 2, ...
counter = Counter(all_tokens)
voc = dict(
    (word, rank)
    for rank, (word, _count) in enumerate(counter.most_common(), start=1)
)
# Index 0 is reserved for '<unk>'; pad_sequences also pads with 0, so unknown
# words and padding share the same id.
voc['<unk>'] = 0

In [None]:
# Show the vocabulary size and its first few entries only; printing the whole
# mapping floods the notebook output.
print(len(voc), dict(list(voc.items())[:10]))



In [None]:
def tokens_to_indices(tokens, vocab):
    """Map each token to its vocabulary index.

    Tokens that are missing from ``vocab`` are replaced by the index stored
    under the ``'<unk>'`` key.

    Args:
        tokens: Iterable of token strings.
        vocab: Mapping from token to integer index; must contain ``'<unk>'``
            whenever ``tokens`` is non-empty.

    Returns:
        list: One index per input token, in the same order.
    """
    indices = []
    for token in tokens:
        # Looked up per token (not hoisted) so an empty token list never
        # touches the '<unk>' entry.
        fallback = vocab['<unk>']
        indices.append(vocab.get(token, fallback))
    return indices

train_indices = [tokens_to_indices(text, voc) for text in train_tokenize]
test_indices = [tokens_to_indices(text, voc) for text in test_tokenize]

In [None]:
train_indices[0],test_indices[0]

([8636,
  8637,
  8638,
  8639,
  3756,
  8640,
  8641,
  8642,
  8643,
  8644,
  8645,
  8646,
  8647,
  8648,
  8649,
  8650,
  8651,
  8652,
  8653,
  8654,
  8655,
  8656,
  8657,
  8658],
 [102, 9, 22, 31, 537, 411])

In [None]:
def create_sequences(indices):
    """Expand every index sequence into all of its prefixes of length >= 2.

    For ``[a, b, c]`` this yields ``[a, b]`` and ``[a, b, c]``. Sequences
    with fewer than two items contribute nothing.

    Args:
        indices: List of index sequences (one per line of text).

    Returns:
        list: All prefixes, grouped by source sequence and ordered from
        shortest to longest within each group.
    """
    return [
        sentence[:end]
        for sentence in indices
        for end in range(2, len(sentence) + 1)
    ]

train_sequences = create_sequences(train_indices)
max_length_train = max(len(seq) for seq in train_sequences)
test_sequences = create_sequences(test_indices)
max_length_test = max(len(seq) for seq in test_sequences)

In [None]:
# Preview a handful of sequences only; displaying every prefix of every line
# produces an enormous output.
train_sequences[:3], test_sequences[:3]

([[8636, 8637],
  [8636, 8637, 8638],
  [8636, 8637, 8638, 8639],
  [8636, 8637, 8638, 8639, 3756],
  [8636, 8637, 8638, 8639, 3756, 8640],
  [8636, 8637, 8638, 8639, 3756, 8640, 8641],
  [8636, 8637, 8638, 8639, 3756, 8640, 8641, 8642],
  [8636, 8637, 8638, 8639, 3756, 8640, 8641, 8642, 8643],
  [8636, 8637, 8638, 8639, 3756, 8640, 8641, 8642, 8643, 8644],
  [8636, 8637, 8638, 8639, 3756, 8640, 8641, 8642, 8643, 8644, 8645],
  [8636, 8637, 8638, 8639, 3756, 8640, 8641, 8642, 8643, 8644, 8645, 8646],
  [8636,
   8637,
   8638,
   8639,
   3756,
   8640,
   8641,
   8642,
   8643,
   8644,
   8645,
   8646,
   8647],
  [8636,
   8637,
   8638,
   8639,
   3756,
   8640,
   8641,
   8642,
   8643,
   8644,
   8645,
   8646,
   8647,
   8648],
  [8636,
   8637,
   8638,
   8639,
   3756,
   8640,
   8641,
   8642,
   8643,
   8644,
   8645,
   8646,
   8647,
   8648,
   8649],
  [8636,
   8637,
   8638,
   8639,
   3756,
   8640,
   8641,
   8642,
   8643,
   8644,
   8645,
   8646,
   86

In [None]:
def pad_sequences(sequences, max_length=None, pad_value=0):
    """Left-pad index sequences to a common length and return a long tensor.

    Args:
        sequences: Iterable of lists of integer indices.
        max_length: Target row length. Defaults to the longest sequence
            (0 when ``sequences`` is empty). Sequences longer than
            ``max_length`` keep only their last ``max_length`` items.
        pad_value: Value written in front of short sequences (default 0).

    Returns:
        torch.Tensor: ``torch.long`` tensor of shape
        ``(len(sequences), max_length)``; an empty input gives a tensor with
        zero rows instead of raising.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    sequences = [list(seq) for seq in sequences]
    if max_length is None:
        # default=0 keeps an empty input from raising inside max().
        max_length = max((len(seq) for seq in sequences), default=0)
    elif max_length < 0:
        raise ValueError("max_length must be non-negative")

    padded = []
    for seq in sequences:
        if len(seq) > max_length:
            # Keep the most recent tokens; they sit next to the prediction target.
            seq = seq[len(seq) - max_length:]
        padded.append([pad_value] * (max_length - len(seq)) + seq)

    # reshape is a no-op for regular input and fixes the shape when there are no rows.
    return torch.tensor(padded, dtype=torch.long).reshape(len(padded), max_length)
train_input_padded = pad_sequences(train_sequences)
test_input_padded = pad_sequences(test_sequences)

In [None]:
# pad_sequences already returns torch.long tensors, so they only need to be
# moved to the target device. Wrapping an existing tensor in torch.tensor()
# makes a redundant copy and emits a UserWarning.
train_input_padded_tensor = train_input_padded.to(device)
test_input_padded_tensor = test_input_padded.to(device)

In [None]:
train_input_padded_tensor,test_input_padded_tensor

(tensor([[   0,    0,    0,  ...,    0, 8636, 8637],
         [   0,    0,    0,  ..., 8636, 8637, 8638],
         [   0,    0,    0,  ..., 8637, 8638, 8639],
         ...,
         [   0,    0,    0,  ..., 3551,  235,    6],
         [   0,    0,    0,  ...,  235,    6,   21],
         [   0,    0,    0,  ...,    6,   21,    2]], device='cuda:0'),
 tensor([[  0,   0,   0,  ...,   0, 102,   9],
         [  0,   0,   0,  ..., 102,   9,  22],
         [  0,   0,   0,  ...,   9,  22,  31],
         ...,
         [  0,   0,   0,  ..., 337, 137, 371],
         [  0,   0,   0,  ..., 137, 371,  20],
         [  0,   0,   0,  ..., 371,  20,  45]], device='cuda:0'))

In [None]:
# Split every padded row into context and label. Rows are left-padded, so the
# last column holds each sequence's final token: that token is the next-word
# target, and all columns before it form the model input.
input_train=train_input_padded_tensor[:,:-1]
target_train=train_input_padded_tensor[:,-1]
input_test=test_input_padded_tensor[:,:-1]
target_test=test_input_padded_tensor[:,-1]

In [None]:
# input_*/target_* are slices of torch.long tensors, so they go into
# TensorDataset as they are; re-wrapping them in torch.tensor() would copy the
# whole dataset again and emit a UserWarning.
train_data = TensorDataset(input_train, target_train)
test_data = TensorDataset(input_test, target_test)

# Shuffle only the training batches; the test order stays fixed.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

class NextWordPredictorrrr(nn.Module):
    """Embedding -> LSTM -> ReLU -> Linear next-word classifier.

    ``forward`` returns only the vocabulary scores for the last time step.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        # Zero-argument super() always refers to this class; the previous
        # explicit reference named a different class.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def init_hidden(self, batch_size):
        """Return zero ``(hidden_state, cell_state)`` tensors for the LSTM.

        Both have shape ``(num_layers, batch_size, hidden_size)`` and are
        created on the same device and dtype as the model's weights.
        """
        reference = self.fc.weight
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        hidden_state = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        cell_state = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return hidden_state, cell_state

    def forward(self, x):
        """Score the next word for a batch of index sequences.

        Args:
            x: Integer tensor indexed as ``(batch, time)`` (the LSTM is
                built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with unnormalised scores.
        """
        x = self.embedding(x)
        # Size the initial state from the actual batch so the last, smaller
        # batch of an epoch works too.
        lstm_out, _ = self.lstm(x, self.init_hidden(x.size(0)))
        lstm_out = self.relu(lstm_out)
        # Only the final time step feeds the classifier.
        out = self.fc(lstm_out[:, -1, :])

        return out

In [None]:
class NextWordPredictor(nn.Module):
    """LSTM language model that scores the word following a sequence.

    Pipeline: embedding lookup -> stacked LSTM -> ReLU -> linear layer applied
    to the last time step.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embedding_dim: Width of each embedding vector.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden_state=None, cell_state=None):
        """Run one forward pass.

        Args:
            x: Integer tensor indexed as ``(batch, time)``.
            hidden_state: Optional initial LSTM hidden state.
            cell_state: Optional initial LSTM cell state. If either state is
                missing, BOTH are replaced by zeros.

        Returns:
            tuple: ``(scores, hidden_state, cell_state)`` where ``scores`` has
            shape ``(batch, vocab_size)`` and the states are the LSTM's final
            states.
        """
        embedded = self.embedding(x)

        states_given = hidden_state is not None and cell_state is not None
        if not states_given:
            # Fresh zero states sized for this batch, on the input's device.
            state_shape = (self.lstm.num_layers, embedded.size(0), self.lstm.hidden_size)
            hidden_state = torch.zeros(state_shape, device=embedded.device)
            cell_state = torch.zeros(state_shape, device=embedded.device)

        sequence_out, (hidden_state, cell_state) = self.lstm(embedded, (hidden_state, cell_state))

        # ReLU over all steps, then keep only the final step for classification.
        last_step = self.relu(sequence_out)[:, -1, :]
        return self.fc(last_step), hidden_state, cell_state

In [None]:
def accuracy_fn(y_true, y_pred):
    """Return the percentage of rows whose top-scoring class equals the label.

    Args:
        y_true: 1-D tensor of integer class labels.
        y_pred: 2-D tensor of per-class scores; the predicted class is the
            argmax along dimension 1.

    Returns:
        float: Accuracy in the range 0-100.
    """
    predicted = torch.argmax(y_pred, dim=1)
    n_correct = torch.eq(predicted, y_true).sum().item()
    return n_correct / len(y_true) * 100

In [None]:
# Positional arguments follow NextWordPredictor.__init__:
# vocab_size=len(voc), embedding_dim=300, hidden_dim=256, num_layers=2.
model=NextWordPredictor(len(voc),300,256,2).to(device)
# The loss receives the raw scores from the model's final linear layer.
loss_fn=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)

In [None]:
def train_model(model, train_loader, loss_fn, optimizer, accuracy_fn):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module whose forward call returns ``(scores, hidden, cell)``.
        train_loader: Iterable of ``(data, targets)`` batches.
        loss_fn: Callable ``loss_fn(scores, targets)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        accuracy_fn: Callable ``accuracy_fn(targets, scores)`` returning a number.

    Returns:
        tuple[float, float]: Mean per-batch loss and mean per-batch accuracy.
        Both are ``nan`` when the loader yields no batches (previously a
        ZeroDivisionError).
    """
    model.train()

    # Send batches to wherever the model lives rather than to a notebook global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    train_loss = 0.0
    train_acc = 0.0
    num_batches = 0
    for data, targets in train_loader:
        data, targets = data.to(target_device), targets.to(target_device)
        optimizer.zero_grad()

        # The returned LSTM states are not needed for training.
        preds, _hidden, _cell = model(data)

        loss = loss_fn(preds, targets)
        train_loss += loss.item()
        train_acc += accuracy_fn(targets, preds)

        loss.backward()
        optimizer.step()
        num_batches += 1

    # Averages over batches for the whole pass.
    avg_loss = train_loss / num_batches if num_batches else float('nan')
    avg_acc = train_acc / num_batches if num_batches else float('nan')
    print(f"Training_Loss: {avg_loss:.4f}, Training_Accuracy: {avg_acc:.2f}%")
    return avg_loss, avg_acc
def test_model(model,test_dataloader,loss_fn,optimizer,accuracy_fn):
    model.eval()
    test_loss=0
    test_acc=0
    with torch.inference_mode():
        for batch_idx, (data, targets) in enumerate(test_dataloader):
            data, targets = data.to(device), targets.to(device)
            preds,hn,cn = model(data)
            loss = loss_fn(preds, targets)
            test_loss+=loss.item()
            test_acc+=accuracy_fn(targets,preds)
        avg_test_loss=test_loss/len(test_dataloader)
        avg_test_acc=test_acc/len(test_dataloader)

    print(f"\nTest_Loss: {avg_test_loss:.4f}, Test_Accuracy: {avg_test_acc:.2f}%\n")




In [None]:
from timeit import default_timer as timer
def print_train_time(start: float, end: float, device: torch.device = None):
    """Print and return the elapsed time between two timer readings.

    Args:
        start: Timer value taken before training.
        end: Timer value taken after training.
        device: Device label included in the printed message (printed as
            ``None`` when omitted).

    Returns:
        float: ``end - start`` in seconds.
    """
    total_time = end - start
    message = f"Train time on {device}: {total_time:.3f} seconds"
    print(message)
    return total_time

In [None]:
from tqdm.auto import tqdm

# Seed PyTorch's RNG and start the timer.
# NOTE: the model was created in an earlier cell, so this seed does not
# affect its initial weights.
torch.manual_seed(42)
train_time_start_on_gpu = timer()

# Number of full passes over the training data.
epochs = 50

# One training pass followed by one evaluation pass per epoch.
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n-------")

    train_model(model,train_loader,loss_fn,optimizer,accuracy_fn)
    test_model(model,test_loader,loss_fn,optimizer,accuracy_fn)

train_time_end_on_gpu = timer()
# Pass the device so the summary names it instead of printing "None".
total_train_time_model = print_train_time(start=train_time_start_on_gpu,
                                           end=train_time_end_on_gpu,
                                           device=device)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0
-------
Training_Loss: 5.7471, Training_Accuracy: 15.93%

Test_Loss: 5.3801, Test_Accuracy: 18.56%

Epoch: 1
-------
Training_Loss: 5.2291, Training_Accuracy: 20.03%

Test_Loss: 5.2600, Test_Accuracy: 19.89%

Epoch: 2
-------
Training_Loss: 5.0333, Training_Accuracy: 21.45%

Test_Loss: 5.2165, Test_Accuracy: 20.47%

Epoch: 3
-------
Training_Loss: 4.9095, Training_Accuracy: 22.33%

Test_Loss: 5.2055, Test_Accuracy: 20.81%

Epoch: 4
-------
Training_Loss: 4.8163, Training_Accuracy: 22.99%

Test_Loss: 5.2088, Test_Accuracy: 21.28%

Epoch: 5
-------
Training_Loss: 4.7392, Training_Accuracy: 23.52%

Test_Loss: 5.2295, Test_Accuracy: 21.26%

Epoch: 6
-------
Training_Loss: 4.6786, Training_Accuracy: 23.94%

Test_Loss: 5.2209, Test_Accuracy: 21.37%

Epoch: 7
-------
Training_Loss: 4.6273, Training_Accuracy: 24.26%

Test_Loss: 5.2398, Test_Accuracy: 21.46%

Epoch: 8
-------
Training_Loss: 4.5794, Training_Accuracy: 24.59%

Test_Loss: 5.2576, Test_Accuracy: 21.58%

Epoch: 9
-------
Tr

In [None]:
def check(model, text,voc, max_len=10):
    """Greedily extend ``text`` with predicted words up to ``max_len`` tokens.

    The prompt is tokenised with ``preprocess_text`` and mapped to indices
    with ``tokens_to_indices``. At each step the whole sequence so far is fed
    to the model as a single-row batch and the highest-scoring word is appended.

    Args:
        model: Trained model; its forward call may return either the score
            tensor alone or a tuple whose first item is the score tensor.
        text: Prompt string.
        voc: Mapping from word to index, as used for training.
        max_len: Total number of tokens (prompt + generated) to stop at.

    Returns:
        str: Prompt tokens followed by the generated words, space-joined.
        An empty string is returned when the prompt has no tokens.
    """
    model.eval()
    token = preprocess_text(text)
    indices = tokens_to_indices(token, voc)
    if not indices:
        # Nothing to condition on.
        return ""

    # Build the reverse lookup once instead of scanning the vocabulary at
    # every step; setdefault keeps the first word seen for an index.
    index_to_word = {}
    for word, index in voc.items():
        index_to_word.setdefault(index, word)

    # Inputs must be on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.inference_mode():
        for _ in range(max_len - len(indices)):
            # A batch of one row holding the full context generated so far.
            input_tensor = torch.tensor([indices], dtype=torch.long, device=model_device)

            output = model(input_tensor)
            # NextWordPredictor returns (scores, hidden, cell); accept a bare tensor too.
            logits = output[0] if isinstance(output, (tuple, list)) else output
            predicted_index = torch.argmax(logits, dim=1).item()

            indices.append(predicted_index)
            token.append(index_to_word.get(predicted_index, '<unk>'))

    return " ".join(token)

text = 'but while the new york'
out = check(model, text,voc)
print(out)

Epoch 1/100, Loss: 5.7546, Accuracy: 15.88%
Epoch 2/100, Loss: 5.2267, Accuracy: 20.15%
Epoch 3/100, Loss: 5.0258, Accuracy: 21.59%
Epoch 4/100, Loss: 4.8941, Accuracy: 22.51%
Epoch 5/100, Loss: 4.7968, Accuracy: 23.21%
Epoch 6/100, Loss: 4.7182, Accuracy: 23.72%
Epoch 7/100, Loss: 4.6515, Accuracy: 24.18%
Epoch 8/100, Loss: 4.5948, Accuracy: 24.58%
Epoch 9/100, Loss: 4.5461, Accuracy: 24.90%
Epoch 10/100, Loss: 4.5033, Accuracy: 25.16%


KeyboardInterrupt: 