In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torchtext
from torchtext.data.utils import get_tokenizer
from sklearn.model_selection import train_test_split
import collections
from collections import Counter
from torch.utils.data import TensorDataset,DataLoader



In [2]:
import zipfile
zip_ref = zipfile.ZipFile('/content/penn_tree.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [3]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
train_text_data=pd.read_table('/content/ptbdataset/ptb.train.txt',header=None)
test_text_data=pd.read_table('/content/ptbdataset/ptb.test.txt',header=None)
train=train_text_data[0].tolist()
test=test_text_data[0].tolist()
train_text_data.head(),test_text_data.head()

(                                                   0
 0   aer banknote berlitz calloway centrust cluett...
 1   pierre <unk> N years old will join the board ...
 2   mr. <unk> is chairman of <unk> n.v. the dutch...
 3   rudolph <unk> N years old and former chairman...
 4   a form of asbestos once used to make kent cig...,
                                                    0
 0                        no it was n't black monday 
 1   but while the new york stock exchange did n't...
 2   some circuit breakers installed after the oct...
 3   the N stock specialist firms on the big board...
 4   big investment banks refused to step up to th...)

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
lemmatizer= WordNetLemmatizer()
def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize
    #tokens = [token for token in tokens if token not in stop_words]  # Remove stop words
    #tokens = [stemmer.stem(token) for token in tokens]  # Stemming
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
train_tokenize = [preprocess_text(text) for text in train]
test_tokenize = [preprocess_text(text) for text in test]

In [10]:
all_train_tokens = [token for text in train_tokenize for token in text]
all_test_tokens = [token for text in test_tokenize for token in text]

# Combine both token lists
all_tokens = all_train_tokens + all_test_tokens

# Build vocabulary
counter = Counter(all_tokens)
voc = {word: i+1 for i, (word, _) in enumerate(counter.most_common())}
voc['<unk>'] = 0

In [11]:
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

train_indices = [tokens_to_indices(text, voc) for text in train_tokenize]
test_indices = [tokens_to_indices(text, voc) for text in test_tokenize]

In [12]:
def create_sequences(indices, max_length):
    sequences = []
    for seq in indices:
        for i in range(1, len(seq)):
            sequences.append(seq[:i+1])
    return sequences

train_sequences = create_sequences(train_indices, max(len(seq) for seq in train_indices))
max_length_train = max(len(seq) for seq in train_sequences)
test_sequences = create_sequences(test_indices, max(len(seq) for seq in test_indices))
max_length_test = max(len(seq) for seq in test_sequences)

In [13]:
def pad_sequences(sequences):
    max_length = max(len(seq) for seq in sequences)
    pad= [([0] * (max_length - len(seq))) + seq for seq in sequences]
    return torch.tensor(pad,dtype=torch.long )
train_input_padded = pad_sequences(train_sequences)
test_input_padded = pad_sequences(test_sequences)

In [14]:
train_input_padded_tensor = torch.tensor(train_input_padded, dtype=torch.long).to(device)
test_input_padded_tensor = torch.tensor(test_input_padded, dtype=torch.long).to(device)

  train_input_padded_tensor = torch.tensor(train_input_padded, dtype=torch.long).to(device)
  test_input_padded_tensor = torch.tensor(test_input_padded, dtype=torch.long).to(device)


In [15]:
input_train=train_input_padded_tensor[:,:-1]
target_train=train_input_padded_tensor[:,-1]
input_test=test_input_padded_tensor[:,:-1]
target_test=test_input_padded_tensor[:,-1]

In [16]:


train_data = TensorDataset(torch.tensor(input_train, dtype=torch.long), torch.tensor(target_train, dtype=torch.long))
test_data = TensorDataset(torch.tensor(input_test, dtype=torch.long), torch.tensor(target_test, dtype=torch.long))
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)



  train_data = TensorDataset(torch.tensor(input_train, dtype=torch.long), torch.tensor(target_train, dtype=torch.long))
  test_data = TensorDataset(torch.tensor(input_test, dtype=torch.long), torch.tensor(target_test, dtype=torch.long))


In [33]:
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.5):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.relu=nn.ReLU()
        self.fc = nn.Linear(hidden_dim, vocab_size)



    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        lstm_out = self.relu(lstm_out)
        out = self.fc(lstm_out[:, -1, :])

        return out

In [34]:
def accuracy_fn(y_true, y_pred):
    y_pred_labels = y_pred.argmax(dim=1)
    correct = (y_pred_labels == y_true).sum().item()
    accuracy = correct / len(y_true) * 100
    return accuracy

In [35]:
model=NextWordPredictor(len(voc),100,128,2).to(device)
loss_fn=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.01,weight_decay=1e-5)

In [36]:
def train_model(model, train_loader, loss_fn, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        for batch_idx, (data, targets) in enumerate(train_loader):
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()

            preds = model(data)

            loss = loss_fn(preds, targets)
            train_loss += loss.item()
            train_acc += accuracy_fn(targets, preds)

            loss.backward()
            optimizer.step()

        # Print average loss and accuracy after each epoch
        avg_loss = train_loss / len(train_loader)
        avg_acc = train_acc / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.2f}%")




In [37]:
train_model(model,train_loader,loss_fn,optimizer,100)

Epoch 1/100, Loss: 6.0074, Accuracy: 12.55%
Epoch 2/100, Loss: 5.8100, Accuracy: 14.29%
Epoch 3/100, Loss: 5.7534, Accuracy: 14.80%
Epoch 4/100, Loss: 5.7281, Accuracy: 15.00%
Epoch 5/100, Loss: 5.7114, Accuracy: 15.20%
Epoch 6/100, Loss: 5.7031, Accuracy: 15.18%
Epoch 7/100, Loss: 5.7008, Accuracy: 15.25%
Epoch 8/100, Loss: 5.6959, Accuracy: 15.31%
Epoch 9/100, Loss: 5.6907, Accuracy: 15.33%
Epoch 10/100, Loss: 5.6854, Accuracy: 15.44%
Epoch 11/100, Loss: 5.6831, Accuracy: 15.47%
Epoch 12/100, Loss: 5.6829, Accuracy: 15.45%
Epoch 13/100, Loss: 5.6859, Accuracy: 15.36%
Epoch 14/100, Loss: 5.6905, Accuracy: 15.42%
Epoch 15/100, Loss: 5.6902, Accuracy: 15.43%
Epoch 16/100, Loss: 5.6868, Accuracy: 15.49%
Epoch 17/100, Loss: 5.6870, Accuracy: 15.46%
Epoch 18/100, Loss: 5.6878, Accuracy: 15.46%
Epoch 19/100, Loss: 5.6880, Accuracy: 15.43%


KeyboardInterrupt: 