# Data Preprocessing

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# %cd /content/drive/MyDrive/MSC_Intro_to_NLP_Group_Project/

/content/drive/MyDrive/MSC_Intro_to_NLP_Group_Project


In [3]:
!pip install nltk
!pip install gensim
!pip install 'transformers[torch]'
!pip install datasets




In [4]:
import json
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

from datasets import load_dataset

In [5]:
# Load the preprocessed data from the JSON file
data_files={
    "train":"data_train_rule_based_preprocess.json",
}

dataset = load_dataset("json", data_files=data_files)

In [6]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['transformation', 'topic', 'id'],
        num_rows: 104562
    })
})


In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Tokenize the informal sentences
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    inputs = [ex[input_field] for ex in examples["transformation"]]
    targets = [ex[target_field] for ex in examples["transformation"]]

    new_examples = tokenizer(
        inputs, text_target=targets, max_length=64, truncation=True, padding="max_length"
    )

    return new_examples

In [9]:
def create_multi_ref_dataset(dataset):
  for i, target_field in enumerate(['formal.ref0', 'formal.ref1', 'formal.ref2', 'formal.ref3']):
    new_dataset = preprocess_function(dataset, 'informal', target_field)
    dataset = dataset.add_column(f'labels_{i}', new_dataset['labels'])
    if i == 0:
      dataset = dataset.add_column('input_ids', new_dataset['input_ids'])
      dataset = dataset.add_column('token_type_ids', new_dataset['token_type_ids'])
      dataset = dataset.add_column('attention_mask', new_dataset['attention_mask'])

  return dataset

In [10]:
train_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
)

In [11]:
tokenizer.convert_ids_to_tokens(train_dataset['input_ids'][0])

['[CLS]',
 'Sure',
 ',',
 'it',
 "'",
 's',
 'ok',
 ',',
 'but',
 'I',
 'always',
 'have',
 'let',
 'the',
 'guy',
 'ask',
 'me',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

# RNN Model

# RNN Try 1

In [12]:
import torch
from torch import nn

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.rnn(x)
        output = self.fc(output)
        return output

RNN try 2

In [13]:
# # splitting the train dataset to use only 10% of it
# train_dataset = train_dataset.train_test_split(test_size=0.9, shuffle=True, seed=42)
# train_dataset = train_dataset['train']

In [14]:
# print(train_dataset)

In [15]:
# import torch
# import torch.nn as nn
# import torch.optim as optim

# class Seq2SeqRNN(nn.Module):
#     def __init__(self, input_size, embedding_size, hidden_size, output_size):
#         super(Seq2SeqRNN, self).__init__()
#         self.embedding = nn.Embedding(input_size, embedding_size)
#         self.rnn = nn.RNN(embedding_size, hidden_size)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, input_seq, hidden=None):
#         embedded = self.embedding(input_seq)
#         output, hidden = self.rnn(embedded, hidden)
#         output = self.fc(output)
#         return output, hidden

# # Define hyperparameters
# input_size = len(tokenizer.get_vocab())
# embedding_size = 256
# hidden_size = 512
# output_size = len(tokenizer.get_vocab())

# # Instantiate the model
# model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)

# # Define loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop
# num_epochs = 1
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# for epoch in range(num_epochs):
#     for batch in train_dataset:
#         inputs = torch.tensor(batch["input_ids"]).to(device)
#         targets = torch.tensor(batch["labels"]).to(device)

#         optimizer.zero_grad()

#         output, _ = model(inputs)

#         # Reshape the output to be 2D (batch_size * sequence_length, vocab_size)
#         output = output.view(-1, output_size)

#         loss = criterion(output, targets.view(-1))
#         loss.backward()
#         optimizer.step()

#         # Print the loss for monitoring training progress
#         print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# # Save the trained model
# torch.save(model.state_dict(), "seq2seq_rnn_model.pth")


In [16]:
# # Load the model
# model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)
# model.load_state_dict(torch.load("seq2seq_rnn_model.pth"))
# model.eval()  # Set the model to evaluation mode

# # Define the input sequence
# input_seq = torch.tensor(tokenizer.encode("Yo! Sidd, what's up!!!")).unsqueeze(0)

# # Predict the output
# output, _ = model(input_seq)
# output = torch.argmax(output, dim=2)  # Get the most probable next token

# # Decode the output
# decoded_output = tokenizer.decode(output[0])

# print(decoded_output)

## RNN Try 2

In [17]:
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output)
        return output

In [18]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# choosing GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading preprocessed data in train_dataset
input_ids = torch.tensor(train_dataset['input_ids']).to(device)
labels_0 = torch.tensor(train_dataset['labels']).to(device)

# Define your model
input_size = len(tokenizer.get_vocab())
print("||" * 10)
print(input_size)
print("||" * 10)
hidden_size = 512
output_size = len(tokenizer.get_vocab())
# output_size = 512

model = RNNModel(input_size, hidden_size, output_size)
model.to(device)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 5
batch_size = 32

for epoch in range(num_epochs):
    for i in range(0, len(input_ids), batch_size):
        inputs = input_ids[i:i+batch_size]
        targets = labels_0[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, output_size), targets.view(-1))
        loss.backward()
        optimizer.step()

        if (i // batch_size) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i // batch_size}/{len(input_ids) // batch_size}], Loss: {loss.item()}')

# Save the trained model
torch.save(model.state_dict(), "seq2seq_rnn_model.pth")
print('Training finished!')


||||||||||||||||||||
28996
||||||||||||||||||||
Epoch [1/5], Step [0/3267], Loss: 10.308734893798828
Epoch [1/5], Step [10/3267], Loss: 2.4906623363494873
Epoch [1/5], Step [20/3267], Loss: 1.713649034500122
Epoch [1/5], Step [30/3267], Loss: 1.515468716621399
Epoch [1/5], Step [40/3267], Loss: 1.5696322917938232
Epoch [1/5], Step [50/3267], Loss: 1.4720643758773804
Epoch [1/5], Step [60/3267], Loss: 1.6231441497802734
Epoch [1/5], Step [70/3267], Loss: 1.5128427743911743
Epoch [1/5], Step [80/3267], Loss: 1.5815153121948242
Epoch [1/5], Step [90/3267], Loss: 1.6748604774475098
Epoch [1/5], Step [100/3267], Loss: 1.724317193031311
Epoch [1/5], Step [110/3267], Loss: 1.4342445135116577
Epoch [1/5], Step [120/3267], Loss: 1.440632700920105
Epoch [1/5], Step [130/3267], Loss: 1.6602866649627686
Epoch [1/5], Step [140/3267], Loss: 1.3173856735229492
Epoch [1/5], Step [150/3267], Loss: 1.5122061967849731
Epoch [1/5], Step [160/3267], Loss: 1.6246155500411987
Epoch [1/5], Step [170/3267], Lo

In [19]:
# Load the model
loaded_model = RNNModel(input_size, hidden_size, output_size)
loaded_model.load_state_dict(torch.load('seq2seq_rnn_model.pth'))
loaded_model.eval()  # Set the model to evaluation mode


RNNModel(
  (embedding): Embedding(28996, 512)
  (lstm): LSTM(512, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=28996, bias=True)
)

In [20]:
def preprocess_unseen_data(unseen_data, tokenizer):
    inputs = tokenizer(unseen_data, return_tensors='pt', max_length=64, truncation=True, padding="max_length")
    return inputs

def predict(model, input_ids):
    with torch.no_grad():
        outputs = model(input_ids)
    return outputs

def decode_predictions(outputs, tokenizer):
    predicted_ids = torch.argmax(outputs, dim=-1)
    predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    return predicted_text


In [21]:
unseen_data = ["Here's the data that needs to be checked!!!"]

# Preprocess unseen data
unseen_inputs = preprocess_unseen_data(unseen_data, tokenizer)

# Make predictions
predictions = predict(loaded_model, unseen_inputs['input_ids'])

# Decode predictions
decoded_predictions = decode_predictions(predictions, tokenizer)

# Print the results
for input_text, output_text in zip(unseen_data, decoded_predictions):
    print(f'Input: {input_text}')
    print(f'Predicted Output: {output_text}\n')


Input: Here's the data that needs to be checked!!!
Predicted Output: Here is s the the that to to be be



## RNN Try 3 with encoder decoder arch

In [22]:
# class Encoder(nn.Module):
#     def __init__(self, input_size, embedding_size, hidden_size):
#         super(Encoder, self).__init__()
#         self.embedding = nn.Embedding(input_size, embedding_size)
#         self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)

#     def forward(self, input_seq):
#         embedded = self.embedding(input_seq)
#         outputs, hidden = self.rnn(embedded)
#         return hidden

# class Decoder(nn.Module):
#     def __init__(self, output_size, embedding_size, hidden_size):
#         super(Decoder, self).__init__()
#         self.embedding = nn.Embedding(output_size, embedding_size)
#         self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, input_token, hidden):
#         embedded = self.embedding(input_token.unsqueeze(1))
#         output, hidden = self.rnn(embedded, hidden)
#         output = self.fc(output.squeeze(1))
#         return output, hidden

# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder):
#         super(Seq2Seq, self).__init__()
#         self.encoder = encoder
#         self.decoder = decoder

#     def forward(self, src_seq, trg_seq):
#         batch_size = src_seq.shape[0]
#         trg_len = trg_seq.shape[1]
#         trg_vocab_size = self.decoder.fc.out_features

#         outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(src_seq.device)

#         hidden = self.encoder(src_seq)

#         input_token = trg_seq[:, 0]

#         for t in range(1, trg_len):
#             output, hidden = self.decoder(input_token, hidden)
#             outputs[:, t] = output
#             input_token = output.argmax(1)

#         return outputs

In [23]:
# # Loading preprocessed data in train_dataset
# input_ids = torch.tensor(train_dataset['input_ids']).to(device)
# labels_0 = torch.tensor(train_dataset['labels']).to(device)

# # Define your model
# input_size = len(tokenizer.get_vocab())
# hidden_size = 512
# output_size = len(tokenizer.get_vocab())


# encoder = Encoder(input_size, embedding_size, hidden_size)
# decoder = Decoder(output_size, embedding_size, hidden_size)

# model = Seq2Seq(encoder, decoder)

# # Define your loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Train the model
# num_epochs = 5
# batch_size = 32
# model.to(device)

# for epoch in range(num_epochs):
#     for i in range(0, len(input_ids), batch_size):
#         inputs = input_ids[i:i+batch_size]
#         targets = labels_0[i:i+batch_size]

#         optimizer.zero_grad()
#         outputs = model(inputs, targets)
#         output = output.view(-1, output.shape[-1])
#         targets = targets.view(-1)

#         loss = criterion(outputs.view(-1, output_size), targets.view(-1))
#         loss.backward()
#         optimizer.step()

#         if (i // batch_size) % 10 == 0:
#             print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i // batch_size}/{len(input_ids) // batch_size}], Loss: {loss.item()}')

# # Save the trained model
# torch.save(model.state_dict(), "seq2seq_rnn_model.pth")
# print('Training finished!')