# Data Preprocessing

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
%cd /content/drive/MyDrive/MSC_Intro_to_NLP_Group_Project/

/content/drive/MyDrive/MSC_Intro_to_NLP_Group_Project


In [33]:
!pip install nltk
!pip install gensim
!pip install 'transformers[torch]'
!pip install datasets




In [34]:
import json
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

from datasets import load_dataset

In [35]:
# Load the preprocessed data from the JSON file
data_files={
    "train":"data_train_rule_based_preprocess.json",
}

dataset = load_dataset("json", data_files=data_files)

In [36]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['transformation', 'id', 'topic'],
        num_rows: 104562
    })
})


In [37]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [38]:
# Tokenize the informal sentences
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    inputs = [ex[input_field] for ex in examples["transformation"]]
    targets = [ex[target_field] for ex in examples["transformation"]]

    new_examples = tokenizer(
        inputs, text_target=targets, max_length=64, truncation=True, padding="max_length"
    )

    return new_examples

In [39]:
def create_multi_ref_dataset(dataset):
  for i, target_field in enumerate(['formal.ref0', 'formal.ref1', 'formal.ref2', 'formal.ref3']):
    new_dataset = preprocess_function(dataset, 'informal', target_field)
    dataset = dataset.add_column(f'labels_{i}', new_dataset['labels'])
    if i == 0:
      dataset = dataset.add_column('input_ids', new_dataset['input_ids'])
      dataset = dataset.add_column('token_type_ids', new_dataset['token_type_ids'])
      dataset = dataset.add_column('attention_mask', new_dataset['attention_mask'])

  return dataset

In [40]:
train_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
)

In [41]:
tokenizer.convert_ids_to_tokens(train_dataset['input_ids'][0])

['[CLS]',
 'Sure',
 ',',
 'it',
 "'",
 's',
 'ok',
 ',',
 'but',
 'I',
 'always',
 'have',
 'let',
 'the',
 'guy',
 'ask',
 'me',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [42]:
# # splitting the train dataset to use only 10% of it
# train_dataset = train_dataset.train_test_split(test_size=0.9, shuffle=True, seed=42)
# train_dataset = train_dataset['train']

# RNN Model

In [43]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset



# Using the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
gpu_name = torch.cuda.get_device_name(device)
print(gpu_name)


cuda
Tesla T4


# RNN Try 1

In [44]:
# RNN first try

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.rnn(x)
        output = self.fc(output)
        return output

In [45]:
# class Seq2SeqRNN(nn.Module):
#     def __init__(self, input_size, embedding_size, hidden_size, output_size):
#         super(Seq2SeqRNN, self).__init__()
#         self.embedding = nn.Embedding(input_size, embedding_size)
#         self.rnn = nn.RNN(embedding_size, hidden_size)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, input_seq, hidden=None):
#         embedded = self.embedding(input_seq)
#         output, hidden = self.rnn(embedded, hidden)
#         output = self.fc(output)
#         return output, hidden

# # Define hyperparameters
# input_size = len(tokenizer.get_vocab())
# embedding_size = 256
# hidden_size = 512
# output_size = len(tokenizer.get_vocab())

# # Instantiate the model
# model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)

# # Define loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop
# num_epochs = 1
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# for epoch in range(num_epochs):
#     for batch in train_dataset:
#         inputs = torch.tensor(batch["input_ids"]).to(device)
#         targets = torch.tensor(batch["labels"]).to(device)

#         optimizer.zero_grad()

#         output, _ = model(inputs)

#         # Reshape the output to be 2D (batch_size * sequence_length, vocab_size)
#         output = output.view(-1, output_size)

#         loss = criterion(output, targets.view(-1))
#         loss.backward()
#         optimizer.step()

#         # Print the loss for monitoring training progress
#         print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# # Save the trained model
# torch.save(model.state_dict(), "seq2seq_rnn_model.pth")


In [46]:
# # Load the model
# model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)
# model.load_state_dict(torch.load("seq2seq_rnn_model.pth"))
# model.eval()  # Set the model to evaluation mode

# # Define the input sequence
# input_seq = torch.tensor(tokenizer.encode("Yo! Sidd, what's up!!!")).unsqueeze(0)

# # Predict the output
# output, _ = model(input_seq)
# output = torch.argmax(output, dim=2)  # Get the most probable next token

# # Decode the output
# decoded_output = tokenizer.decode(output[0])

# print(decoded_output)

## RNN Try 2

In [47]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output)
        return output

In [48]:
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset

# # choosing GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Loading preprocessed data in train_dataset
# input_ids = torch.tensor(train_dataset['input_ids']).to(device)
# labels_0 = torch.tensor(train_dataset['labels']).to(device)

# # Define your model
# input_size = len(tokenizer.get_vocab())
# print("||" * 10)
# print(input_size)
# print("||" * 10)
# hidden_size = 512
# output_size = len(tokenizer.get_vocab())
# # output_size = 512

# model = RNNModel(input_size, hidden_size, output_size)
# model.to(device)

# # Define your loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Train the model
# num_epochs = 5
# batch_size = 32

# for epoch in range(num_epochs):
#     for i in range(0, len(input_ids), batch_size):
#         inputs = input_ids[i:i+batch_size]
#         targets = labels_0[i:i+batch_size]

#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs.view(-1, output_size), targets.view(-1))
#         loss.backward()
#         optimizer.step()

#         if (i // batch_size) % 10 == 0:
#             print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i // batch_size}/{len(input_ids) // batch_size}], Loss: {loss.item()}')

# # Save the trained model
# torch.save(model.state_dict(), "seq2seq_rnn_model.pth")
# print('Training finished!')


In [49]:
# # Load the model
# loaded_model = RNNModel(input_size, hidden_size, output_size)
# loaded_model.load_state_dict(torch.load('seq2seq_rnn_model.pth'))
# loaded_model.eval()  # Set the model to evaluation mode


In [50]:
# def preprocess_unseen_data(unseen_data, tokenizer):
#     inputs = tokenizer(unseen_data, return_tensors='pt', max_length=64, truncation=True, padding="max_length")
#     return inputs

# def predict(model, input_ids):
#     with torch.no_grad():
#         outputs = model(input_ids)
#     return outputs

# def decode_predictions(outputs, tokenizer):
#     predicted_ids = torch.argmax(outputs, dim=-1)
#     predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
#     return predicted_text


In [51]:
# unseen_data = ["Here's the data that needs to be checked!!!"]

# # Preprocess unseen data
# unseen_inputs = preprocess_unseen_data(unseen_data, tokenizer)

# # Make predictions
# predictions = predict(loaded_model, unseen_inputs['input_ids'])

# # Decode predictions
# decoded_predictions = decode_predictions(predictions, tokenizer)

# # Print the results
# for input_text, output_text in zip(unseen_data, decoded_predictions):
#     print(f'Input: {input_text}')
#     print(f'Predicted Output: {output_text}\n')


## RNN Try 3 with encoder decoder arch

In [52]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)

    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        outputs, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_token, hidden):
        embedded = self.embedding(input_token.unsqueeze(1))
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output.squeeze(1))
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src_seq, trg_seq):
        batch_size = src_seq.shape[0]
        trg_len = trg_seq.shape[1]
        trg_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(src_seq.device)

        hidden = self.encoder(src_seq)

        input_token = trg_seq[:, 0]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input_token, hidden)
            outputs[:, t] = output
            input_token = output.argmax(1)

        return outputs

In [53]:
# Loading preprocessed data in train_dataset
input_ids = torch.tensor(train_dataset['input_ids']).to(device)
labels_0 = torch.tensor(train_dataset['labels']).to(device)

# Define your model
input_size = len(tokenizer.get_vocab())
hidden_size = 512
embedding_size = 512
output_size = len(tokenizer.get_vocab())


encoder = Encoder(input_size, embedding_size, hidden_size)
decoder = Decoder(output_size, embedding_size, hidden_size)

model = Seq2Seq(encoder, decoder)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 1
batch_size = 32
model.to(device)

for epoch in range(num_epochs):
    for i in range(0, len(input_ids), batch_size):
        inputs = input_ids[i:i+batch_size]
        targets = labels_0[i:i+batch_size]

        optimizer.zero_grad()
        outputs = model(inputs, targets)
        output = outputs.view(-1, outputs.shape[-1])
        targets = targets.view(-1)

        loss = criterion(outputs.view(-1, output_size), targets.view(-1))
        loss.backward()
        optimizer.step()

        if (i // batch_size) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i // batch_size}/{len(input_ids) // batch_size}], Loss: {loss.item()}')

# Save the trained model
torch.save(model.state_dict(), "seq2seq_rnn_model.pth")
print('Training finished!')

Epoch [1/1], Step [0/3267], Loss: 10.26727294921875
Epoch [1/1], Step [10/3267], Loss: 2.4065234661102295
Epoch [1/1], Step [20/3267], Loss: 1.5702769756317139
Epoch [1/1], Step [30/3267], Loss: 1.6176707744598389
Epoch [1/1], Step [40/3267], Loss: 1.6567312479019165
Epoch [1/1], Step [50/3267], Loss: 1.6792460680007935
Epoch [1/1], Step [60/3267], Loss: 1.8667702674865723
Epoch [1/1], Step [70/3267], Loss: 1.692054271697998
Epoch [1/1], Step [80/3267], Loss: 1.7448334693908691
Epoch [1/1], Step [90/3267], Loss: 1.9046194553375244
Epoch [1/1], Step [100/3267], Loss: 1.9233150482177734
Epoch [1/1], Step [110/3267], Loss: 1.6469289064407349
Epoch [1/1], Step [120/3267], Loss: 1.6849910020828247
Epoch [1/1], Step [130/3267], Loss: 1.901947259902954
Epoch [1/1], Step [140/3267], Loss: 1.5244415998458862
Epoch [1/1], Step [150/3267], Loss: 1.723103642463684
Epoch [1/1], Step [160/3267], Loss: 1.85185706615448
Epoch [1/1], Step [170/3267], Loss: 1.59792959690094
Epoch [1/1], Step [180/3267],

In [61]:
# Load the pre-trained model
loaded_model = Seq2Seq(Encoder(input_size, embedding_size, hidden_size),
                       Decoder(output_size, embedding_size, hidden_size))
loaded_model.load_state_dict(torch.load("seq2seq_rnn_model.pth"))
loaded_model.eval()
loaded_model.to(device)

# Assuming you have an unseen data preprocessing function
def preprocess_unseen_data(unseen_data, tokenizer):
    inputs = tokenizer(unseen_data, return_tensors='pt', max_length=64, truncation=True, padding="max_length")
    return inputs

# Placeholder for unseen data
unseen_data = ["Here's the unseen data that needs to be predicted!!!"]

# Preprocess unseen data
unseen_inputs = preprocess_unseen_data(unseen_data, tokenizer)
src_seq = unseen_inputs['input_ids'].to(device)

# Placeholder for target sequence (trg_seq) during inference
max_target_length = input_size

trg_seq = torch.zeros((src_seq.shape[0], max_target_length), dtype=torch.long).to(device)

# Make predictions
with torch.no_grad():
    predictions = loaded_model(src_seq, trg_seq)

# Decode predictions
def decode_predictions(outputs, tokenizer):
    predicted_ids = torch.argmax(outputs, dim=-1)
    predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    return predicted_text

decoded_predictions = decode_predictions(predictions, tokenizer)  # Implement or use your decoding function

# Print the results
for input_text, output_text in zip(unseen_data, decoded_predictions):
    print(f'Input: {input_text}')
    print(f'Predicted Output: {output_text}\n')


Input: Here's the unseen data that needs to be predicted!!!
Predicted Output: is be be be the be.

