# Data Preprocessing

In [10]:
!pip install nltk
!pip install gensim
!pip install 'transformers[torch]'
!pip install datasets


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable




In [11]:
import json
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

from datasets import load_dataset

In [12]:
# Load the preprocessed data from the JSON file
data_files={
    "train":"data_train_rule_based_preprocess.json",
}

dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'transformation'],
        num_rows: 104562
    })
})


In [14]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [15]:
# Tokenize the informal sentences
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    inputs = [ex[input_field] for ex in examples["transformation"]]
    targets = [ex[target_field] for ex in examples["transformation"]]

    new_examples = tokenizer(
        inputs, text_target=targets, max_length=64, truncation=True, padding="max_length"
    )

    return new_examples

In [16]:
def create_multi_ref_dataset(dataset):
  for i, target_field in enumerate(['formal.ref0', 'formal.ref1', 'formal.ref2', 'formal.ref3']):
    new_dataset = preprocess_function(dataset, 'informal', target_field)
    dataset = dataset.add_column(f'labels_{i}', new_dataset['labels'])
    if i == 0:
      dataset = dataset.add_column('input_ids', new_dataset['input_ids'])
      dataset = dataset.add_column('token_type_ids', new_dataset['token_type_ids'])
      dataset = dataset.add_column('attention_mask', new_dataset['attention_mask'])

  return dataset

In [17]:
train_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/104562 [00:00<?, ? examples/s]

In [18]:
tokenizer.convert_ids_to_tokens(train_dataset['input_ids'][0])

['[CLS]',
 'Sure',
 ',',
 'it',
 "'",
 's',
 'ok',
 ',',
 'but',
 'I',
 'always',
 'have',
 'let',
 'the',
 'guy',
 'ask',
 'me',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

# RNN Model

In [19]:
# splitting the train dataset to use only 10% of it
train_dataset = train_dataset.train_test_split(test_size=0.9, shuffle=True, seed=42)
train_dataset = train_dataset['train']


## RNN Try 1

In [41]:
import torch
import torch.nn as nn
import torch.optim as optim

class Seq2SeqRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(Seq2SeqRNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, hidden=None):
        embedded = self.embedding(input_seq)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden
    
# Define hyperparameters
input_size = len(tokenizer.get_vocab())
embedding_size = 256
hidden_size = 512
output_size = len(tokenizer.get_vocab())

# Instantiate the model
model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    for batch in train_dataset:
        inputs = torch.tensor(batch["input_ids"]).to(device)
        targets = torch.tensor(batch["labels"]).to(device)

        optimizer.zero_grad()

        output, _ = model(inputs)

        # Reshape the output to be 2D (batch_size * sequence_length, vocab_size)
        output = output.view(-1, output_size)

        loss = criterion(output, targets.view(-1))
        loss.backward()
        optimizer.step()

        # Print the loss for monitoring training progress
        print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Save the trained model
torch.save(model.state_dict(), "seq2seq_rnn_model.pth")


Epoch: 1/1, Loss: 10.300232887268066
Epoch: 1/1, Loss: 7.758367538452148
Epoch: 1/1, Loss: 6.417394638061523
Epoch: 1/1, Loss: 3.9386544227600098
Epoch: 1/1, Loss: 2.8961384296417236
Epoch: 1/1, Loss: 3.6165552139282227
Epoch: 1/1, Loss: 2.7240357398986816
Epoch: 1/1, Loss: 3.3699467182159424
Epoch: 1/1, Loss: 1.5739232301712036
Epoch: 1/1, Loss: 3.1948723793029785
Epoch: 1/1, Loss: 2.5449304580688477
Epoch: 1/1, Loss: 5.965273857116699
Epoch: 1/1, Loss: 4.383166313171387
Epoch: 1/1, Loss: 1.2844417095184326
Epoch: 1/1, Loss: 1.9858113527297974
Epoch: 1/1, Loss: 1.3372080326080322
Epoch: 1/1, Loss: 1.279614806175232
Epoch: 1/1, Loss: 7.6645426750183105
Epoch: 1/1, Loss: 2.813234806060791
Epoch: 1/1, Loss: 1.3958736658096313
Epoch: 1/1, Loss: 4.071939945220947
Epoch: 1/1, Loss: 2.31996750831604
Epoch: 1/1, Loss: 3.0995094776153564
Epoch: 1/1, Loss: 1.5994542837142944
Epoch: 1/1, Loss: 3.138902425765991
Epoch: 1/1, Loss: 2.567776679992676
Epoch: 1/1, Loss: 1.4238203763961792
Epoch: 1/1, 

In [9]:
class Seq2SeqRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(Seq2SeqRNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, hidden=None):
        embedded = self.embedding(input_seq)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden

    # Define hyperparameters
input_size = len(tokenizer.get_vocab())
embedding_size = 256
hidden_size = 512
output_size = len(tokenizer.get_vocab())

# Instantiate the model
model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


NameError: name 'tokenizer' is not defined

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

In [8]:
# Load the model
model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)
model.load_state_dict(torch.load("seq2seq_rnn_model.pth"))
model.eval()  # Set the model to evaluation mode

# Define the input sequence
input_seq = torch.tensor(tokenizer.encode("Yo! Sidd, what;s up!!!")).unsqueeze(0)

# Predict the output
output, _ = model(input_seq)
output = torch.argmax(output, dim=2)  # Get the most probable next token

# Decode the output
decoded_output = tokenizer.decode(output[0])

print(decoded_output)

NameError: name 'input_size' is not defined