# Data Preprocessing

In [21]:
!pip install nltk
!pip install gensim
!pip install 'transformers[torch]'
!pip install datasets




In [None]:
import json
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the preprocessed data from the JSON file
data_files={
    "train":"data_train_rule_based_preprocess.json",
}

dataset = load_dataset("json", data_files=data_files)

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['transformation', 'topic', 'id'],
        num_rows: 104562
    })
})


In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Tokenize the informal sentences
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    inputs = [ex[input_field] for ex in examples["transformation"]]
    targets = [ex[target_field] for ex in examples["transformation"]]

    new_examples = tokenizer(
        inputs, text_target=targets, max_length=64, truncation=True, padding="max_length"
    )

    return new_examples

In [None]:
def create_multi_ref_dataset(dataset):
  for i, target_field in enumerate(['formal.ref0', 'formal.ref1', 'formal.ref2', 'formal.ref3']):
    new_dataset = preprocess_function(dataset, 'informal', target_field)
    dataset = dataset.add_column(f'labels_{i}', new_dataset['labels'])
    if i == 0:
      dataset = dataset.add_column('input_ids', new_dataset['input_ids'])
      dataset = dataset.add_column('token_type_ids', new_dataset['token_type_ids'])
      dataset = dataset.add_column('attention_mask', new_dataset['attention_mask'])

  return dataset

In [None]:
train_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
)

In [None]:
tokenizer.convert_ids_to_tokens(train_dataset['input_ids'][0])

['[CLS]',
 'Sure',
 ',',
 'it',
 "'",
 's',
 'ok',
 ',',
 'but',
 'I',
 'always',
 'have',
 'let',
 'the',
 'guy',
 'ask',
 'me',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

# RNN Model

## RNN Try 1

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class Seq2SeqRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(Seq2SeqRNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, hidden=None):
        embedded = self.embedding(input_seq)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden
    
# Define hyperparameters
input_size = len(tokenizer.get_vocab())
embedding_size = 256
hidden_size = 512
output_size = len(tokenizer.get_vocab())

# Instantiate the model
model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    for batch in train_dataset:
        inputs = torch.tensor(batch["input_ids"]).to(device)
        targets = torch.tensor(batch["labels"]).to(device)

        optimizer.zero_grad()

        output, _ = model(inputs)

        # Reshape the output to be 2D (batch_size * sequence_length, vocab_size)
        output = output.view(-1, output_size)

        loss = criterion(output, targets.view(-1))
        loss.backward()
        optimizer.step()

        # Print the loss for monitoring training progress
        print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Save the trained model
torch.save(model.state_dict(), "seq2seq_rnn_model.pth")


Epoch: 1/1, Loss: 10.111695289611816
Epoch: 1/1, Loss: 7.8522210121154785
Epoch: 1/1, Loss: 5.460385322570801
Epoch: 1/1, Loss: 4.175291538238525
Epoch: 1/1, Loss: 2.688713788986206
Epoch: 1/1, Loss: 3.232405424118042
Epoch: 1/1, Loss: 2.7364399433135986
Epoch: 1/1, Loss: 2.6727135181427
Epoch: 1/1, Loss: 1.2003051042556763
Epoch: 1/1, Loss: 1.693930983543396
Epoch: 1/1, Loss: 2.5487565994262695
Epoch: 1/1, Loss: 3.3033862113952637
Epoch: 1/1, Loss: 1.6789312362670898
Epoch: 1/1, Loss: 2.2341103553771973
Epoch: 1/1, Loss: 2.112222194671631
Epoch: 1/1, Loss: 1.7019405364990234
Epoch: 1/1, Loss: 2.104928970336914
Epoch: 1/1, Loss: 1.9503283500671387
Epoch: 1/1, Loss: 2.0283279418945312
Epoch: 1/1, Loss: 1.9377413988113403
Epoch: 1/1, Loss: 2.081826686859131
Epoch: 1/1, Loss: 3.506798505783081
Epoch: 1/1, Loss: 1.9307730197906494
Epoch: 1/1, Loss: 2.4637258052825928
Epoch: 1/1, Loss: 4.508030891418457
Epoch: 1/1, Loss: 2.7526891231536865
Epoch: 1/1, Loss: 1.0736217498779297
Epoch: 1/1, Lo

KeyboardInterrupt: 

RNN 3