# Data Preprocessing

In [31]:
!pip install nltk
!pip install gensim
!pip install 'transformers[torch]'
!pip install datasets




In [32]:
import json
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

from datasets import load_dataset

In [33]:
# Load the preprocessed data from the JSON file
data_files={
    "train":"data_train_rule_based_preprocess.json",
}

dataset = load_dataset("json", data_files=data_files)

In [34]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['transformation', 'topic', 'id'],
        num_rows: 104562
    })
})


In [35]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [36]:
# Tokenize the informal sentences
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    inputs = [ex[input_field] for ex in examples["transformation"]]
    targets = [ex[target_field] for ex in examples["transformation"]]

    new_examples = tokenizer(
        inputs, text_target=targets, max_length=64, truncation=True, padding="max_length"
    )

    return new_examples

In [37]:
def create_multi_ref_dataset(dataset):
  for i, target_field in enumerate(['formal.ref0', 'formal.ref1', 'formal.ref2', 'formal.ref3']):
    new_dataset = preprocess_function(dataset, 'informal', target_field)
    dataset = dataset.add_column(f'labels_{i}', new_dataset['labels'])
    if i == 0:
      dataset = dataset.add_column('input_ids', new_dataset['input_ids'])
      dataset = dataset.add_column('token_type_ids', new_dataset['token_type_ids'])
      dataset = dataset.add_column('attention_mask', new_dataset['attention_mask'])

  return dataset

In [38]:
train_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
)

In [39]:
tokenizer.convert_ids_to_tokens(train_dataset['input_ids'][0])

# RNN Model

In [None]:
# splitting the train dataset to use only 10% of it
train_dataset = train_dataset.train_test_split(test_size=0.9, shuffle=True, seed=42)
train_dataset = train_dataset['train']


## RNN Try 1

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class Seq2SeqRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(Seq2SeqRNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq, hidden=None):
        embedded = self.embedding(input_seq)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden
    
# Define hyperparameters
input_size = len(tokenizer.get_vocab())
embedding_size = 256
hidden_size = 512
output_size = len(tokenizer.get_vocab())

# Instantiate the model
model = Seq2SeqRNN(input_size, embedding_size, hidden_size, output_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    for batch in train_dataset:
        inputs = torch.tensor(batch["input_ids"]).to(device)
        targets = torch.tensor(batch["labels"]).to(device)

        optimizer.zero_grad()

        output, _ = model(inputs)

        # Reshape the output to be 2D (batch_size * sequence_length, vocab_size)
        output = output.view(-1, output_size)

        loss = criterion(output, targets.view(-1))
        loss.backward()
        optimizer.step()

        # Print the loss for monitoring training progress
        print(f"Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Save the trained model
torch.save(model.state_dict(), "seq2seq_rnn_model.pth")


Epoch: 1/1, Loss: 10.47869873046875
Epoch: 1/1, Loss: 8.450996398925781
Epoch: 1/1, Loss: 6.320003986358643
Epoch: 1/1, Loss: 4.943516254425049
Epoch: 1/1, Loss: 3.1964590549468994
Epoch: 1/1, Loss: 3.413167715072632
Epoch: 1/1, Loss: 2.8473544120788574
Epoch: 1/1, Loss: 2.683708429336548
Epoch: 1/1, Loss: 1.3199167251586914
Epoch: 1/1, Loss: 1.603636384010315
Epoch: 1/1, Loss: 2.589775800704956
Epoch: 1/1, Loss: 3.302884817123413
Epoch: 1/1, Loss: 1.9691418409347534
Epoch: 1/1, Loss: 2.619746685028076
Epoch: 1/1, Loss: 2.4253463745117188
Epoch: 1/1, Loss: 2.0646462440490723
Epoch: 1/1, Loss: 2.5758752822875977
Epoch: 1/1, Loss: 2.5621070861816406
Epoch: 1/1, Loss: 2.461064338684082
Epoch: 1/1, Loss: 1.97645103931427
Epoch: 1/1, Loss: 2.3240647315979004
Epoch: 1/1, Loss: 3.5956146717071533
Epoch: 1/1, Loss: 1.9071331024169922
Epoch: 1/1, Loss: 2.5535507202148438
Epoch: 1/1, Loss: 4.3478102684021
Epoch: 1/1, Loss: 3.0635390281677246
Epoch: 1/1, Loss: 1.4705787897109985
Epoch: 1/1, Loss:

KeyboardInterrupt: 