# Data Preprocessing

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# %cd /content/drive/MyDrive/MSC_Intro_to_NLP_Group_Project/

In [None]:
!pip install nltk
!pip install gensim
!pip install 'transformers[torch]'
!pip install datasets
!pip install tensorflow


In [None]:
import json
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd


In [None]:
# Read the rule-based-preprocessed training split from its JSON file.
data_files = {"train": "data_train_rule_based_preprocess.json"}
dataset = load_dataset("json", data_files=data_files)
print(dataset)

# Pretrained cased BERT tokenizer; the preprocessing function below reads it
# as a module-level name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Tokenize the informal sentences
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    """Tokenize source/target sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping whose "transformation" entry iterates over records;
            each record is indexed by ``input_field`` and ``target_field``.
        input_field: Record key holding the source sentence.
        target_field: Record key holding the target sentence.

    Returns:
        The tokenizer's output for the call: sources as the main text, targets
        as ``text_target``, requested with truncation and max-length padding
        at 64 tokens.
    """
    sources = []
    for record in examples["transformation"]:
        sources.append(record[input_field])

    references = []
    for record in examples["transformation"]:
        references.append(record[target_field])

    return tokenizer(
        sources,
        text_target=references,
        truncation=True,
        padding="max_length",
        max_length=64,
    )

def create_multi_ref_dataset(dataset):
    """Add tokenized labels for every reference field as extra dataset columns.

    For each target field (the four formal references plus the rule-based
    output) the informal sentences are tokenized against that target and the
    resulting labels are stored as ``labels_<i>``.

    Args:
        dataset: Object accepted by ``preprocess_function`` that also provides
            ``add_column(name, values)`` returning the extended dataset.

    Returns:
        The dataset with ``labels_0`` .. ``labels_4`` plus ``input_ids``,
        ``token_type_ids`` and ``attention_mask`` columns.
    """
    target_fields = (
        'formal.ref0',
        'formal.ref1',
        'formal.ref2',
        'formal.ref3',
        'rule_based_preprocessed',
    )
    for position, field in enumerate(target_fields):
        encoded = preprocess_function(dataset, 'informal', field)
        dataset = dataset.add_column(f'labels_{position}', encoded['labels'])
        if position != 0:
            continue
        # The source side is always the 'informal' field, so its encodings are
        # stored only once, on the first pass.
        for column in ('input_ids', 'token_type_ids', 'attention_mask'):
            dataset = dataset.add_column(column, encoded[column])

    return dataset

# Tokenize the training split batch-by-batch with the default fields
# (informal -> formal.ref0).
train_dataset = dataset['train'].map(preprocess_function, batched=True)

# tokenizer.convert_ids_to_tokens(train_dataset['input_ids'][0])


In [None]:
# Show the token ids of the first example. Indexing the row first reads a
# single record instead of materializing the entire 'input_ids' column.
print(train_dataset[0]['input_ids'])

GPU check

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# torch.cuda.get_device_name() only accepts CUDA devices and raises for a CPU
# device, so the name is queried only when CUDA was actually selected.
gpu_name = torch.cuda.get_device_name(device) if device.type == "cuda" else None
print(gpu_name if gpu_name is not None else "No GPU available; running on CPU")

# BiRNN

In [None]:
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense


In [None]:
# Load the pre-trained GloVe Twitter vectors through gensim's downloader.
glove_model = api.load("glove-twitter-25")
# Tokenizer data for word_tokenize: older NLTK releases read 'punkt', recent
# releases look up 'punkt_tab' instead, so fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [None]:
# Load the preprocessed training data from its JSON file for the BiRNN experiments.
data_files = {"train": "data_train_rule_based_preprocess.json"}
dataset = load_dataset("json", data_files=data_files)


In [None]:
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    """Turn informal/formal sentence pairs into sequences of GloVe vectors.

    Each sentence is split with ``word_tokenize``; non-alphabetic tokens are
    dropped, the rest are lower-cased, English stopwords are removed, and every
    remaining word present in ``glove_model`` is replaced by its vector (words
    missing from the model are skipped).

    Args:
        examples: Mapping whose "transformation" entry iterates over records
            indexed by ``input_field`` and ``target_field``.
        input_field: Record key holding the source sentence.
        target_field: Record key holding the target sentence.

    Returns:
        dict with "input_sequences" and "target_sequences": one (possibly
        empty) list of vectors per record, in record order.
    """
    # Build the stopword set once per call. Evaluating stopwords.words() inside
    # the token filter would fetch the list again, and scan it linearly, for
    # every single token.
    english_stopwords = set(stopwords.words("english"))

    def embed(sentence):
        """Tokenize, filter and embed one sentence."""
        words = (token.lower() for token in word_tokenize(sentence) if token.isalpha())
        return [
            glove_model[word]
            for word in words
            if word not in english_stopwords and word in glove_model
        ]

    input_sequences = []
    target_sequences = []
    for ex in examples["transformation"]:
        input_sequences.append(embed(ex[input_field]))
        target_sequences.append(embed(ex[target_field]))

    return {"input_sequences": input_sequences, "target_sequences": target_sequences}

    # Process the data
# Embed every training pair (preprocess_function runs batch-by-batch).
train_dataset = dataset["train"].map(
    preprocess_function,
    batched=True,
)

input_sequences = train_dataset["input_sequences"]
target_sequences = train_dataset["target_sequences"]

# Pad inputs and targets to ONE shared length. The training loop compares the
# model output with the targets position by position (MSE), which requires the
# same number of time steps on both sides; padding each side to its own maximum
# would leave the two arrays with different sequence lengths.
max_len = max(
    max((len(seq) for seq in input_sequences), default=0),
    max((len(seq) for seq in target_sequences), default=0),
)
X_padded = pad_sequences(input_sequences, maxlen=max_len, padding='post', dtype='float32')
Y_padded = pad_sequences(target_sequences, maxlen=max_len, padding='post', dtype='float32')

# Reshape X_padded and Y_padded to be 3D tensors (batch_size, sequence_length, input_size)
X_padded = X_padded.reshape(X_padded.shape[0], X_padded.shape[1], -1)
Y_padded = Y_padded.reshape(Y_padded.shape[0], Y_padded.shape[1], -1)



In [None]:
# Number of time steps in the first padded input sample.
print(len(X_padded[0]))

In [None]:
# Number of time steps in the first padded target sample.
print(len(Y_padded[0]))

Developing and training the model using PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np


In [None]:
# Define the Bidirectional RNN model
class BiRNN(nn.Module):
    """Bidirectional RNN mapping a sequence of dense vectors to a sequence of vectors.

    The inputs are real-valued feature vectors, so they are projected with a
    linear layer. An embedding(-bag) lookup only accepts integer ids and would
    reject float input; its sparse gradients are also not supported by Adam.

    Args:
        input_size: Number of features per input time step.
        hidden_size: Width of the projection and of each RNN direction.
        output_size: Number of features produced per time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(BiRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Dense projection of the input vectors; the attribute keeps the name
        # `embedding` so existing references to it still resolve.
        self.embedding = nn.Linear(input_size, hidden_size)
        self.birnn = nn.RNN(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Run the network on a batch of sequences.

        Args:
            x: Float tensor of shape (batch, seq_len, input_size). A 2-D tensor
                is interpreted as a flattened batch (batch, seq_len * input_size)
                and is unflattened first.

        Returns:
            Tensor of shape (batch, seq_len, output_size).
        """
        if x.dim() == 2:
            # Restore the time dimension that a caller flattened away.
            x = x.reshape(x.size(0), -1, self.input_size)
        embedded = self.embedding(x)
        output, _ = self.birnn(embedded)
        output = self.fc(output)
        return output


In [None]:
# Wrap the padded NumPy arrays as PyTorch tensors.
X_tensor, Y_tensor = torch.from_numpy(X_padded), torch.from_numpy(Y_padded)


In [None]:
# Pair every input with its target and serve them in shuffled mini-batches.
batch_size = 32
train_dataset = TensorDataset(X_tensor, Y_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Model dimensions: feature sizes come from the last axis of the padded arrays.
hidden_size = 64
input_size, output_size = X_padded.shape[-1], Y_padded.shape[-1]

model = BiRNN(input_size, hidden_size, output_size)

# Mean-squared-error objective, optimized with Adam.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()


In [None]:
# Training loop: a fixed number of passes over train_loader, reporting the
# mean batch loss after each pass.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    batch_losses = []

    for batch_X, batch_Y in train_loader:
        optimizer.zero_grad()

        # Collapse everything after the batch dimension before the forward pass.
        flat_X = batch_X.view(batch_X.size(0), -1)

        loss = criterion(model(flat_X), batch_Y)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    avg_loss = sum(batch_losses) / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}')


# Another Try

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Input

# Define the BiRNN model
model = Sequential()
# Declare the input shape with an explicit Input layer. Embedding's
# `input_length` argument is deprecated (and ignored) in Keras 3, which would
# leave the model unbuilt and make summary() report no shapes or parameters.
model.add(Input(shape=(X_padded.shape[1],)))
# NOTE(review): an Embedding layer expects integer word ids, while X_padded
# holds float32 GloVe vectors -- confirm the intended input before fitting.
model.add(Embedding(input_dim=len(glove_model.index_to_key), output_dim=25))
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(Dense(25, activation='linear'))  # Adjust the activation based on your problem

# Compile the model
model.compile(optimizer='adam', loss='mse')  # Adjust the loss based on your problem

# Display the model summary
model.summary()


# Trying with proper tokenization

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence


# Load the preprocessed training data from its JSON file and show its structure.
data_files = {"train": "data_train_rule_based_preprocess.json"}
dataset = load_dataset("json", data_files=data_files)
print(dataset)

# Tokenizer for informal sentences
def tokenize_informal(sentence):
    """Lower-case an informal sentence and split it into word tokens."""
    lowered = sentence.lower()
    tokens = word_tokenize(lowered)
    return tokens

# Tokenizer for formal sentences
def tokenize_formal(sentence):
    """Lower-case a formal sentence and split it into word tokens."""
    lowered = sentence.lower()
    tokens = word_tokenize(lowered)
    return tokens

# Tokenize the informal and formal sentences
def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
    """Tokenize the source and target sentence of every record.

    Args:
        examples: Mapping whose "transformation" entry iterates over records
            indexed by ``input_field`` and ``target_field``.
        input_field: Record key holding the informal sentence.
        target_field: Record key holding the formal sentence.

    Returns:
        Tuple ``(inputs, targets)`` of two lists of token lists, in record order.
    """
    tokenized_inputs = []
    for record in examples["transformation"]:
        tokenized_inputs.append(tokenize_informal(record[input_field]))

    tokenized_targets = []
    for record in examples["transformation"]:
        tokenized_targets.append(tokenize_formal(record[target_field]))

    return tokenized_inputs, tokenized_targets

# Tokenize the whole training split with the default fields (informal -> formal.ref0).
train_inputs, train_targets = preprocess_function(dataset['train'])


# Convert tokens to indices
vocab = set(word for sentence in train_inputs + train_targets for word in sentence)
# Number the words in sorted order. Iterating the raw set follows string hash
# order, which Python randomizes per process, so ids would differ between runs
# and saved model weights could not be matched to a vocabulary afterwards.
word_to_index = {word: index + 2 for index, word in enumerate(sorted(vocab))}
# Ids 0 and 1 are reserved for padding and for out-of-vocabulary words.
word_to_index['<pad>'] = 0
word_to_index['<unk>'] = 1
index_to_word = {index: word for word, index in word_to_index.items()}

def sentence_to_indices(sentence, word_to_index):
    """Map every word of ``sentence`` to its id, using the '<unk>' id for unknown words.

    Args:
        sentence: Iterable of word tokens.
        word_to_index: Mapping from word to integer id; must contain '<unk>'
            whenever ``sentence`` is non-empty.

    Returns:
        List of ids, one per word, in order.
    """
    indices = []
    for word in sentence:
        fallback = word_to_index['<unk>']
        indices.append(word_to_index.get(word, fallback))
    return indices

# One id tensor per tokenized sentence, for the inputs and for the targets.
train_inputs_indices = [
    torch.tensor(sentence_to_indices(tokens, word_to_index))
    for tokens in train_inputs
]
train_targets_indices = [
    torch.tensor(sentence_to_indices(tokens, word_to_index))
    for tokens in train_targets
]

# Create PyTorch Dataset and DataLoader
class CustomDataset(Dataset):
    """Index-aligned pairs of input and target sequences.

    Args:
        inputs: Indexable collection of input items.
        targets: Indexable collection of target items, aligned with ``inputs``.
    """

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        """Number of pairs, taken from the inputs."""
        size = len(self.inputs)
        return size

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        source = self.inputs[idx]
        target = self.targets[idx]
        return source, target

# Update the data preparation: convert every tokenized sentence to an id tensor
# (the same conversion already performed earlier in this cell).
train_inputs_indices = [
    torch.tensor(sentence_to_indices(tokens, word_to_index))
    for tokens in train_inputs
]
train_targets_indices = [
    torch.tensor(sentence_to_indices(tokens, word_to_index))
    for tokens in train_targets
]

# Create PyTorch Dataset and DataLoader
train_dataset = CustomDataset(train_inputs_indices, train_targets_indices)


def collate_fn(batch):
    """Pad the variable-length id tensors of one batch.

    Args:
        batch: List of ``(input_ids, target_ids)`` tensor pairs.

    Returns:
        Tuple ``(inputs_padded, targets_padded)``; each side is padded to the
        longest sequence of that side within the batch. pad_sequence fills
        with 0, which is the '<pad>' id in ``word_to_index``.
    """
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True)
    targets_padded = pad_sequence(targets, batch_first=True)
    return inputs_padded, targets_padded


# Build the loader for THIS dataset. Without it, the training cell that follows
# would pick up whatever `train_loader` an earlier experiment left in the
# kernel (batches of float GloVe tensors rather than token ids).
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Bi-directional LSTM model
class BiLSTMModel(nn.Module):
    """Bidirectional LSTM that predicts one output token per input position.

    Args:
        vocab_size: Number of rows in the embedding table (input ids must be
            smaller than this).
        embedding_dim: Size of each token embedding.
        hidden_size: Hidden size of each LSTM direction.
        output_size: Number of classes scored at every position.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Score every position of a batch of id sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len).

        Returns:
            Logits of shape (batch, seq_len, output_size). Every time step is
            scored: the targets are whole token sequences, so keeping only the
            last step could never line up with them.
        """
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output)
        return output

# Hyperparameters
vocab_size = len(word_to_index)
embedding_dim = 50
hidden_size = 64
output_size = vocab_size
pad_index = word_to_index['<pad>']

# Initialize the model, loss function, and optimizer
model = BiLSTMModel(vocab_size, embedding_dim, hidden_size, output_size)
# Padded target positions carry no information, so they are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    # Assumes train_loader yields (inputs, targets) batches of padded token ids
    # of shape (batch, length), each side padded on its own.
    for inputs, targets in train_loader:
        # The model emits one prediction per input position, so bring inputs
        # and targets to a shared length by right-padding the shorter side.
        seq_len = max(inputs.size(1), targets.size(1))
        inputs = nn.functional.pad(inputs, (0, seq_len - inputs.size(1)), value=pad_index)
        targets = nn.functional.pad(targets, (0, seq_len - targets.size(1)), value=pad_index)
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)  # (batch, seq_len, output_size)
        # CrossEntropyLoss wants (N, classes) logits against (N,) class ids.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")

# Save the trained model if needed
torch.save(model.state_dict(), "bilstm_model.pth")


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the raw records from the JSON file.
with open("data_train_rule_based_preprocess.json", "r") as file:
    data = json.load(file)

# One row per record; the nested "transformation" entry is exposed as column "data".
df = pd.DataFrame(data).rename(columns={"transformation": "data"})

# Pull each sentence variant out of the nested entry into its own column,
# then keep only those columns.
sentence_fields = ["informal", "formal.ref0", "rule_based_preprocessed"]
for field in sentence_fields:
    df[field] = df["data"].apply(lambda record, key=field: record[key])
df = df[sentence_fields]

# Preview the first rows.
print(df.head())



In [None]:
# Tokenization helper
def tokenize_sentence(sentence):
    """Split ``sentence`` into word tokens with ``word_tokenize`` (case is kept)."""
    tokens = word_tokenize(sentence)
    return tokens

# Tokenize both sentence columns, storing the token lists in new columns.
df['informal_tokens'] = df['informal'].map(tokenize_sentence)
df['formal_tokens'] = df['formal.ref0'].map(tokenize_sentence)

# Split the data into training and testing sets
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define a PyTorch Dataset
class FormalDataset(Dataset):
    """Serve (informal, formal) token sequences from a DataFrame as id tensors.

    The token columns hold strings, which ``torch.tensor`` cannot convert, so
    every token is first mapped to an integer id through ``word_to_index``.

    Args:
        dataframe: Frame with 'informal_tokens' and 'formal_tokens' columns
            whose cells are sequences of string tokens.
        word_to_index: Optional token -> id mapping to reuse. When omitted, a
            vocabulary is built from both token columns of ``dataframe``.
        max_vocab_size: Exclusive upper bound on the ids produced when the
            vocabulary is built here; rarer tokens share the unknown id. The
            default equals the ``vocab_size`` this notebook gives the model,
            so ids always fit its embedding table. ``None`` disables the cap.
    """

    PAD_INDEX = 0  # reserved; never assigned to a real token
    UNK_INDEX = 1  # shared id for tokens missing from the vocabulary

    def __init__(self, dataframe, word_to_index=None, max_vocab_size=30522):
        self.data = dataframe
        if word_to_index is None:
            word_to_index = self._build_vocab(dataframe, max_vocab_size)
        self.word_to_index = word_to_index

    @staticmethod
    def _build_vocab(dataframe, max_vocab_size):
        """Count tokens in both columns and number them by descending frequency."""
        counts = {}
        for column in ('informal_tokens', 'formal_tokens'):
            for tokens in dataframe[column]:
                for token in tokens:
                    counts[token] = counts.get(token, 0) + 1
        # Ties are broken alphabetically so the mapping is identical on every run.
        ranked = sorted(counts, key=lambda token: (-counts[token], token))
        if max_vocab_size is not None:
            # Two ids are reserved (padding, unknown).
            ranked = ranked[:max(max_vocab_size - 2, 0)]
        return {token: index + 2 for index, token in enumerate(ranked)}

    def _encode(self, tokens):
        """Convert a sequence of string tokens to a 1-D LongTensor of ids."""
        unknown = self.word_to_index.get('<unk>', self.UNK_INDEX)
        ids = [self.word_to_index.get(token, unknown) for token in tokens]
        return torch.tensor(ids, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the id tensors of row ``idx`` under the original dict keys."""
        row = self.data.iloc[idx]
        return {
            'informal_tokens': self._encode(row['informal_tokens']),
            'formal_tokens': self._encode(row['formal_tokens'])
        }

# Wrap the full DataFrame in a dataset and serve it one example at a time,
# reshuffled on every pass.
train_dataset = FormalDataset(df)
# test_dataset = FormalDataset(test_df)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)
# test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define the Bi-directional RNN model
class BiRNNModel(nn.Module):
    """Embedding -> bidirectional GRU -> linear layer scoring the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embedding_dim: Size of each token embedding.
        hidden_size: Hidden size of each GRU direction.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.birnn = nn.GRU(embedding_dim, hidden_size, bidirectional=True)
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * hidden_size, vocab_size)

    def forward(self, x):
        """Return vocabulary logits for every position of the id tensor ``x``."""
        hidden_states = self.birnn(self.embedding(x))[0]
        return self.fc(hidden_states)

# Training configuration
num_epochs = 10
learning_rate = 0.001

# Model dimensions
vocab_size = 30522
embedding_dim = 300
hidden_size = 256

# Build the model, then its optimizer and the token-level classification loss.
model = BiRNNModel(vocab_size, embedding_dim, hidden_size)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()



In [None]:
# Iterate through batches of data for training
for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        informal_tokens = batch['informal_tokens'].squeeze(0)  # Remove the batch dimension
        formal_tokens = batch['formal_tokens'].squeeze(0)  # Remove the batch dimension

        # Forward pass
        outputs = model(informal_tokens)

        # The model emits one prediction per informal token, while the formal
        # sentence may have a different number of tokens. The loss needs one
        # label per prediction, so only the overlapping prefix is scored.
        logits = outputs.view(-1, vocab_size)
        labels = formal_tokens.view(-1)
        common_len = min(logits.size(0), labels.size(0))
        if common_len == 0:
            # Nothing to score (an empty sentence); a loss over zero elements is NaN.
            continue

        # Compute the loss
        loss = criterion(logits[:common_len], labels[:common_len])

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Print the mean training loss of the epoch (not just the last batch's loss)
    avg_loss = epoch_loss / max(num_batches, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss}')


# Try again

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

# Location of the preprocessed training data.
data_files = {"train": "data_train_rule_based_preprocess.json"}

# Parse the whole JSON document into memory.
with open(data_files["train"], "r") as json_file:
    data = json.load(json_file)

# Tokenize the informal sentences
def tokenize_sentences(sentences):
    """Lower-case and word-tokenize every sentence.

    Args:
        sentences: Iterable of strings.

    Returns:
        List of token lists, one per sentence, in order.
    """
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentences.append(word_tokenize(sentence.lower()))
    return tokenized_sentences

# Collect the informal sentences and their rule-based rewrites from the records.
informal_sentences = [entry["transformation"]["informal"] for entry in data]
preprocessed_sentences = [entry["transformation"]["rule_based_preprocessed"] for entry in data]

# Tokenize both collections.
tokenized_informal = tokenize_sentences(informal_sentences)
tokenized_preprocessed = tokenize_sentences(preprocessed_sentences)

# Token counts per sentence, reused for the maximum and the average below.
informal_lengths = [len(sentence) for sentence in tokenized_informal]
preprocessed_lengths = [len(sentence) for sentence in tokenized_preprocessed]

# Longest sequence on each side.
max_informal_seq_length = max(informal_lengths)
max_preprocessed_seq_length = max(preprocessed_lengths)
print("Max Informal: ", max_informal_seq_length)
print("Max Preprocessed: ", max_preprocessed_seq_length)

# Average sequence length on each side, truncated to an integer.
avg_informal_seq_length = int(np.mean(informal_lengths))
avg_preprocessed_seq_length = int(np.mean(preprocessed_lengths))
print("Avg Informal: ", avg_informal_seq_length)
print("Avg Preprocessed: ", avg_preprocessed_seq_length)

# Create a vocabulary mapping words to indices. Index 0 is reserved for
# padding, so real words are numbered from 1; otherwise one word would share
# its id with the padding value. Sorting keeps the mapping identical across runs.
word2index = {
    word: index
    for index, word in enumerate(
        sorted({word for sentence in tokenized_informal for word in sentence}),
        start=1,
    )
}

# Convert sentences to indices and pad every sequence with 0 up to the longest
# informal sentence.
max_seq_length = max_informal_seq_length
indexed_sentences = [
    [word2index[word] for word in sentence] + [0] * (max_seq_length - len(sentence))
    for sentence in tokenized_informal
]
# Convert sentences to indices and pad sequences to the average length
# indexed_sentences = [[word2index[word] for word in sentence] + [0] * (avg_seq_length - len(sentence)) for sentence in tokenized_informal]



In [None]:
# Length of the first index sequence after padding.
print(len(indexed_sentences[0]))

In [None]:
# Define a PyTorch Dataset class
class FormalDataset(Dataset):
    """Pairs each indexed informal sentence with its formal reference.

    Args:
        indexed_sentences: Indexable collection of id sequences.
        formal_sentences: Indexable collection of formal sentences, aligned
            with ``indexed_sentences``.
    """

    def __init__(self, indexed_sentences, formal_sentences):
        self.indexed_sentences, self.formal_sentences = indexed_sentences, formal_sentences

    def __len__(self):
        """Number of examples, taken from the indexed sentences."""
        return len(self.indexed_sentences)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a two-key dict."""
        return dict(
            indexed_sentence=self.indexed_sentences[idx],
            formal_sentence=self.formal_sentences[idx],
        )
# Create a PyTorch Dataset from the padded id sequences and the first formal reference.
formal_references = [entry["transformation"]["formal.ref0"] for entry in data]
formal_dataset = FormalDataset(indexed_sentences, formal_references)


In [None]:
# Create a PyTorch Dataset
formal_dataset = FormalDataset(indexed_sentences, [entry["transformation"]["formal.ref0"] for entry in data])


def _collate_batch(batch):
    """Stack the id sequences of a batch (zero-padded) and list the formal sentences."""
    id_tensors = [torch.tensor(item["indexed_sentence"]) for item in batch]
    return {
        "indexed_sentence": torch.nn.utils.rnn.pad_sequence(
            id_tensors,
            batch_first=True,
            padding_value=0,
        ),
        "formal_sentence": [item["formal_sentence"] for item in batch],
    }


# Sample DataLoader with collate_fn for dynamic padding
dataloader = DataLoader(
    formal_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=_collate_batch,
)


In [None]:
# Example of using DataLoader: print every batch, then its two parts.
for batch in dataloader:
    indexed_sentences_batch, formal_sentences_batch = (
        batch["indexed_sentence"],
        batch["formal_sentence"],
    )
    for shown in (batch, indexed_sentences_batch, formal_sentences_batch):
        print(shown)
