In [None]:
# !pip install transformers datasets torch

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments, BartForConditionalGeneration

import os
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd
import json

import time
import ast
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split

In [None]:
# Run configuration -----------------------------------------------------------
# VERSION tags the output directories; isTEST is a filename prefix that selects
# the small test dataset when set to 'test-'.
VERSION = 1

# isTEST='test-'
isTEST = ''

# Longest character sequence kept per sample.
MAX_LEN = 29

# Which pretrained backbone to fine-tune.
# MODEL_NAME = "roberta"
MODEL_NAME = "bart"

# Device preference order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## TOKENIZER & DATASET

In [None]:
def load_json(file_path):
    """Return the non-blank lines of a text file, stripped of surrounding whitespace.

    Despite the name, no JSON parsing happens here: the file is read line by
    line and each non-empty line is returned as a plain string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: One entry per non-blank line, in file order.
    """
    entries = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                entries.append(cleaned)
    return entries


def load_and_convert_json(json_file):
    """Read a JSON Lines file into a pandas DataFrame.

    Every line of the file is parsed as an independent JSON document and the
    parsed objects become the rows of the returned frame.

    Args:
        json_file: Path of the file to read.

    Returns:
        pandas.DataFrame built from the list of parsed lines.
    """
    with open(json_file, 'r') as handle:
        records = list(map(json.loads, handle))
    return pd.DataFrame(records)


def convert_to_list(val):
    """Evaluate ``val`` as a Python literal and return the resulting object.

    Parsing is done with ``ast.literal_eval``, so only literal syntax is
    accepted. The result is whatever literal the text denotes; nothing here
    checks that it is actually a list.
    """
    parsed = ast.literal_eval(val)
    return parsed


def load_parallel_dataframe_apply(df, func, n_workers=4):
    """Apply ``func`` to the 'input' and 'target' columns using a thread pool.

    Args:
        df: DataFrame holding an 'input' and a 'target' column.
        func: Callable applied to each cell value.
        n_workers: Maximum number of worker threads.

    Returns:
        tuple[list, list]: Converted 'input' values and converted 'target'
        values, each in row order.
    """
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        # 'input' is fully converted before 'target' is submitted.
        converted = {
            column: list(pool.map(func, df[column]))
            for column in ('input', 'target')
        }

    return converted['input'], converted['target']


def process_json_line(line):
    """Parse one line of JSON text and return the decoded Python object."""
    decoded = json.loads(line)
    return decoded

def load_json_in_parallel(file_path, n_workers=4):
    """Load a JSON dataset file into a pandas DataFrame, parsing lines in a thread pool.

    Two layouts are supported:

    * A single JSON document on one line (for example a column-oriented
      ``{"input": {...}, "target": {...}}`` mapping). The document is handed
      to ``pd.DataFrame`` as-is.
    * JSON Lines, one record per line. All records are combined into one
      frame instead of only the first line being used.

    Blank lines are skipped.

    Args:
        file_path: Path of the JSON file to read.
        n_workers: Maximum number of worker threads used for parsing.

    Returns:
        pandas.DataFrame built from the parsed content.

    Raises:
        ValueError: If the file contains no JSON content at all.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r') as file:
        # Blank lines are not valid JSON documents, so drop them before parsing.
        lines = [line for line in file if line.strip()]

    if not lines:
        raise ValueError(f"No JSON content found in {file_path!r}")

    # Parse every remaining line; executor.map preserves file order.
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        records = list(executor.map(json.loads, lines))

    if len(records) == 1:
        # Single document: it already describes the whole table.
        return pd.DataFrame(records[0])

    # JSON Lines: each parsed line is one row.
    return pd.DataFrame(records)


In [None]:
# Character-level tokenizer
class CharLevelTokenizer:
    """Map individual characters to integer ids and back.

    A token's id is its position in the vocabulary list. Characters that are
    not in the vocabulary are encoded with the id of the ``"_"`` token.

    Attributes:
        char_vocab: Ordered list of tokens.
        char_to_id: Token -> id lookup derived from ``char_vocab``.
        id_to_char: Id -> token lookup derived from ``char_vocab``.
    """

    def __init__(self, vocab):
        """Build the tokenizer from an ordered sequence of tokens.

        Args:
            vocab: Sequence of tokens; the index of a token becomes its id.
        """
        self.char_vocab = vocab
        self._build_index()

    def _build_index(self):
        """(Re)create both lookup tables from ``self.char_vocab``."""
        self.char_to_id = {char: idx for idx, char in enumerate(self.char_vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(self.char_vocab)}

    def encode(self, text):
        """Convert an iterable of characters (a string or a list) to token ids.

        Unknown characters fall back to the id of ``"_"``.

        Raises:
            KeyError: If an unknown character is met and ``"_"`` is not in the
                vocabulary.
        """
        return [self.char_to_id[char] if char in self.char_to_id else self.char_to_id["_"] for char in text]

    def decode(self, token_ids):
        """Convert an iterable of token ids back to a single string.

        Each id is passed through ``int()`` first so that numpy integers and
        0-d torch tensors are accepted as well as plain Python ints.

        Raises:
            KeyError: If an id is outside the vocabulary.
        """
        return "".join([self.id_to_char[int(token_id)] for token_id in token_ids])

    def save_pretrained(self, save_directory):
        """Write the vocabulary to ``<save_directory>/vocab.json``.

        The directory is created if it does not exist. Only the ordered
        vocabulary is persisted; ids are derived from token order on load.
        """
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, 'vocab.json'), 'w') as f:
            json.dump(self.char_vocab, f)

    def from_pretrained(self, pretrained_directory):
        """Reload the vocabulary from ``<pretrained_directory>/vocab.json``.

        The lookup tables are rebuilt from the loaded vocabulary, so
        ``encode``/``decode`` use the new ids straight away.

        Returns:
            This tokenizer instance, to allow call chaining.
        """
        with open(os.path.join(pretrained_directory, 'vocab.json'), 'r') as f:
            self.char_vocab = json.load(f)
        # Without this the old char_to_id / id_to_char tables would stay in use.
        self._build_index()
        return self




# def char_level_data_collator(batch, MAX_LEN, device, pad_token_id):

#     input_ids = []
#     target_ids = []
#     for item in batch:
#         # Pad sequences to the max length in the batch
#         input_ids.append(item["input_ids"] + [pad_token_id] * (MAX_LEN - len(item["input_ids"])))
#         target_ids.append(item["target_ids"] + [pad_token_id] * (MAX_LEN - len(item["target_ids"])))

#     # Convert to tensors
#     return {
#         "input_ids": torch.tensor(input_ids, dtype=torch.long),
#         "target_ids": torch.tensor(target_ids, dtype=torch.long),
#     }



In [None]:
class CharLevelDataset(Dataset):
    """Paired (input, target) character sequences served as fixed-length id tensors.

    Every item is truncated to ``max_length`` and right-padded to that same
    length, so items of different original lengths can be stacked into a
    batch. Padding is skipped when ``pad_token_id`` or ``max_length`` is
    ``None``; items are then returned at their own (truncated) length.

    Args:
        inputs: Indexable collection (list or pandas Series) of input sequences.
        targets: Indexable collection of target sequences, aligned with ``inputs``.
        tokenizer: Object exposing ``encode(sequence) -> list[int]``.
        mask_token_id: Id of the mask token (stored; not used by this class).
        pad_token_id: Id used to right-pad ``input_ids``.
        max_length: Fixed length of every returned sequence.
        device: Device on which the returned tensors are created.
    """

    # Label value that torch's cross-entropy loss ignores by default, so padded
    # positions do not contribute to the training loss.
    LABEL_PAD_ID = -100

    def __init__(self, inputs, targets, tokenizer, mask_token_id, pad_token_id, max_length=29, device='cpu'):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.mask_token_id = mask_token_id
        self.pad_token_id = pad_token_id
        self.max_length = max_length
        self.device = device

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.inputs)

    def _pad(self, ids, fill_value):
        """Right-pad ``ids`` with ``fill_value`` up to ``max_length`` when padding is enabled."""
        if self.pad_token_id is None or self.max_length is None:
            return ids
        return ids + [fill_value] * (self.max_length - len(ids))

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` (1 for real tokens,
            0 for padding) and ``labels`` (padding positions set to
            ``LABEL_PAD_ID``), all ``torch.long`` tensors on ``self.device``.
        """
        # Positional access for Series, so a shuffled index (e.g. after a
        # train/test split) still maps 0..len-1 onto the rows.
        input_text = self.inputs.iloc[idx] if isinstance(self.inputs, pd.Series) else self.inputs[idx]
        target_text = self.targets.iloc[idx] if isinstance(self.targets, pd.Series) else self.targets[idx]

        # Tokenize, then truncate to max_length
        input_ids = list(self.tokenizer.encode(input_text)[:self.max_length])
        target_ids = list(self.tokenizer.encode(target_text)[:self.max_length])

        real_token_count = len(input_ids)
        input_ids = self._pad(input_ids, self.pad_token_id)
        target_ids = self._pad(target_ids, self.LABEL_PAD_ID)
        attention_mask = [1] * real_token_count + [0] * (len(input_ids) - real_token_count)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long, device=self.device),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long, device=self.device),
            "labels": torch.tensor(target_ids, dtype=torch.long, device=self.device),
        }


In [None]:

def prepare_model_for_char_vocab(model_type, char_vocab):
    """Load a pretrained backbone and resize its embeddings to the character vocabulary.

    Args:
        model_type: ``"roberta"`` (loads ``roberta-base`` as a masked LM) or
            ``"bart"`` (loads ``facebook/bart-base`` for conditional generation).
        char_vocab: Character vocabulary; its length becomes the embedding size.

    Returns:
        The loaded model with resized token embeddings.

    Raises:
        ValueError: If ``model_type`` is neither ``"roberta"`` nor ``"bart"``.
    """
    if model_type not in ("roberta", "bart"):
        raise ValueError("Unsupported model type. Choose 'roberta' or 'bart'.")

    if model_type == "roberta":
        network = RobertaForMaskedLM.from_pretrained("roberta-base")
    else:
        network = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    # Shrink/grow the token embedding matrix to one row per vocabulary entry.
    vocab_size = len(char_vocab)
    network.resize_token_embeddings(vocab_size)
    return network


In [None]:
# # Example input
# input_data = [["_", "_", "_"], ["a", "_", "_"], ["_", "a", "_"], ["_", "_", "b"], 
#               ["a", "a", "_"], ["a", "_", "b"], ["_", "a", "b"], ["a", "a", "b"]]

# # Target for prediction
# target_data = [["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"],
#                ["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"]]


In [None]:
print("\n==> LOADING DATASET")


## Load dataset [DF]
# df = pd.read_csv(f"./datasets/{isTEST}dataset.csv", sep='|') 
# print("DATASET SHAPE : ", df.shape)

# input_data  = df['input'].apply(ast.literal_eval)
# target_data = df['target'].apply(ast.literal_eval)


# # Apply parallel processing to 'input' and 'target' columns
# input_data, target_data = load_parallel_dataframe_apply(df, convert_to_list, n_workers=8)


## LOAD Dataset [json]
# Time the whole load, including the column extraction below.
start = time.time()

df = load_json_in_parallel(f"./datasets/{isTEST}dataset.json", n_workers=8)
input_data, target_data = df["input"], df["target"]
NUM_SAMPLES = len(df)  # row count; used later to size the training batch

end = time.time()

print(f"\nTIME SPENT : {end-start}")

In [None]:
# # Custom tokenizer to tokenize by lowercase characters only
# char_vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "[MASK]", "[PAD]", "_"]


# # Initialize the custom lowercase character-level tokenizer
# char_tokenizer = CharLevelTokenizer(char_vocab)
# mask_token_id = char_tokenizer.char_to_id["[MASK]"]
# pad_token_id = char_tokenizer.char_to_id["[PAD]"]



# # Split the data into training and testing sets
# train_input_data, test_input_data, train_target_data, test_target_data = train_test_split(
#     input_data, target_data, test_size=0.2, random_state=42
# )

# # Initialize the training and testing datasets
# train_dataset = CharLevelDataset(train_input_data, train_target_data, char_tokenizer, mask_token_id)
# test_dataset = CharLevelDataset(test_input_data, test_target_data, char_tokenizer, mask_token_id)

# # train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [None]:
# Vocabulary: the 26 lowercase letters followed by the special tokens.
char_vocab = list("abcdefghijklmnopqrstuvwxyz") + ["[MASK]", "[PAD]", "_"]


char_tokenizer = CharLevelTokenizer(char_vocab)
mask_token_id = char_tokenizer.char_to_id["[MASK]"]
pad_token_id = char_tokenizer.char_to_id["[PAD]"]


# 80/20 train/test split with a fixed seed
(train_input_data, test_input_data,
 train_target_data, test_target_data) = train_test_split(
    input_data, target_data, test_size=0.2, random_state=42)


# Wrap each split in a dataset
train_dataset = CharLevelDataset(
    inputs=train_input_data,
    targets=train_target_data,
    tokenizer=char_tokenizer,
    mask_token_id=mask_token_id,
    pad_token_id=pad_token_id,
    max_length=MAX_LEN,
    device=device,
)
test_dataset = CharLevelDataset(
    inputs=test_input_data,
    targets=test_target_data,
    tokenizer=char_tokenizer,
    mask_token_id=mask_token_id,
    pad_token_id=pad_token_id,
    max_length=MAX_LEN,
    device=device,
)


# Initialize dataloader
# train_dataset_loader = DataLoader(train_dataset, batch_size=64, collate_fn=char_level_data_collator)
# test_dataset_loader = DataLoader(test_dataset, batch_size=64, collate_fn=char_level_data_collator)


# for batch in train_dataset_loader:  # Correctly iterates over batches
#     print("Input IDs:", batch["input_ids"])
#     print("Target IDs:", batch["target_ids"])
#     break

## MODEL

In [None]:
# Progress banner marking the start of the model-loading stage
print("\n==> LOADING MODEL")

In [None]:
# # Load pre-trained RoBERTa model for Masked Language Modeling (MLM)
# model = RobertaForMaskedLM.from_pretrained("roberta-base")

# # Resize the model's token embeddings to match the character-level vocab size
# model.resize_token_embeddings(len(char_tokenizer.char_vocab))  # Resize for lowercase char-level tokens

# Load the pretrained backbone selected by MODEL_NAME and resize its token
# embeddings to the size of the character vocabulary.
model = prepare_model_for_char_vocab(MODEL_NAME, char_tokenizer.char_vocab)


In [None]:

# Move the model to the selected device (MPS, CUDA or CPU)
model.to(device)

# Setup the training arguments
training_args = TrainingArguments(
    output_dir=f'.models/model-{MODEL_NAME}-{VERSION}/results',
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    num_train_epochs=5,
    # A quarter of the dataset, capped at 128 and floored at 1: without the
    # floor, fewer than 4 samples would yield an invalid batch size of 0.
    per_device_train_batch_size=max(1, min(128, int(NUM_SAMPLES/4))),
    logging_dir=f'.models/model-{MODEL_NAME}-{VERSION}/logs',
    logging_steps=500,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=char_tokenizer,
    # data_collator=None,
)


# Fine-tune the model
trainer.train()


In [None]:
# Save the fine-tuned model under the versioned model directory.
# NOTE(review): this cell writes the model only; it does not call
# char_tokenizer.save_pretrained, so the vocabulary is not saved here.
model.save_pretrained(f".models/model-{MODEL_NAME}-{VERSION}/fine_tuned_{MODEL_NAME}")
print("\n==> MODEL SAVED")

## INFERENCE

In [None]:
def predict_masked_characters(input_sequence, tokenizer, model, mask_token_id):
    """Predict a full character sequence for a (partially masked) input.

    The input is encoded, run through ``model`` as a batch of one, and the
    highest-scoring token at every position is decoded back to text.

    Args:
        input_sequence: String or list of characters to complete.
        tokenizer: Object exposing ``encode`` and ``decode``.
        model: Callable returning an object with a ``logits`` attribute of
            shape (batch, sequence, vocabulary).
        mask_token_id: Accepted for interface compatibility; not used.

    Returns:
        str: The decoded prediction, one character per input position.
    """
    # Convert the input sequence to token IDs using the tokenizer
    input_ids = tokenizer.encode(input_sequence)

    # Build the batch on the device holding the model's weights, so a model
    # on GPU/MPS does not hit a device mismatch. Objects without parameters
    # fall back to CPU.
    try:
        model_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = torch.device("cpu")
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=model_device)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(input_tensor)
        logits = outputs.logits

    # Flatten rather than squeeze: a one-token prediction must stay a
    # one-element list instead of collapsing to a scalar that cannot be iterated.
    predicted_ids = torch.argmax(logits, dim=-1).reshape(-1).cpu().tolist()

    # Convert predicted IDs to characters using the tokenizer
    return tokenizer.decode(predicted_ids)

In [None]:
def loadFinetunedModel(model_name , model_path = None):
    """Load a fine-tuned model from disk and switch it to evaluation mode.

    Args:
        model_name: ``"roberta"`` or ``"bart"``; selects the model class.
        model_path: Directory the fine-tuned model was saved to.

    Returns:
        The loaded model, already in ``eval()`` mode.

    Raises:
        ValueError: If ``model_path`` is missing/empty, or ``model_name`` is
            not one of the supported types. The two cases carry distinct
            messages so the caller can tell which argument is wrong.
    """
    if not model_path:
        raise ValueError("ENTER VALID MODEL PATH!! No model path was provided.")

    if model_name == "roberta":
        model = RobertaForMaskedLM.from_pretrained(model_path)
    elif model_name == "bart":
        model = BartForConditionalGeneration.from_pretrained(model_path)
    else:
        raise ValueError("Unsupported model type. Choose 'roberta' or 'bart'.")

    # Set the model to evaluation mode for inference
    model.eval()

    return model
    

In [None]:
# Reload the fine-tuned model from the directory written by the save cell
model_path = f".models/model-{MODEL_NAME}-{VERSION}/fine_tuned_{MODEL_NAME}"
model = loadFinetunedModel(MODEL_NAME, model_path)

# Rebuild the tokenizer from the in-memory char_vocab (nothing is read from
# disk here) and look up the special-token ids again.
char_tokenizer = CharLevelTokenizer(char_vocab)
mask_token_id = char_tokenizer.char_to_id["[MASK]"]
pad_token_id = char_tokenizer.char_to_id["[PAD]"]

In [None]:
# Smoke test: "_" marks the characters the model has to fill in
print("\n==> TESTNG INFERENCE")
masked_input = ["_", "a", "_", "s"]

# Expected completion, printed next to the prediction for manual comparison
answer = "aahs"


# Call the prediction function
predicted_output = predict_masked_characters(masked_input, char_tokenizer, model, mask_token_id)

print(f"Predicted Output: {predicted_output}\nANSWER: {answer}")
