In [1]:
# !pip install transformers datasets torch

In [2]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import time
import sys
import ast
from concurrent.futures import ThreadPoolExecutor


In [None]:
# Index of the GPU to expose to this process; written into CUDA_VISIBLE_DEVICES
# by the environment-setup cell when isGPU is true.
GPU_INDEX=2
# GPU_INDEX = "2,4,5"


# Gates the GPU-specific environment variables and the CUDA cache reset.
isGPU = True


# Version suffix used in the ./models/model-... output, log and save paths.
VERSION=0


# Prefix of the dataset file name ("./datasets/{isTEST}dataset.json").
# A non-empty value also selects the shorter logging interval during training.
isTEST='test-'
# isTEST=''


# Upper bound for the per-device training batch size, and number of training epochs.
BATCH_SIZE=256 
EPOCHES=15

# Backbone passed to prepare_model_for_char_vocab / loadFinetunedModel
# ("roberta" or "bart").
MODEL_NAME = "roberta"
# MODEL_NAME = "bart"


# Passed as max_length to CharLevelDataset (truncation limit) and as max_len to
# predict_masked_characters (padding target).
MAX_LEN = 31 
PAD_TOKEN = '[PAD]'
MASK_TOKEN = '_'
BOS_TOKEN = "[START]"
EOS_TOKEN = "[END]"

# Define special tokens
# Mapping from role name to token string; every token here must also be present in char_vocab.
SPECIAL_TOKENS = {"pad_token": PAD_TOKEN, "bos_token": BOS_TOKEN, "eos_token":EOS_TOKEN, 'mask_token':MASK_TOKEN }


In [None]:
# HF model
# HF_API_TOKEN = os.getenv("HF_API_TOKEN")


# These variables are set before torch/transformers are imported in the next
# cell, so keep this cell ahead of those imports.
if isGPU:
    # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" ## to avoid Context Switching 
    # Hugging Face cache directory.
    # NOTE(review): absolute, machine-specific path - confirm it exists on the host running this.
    os.environ["HF_HOME"]= "/data2/meithnav/.hfcache/"
    os.environ["CUDA_VISIBLE_DEVICES"]=str(GPU_INDEX) # restrict CUDA to the GPU(s) named by GPU_INDEX
    # Turns off the Weights & Biases integration; the Trainer output below reports
    # this variable as deprecated in favour of `report_to`.
    os.environ["WANDB_DISABLED"] = "true"


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForMaskedLM, Trainer, TrainingArguments, BartForConditionalGeneration, DefaultDataCollator

if isGPU:
    # torch.cuda.set_device(0) ## setgpu
    # print("\n\n--> CONNECTED TO GPU NO: ", torch.cuda.current_device())
    print("--> GPU_INDEX: ", GPU_INDEX)

    # Only touch the CUDA allocator when a CUDA device is actually usable, so the
    # notebook also runs on MPS/CPU machines with isGPU left on.
    if torch.cuda.is_available():
        torch.cuda.empty_cache() # clear GPU cache
        # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats()
        # is its documented replacement.
        torch.cuda.reset_peak_memory_stats()


# GPU (MPS for Apple Silicon, CUDA for Nvidia GPUs, or CPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")

# APPEND ROOT DIRECTORY
# os.path.dirname('hangman') is the empty string, so this resolves to the parent
# of the current working directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('hangman'), '..')))


print(os.getcwd())

  from .autonotebook import tqdm as notebook_tqdm


--> GPU_INDEX:  7
/data2/meithnav/projects/hangman




## TOKENIZER & DATASET

In [6]:
def load_json(file_path):
    """Return the non-blank lines of a text file, each stripped of surrounding whitespace.

    Despite the name, the lines are returned as plain strings; nothing is
    parsed as JSON here.
    """
    vocabulary = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            # Lines that are empty after stripping are dropped.
            if cleaned:
                vocabulary.append(cleaned)
    return vocabulary


def load_and_convert_json(json_file):
    """Parse a file holding one JSON document per line into a pandas DataFrame.

    Every line is passed to json.loads and the parsed objects become the rows
    of the returned frame.
    """
    records = []
    with open(json_file, 'r') as stream:
        for raw in stream:
            records.append(json.loads(raw))

    return pd.DataFrame(records)


def convert_to_list(val):
    """Evaluate a Python-literal string (e.g. "['a', '_']") and return the resulting object."""
    parsed = ast.literal_eval(val)
    return parsed


# Function to apply the conversion using multithreading
def load_parallel_dataframe_apply(df, func, n_workers=4):
    """Apply `func` to each value of df['input'] and df['target'] using a thread pool.

    Returns a pair (input_data, target_data) of lists, in the columns' row order.
    The 'input' column is fully processed before 'target' is started.
    """
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        converted = [list(pool.map(func, df[column])) for column in ('input', 'target')]

    input_data, target_data = converted
    return input_data, target_data


# Function to process a single line of JSON data and convert it to a dictionary
def process_json_line(line):
    """Parse one JSON-encoded line and return the decoded Python object."""
    parsed = json.loads(line)
    return parsed

# Function to load the JSON data in parallel
def load_json_in_parallel(file_path, n_workers=4):
    """Load a line-oriented JSON file and build a DataFrame from its first record.

    Args:
        file_path: Path of the file; each non-blank line must be a JSON document.
        n_workers: Number of threads used to parse the lines.

    Returns:
        pandas.DataFrame built from the first parsed record.

    Raises:
        ValueError: If the file contains no non-blank lines (json.JSONDecodeError,
            raised for a malformed line, is a ValueError subclass as well).
    """
    print(f"\n  =>LOADING {file_path}")
    with open(file_path, 'r') as file:
        # Blank lines (for example a trailing empty line) are not valid JSON
        # documents and would make json.loads fail, so they are skipped.
        lines = [line for line in file if line.strip()]

    if not lines:
        raise ValueError(f"No JSON records found in {file_path}")

    # Use ThreadPoolExecutor to parallelize the JSON parsing
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        data = list(executor.map(json.loads, lines))

    # NOTE(review): only the first record feeds the DataFrame; any further lines
    # are parsed but unused. Presumably the file stores the whole dataset as one
    # column-oriented JSON object on a single line - confirm against the writer.
    df = pd.DataFrame(data[0])

    return df


In [None]:
# Initialize tokenizer 
class CharLevelTokenizer:
    """Minimal tokenizer that maps each vocabulary entry to its position in the vocab list.

    Inputs to encode() are iterables of vocabulary entries (single characters
    or special-token strings); ids are the indices of those entries in `vocab`.
    """

    # Tokens decode() has always stripped; kept as a fallback so decoding behaves
    # the same when `special_tokens` does not list them.
    _DEFAULT_IGNORED_TOKENS = ('[PAD]', '_', '[START]', '[END]')

    def __init__(self, vocab, special_tokens):
        """Store the special-token mapping and build both lookup tables from `vocab`."""
        self.special_tokens = special_tokens
        self._set_vocab(vocab)

    def _set_vocab(self, vocab):
        """Install `vocab` and rebuild the token->id and id->token tables together."""
        self.char_vocab = vocab
        self.char_to_id = {char: idx for idx, char in enumerate(vocab)}
        self.id_to_char = {idx: char for idx, char in enumerate(vocab)}

    def _ignored_ids(self):
        """Return the ids of all special tokens that are present in the vocabulary."""
        ignored_tokens = set(self._DEFAULT_IGNORED_TOKENS)
        if isinstance(self.special_tokens, dict):
            ignored_tokens.update(self.special_tokens.values())
        # Tokens missing from the vocabulary can never be produced, so skip them
        # instead of failing with a KeyError.
        return {self.char_to_id[token] for token in ignored_tokens if token in self.char_to_id}

    def encode(self, text):
        """Convert an iterable of tokens to ids; raises KeyError for an unknown token."""
        return [self.char_to_id[char] for char in text]

    def decode(self, token_ids):
        """Join the characters for `token_ids`, dropping every special token.

        Ids are normalised with int() so numpy integers and 0-d tensors work too.
        """
        ignore_idx = self._ignored_ids()
        chars = []
        for token_id in token_ids:
            token_id = int(token_id)
            if token_id not in ignore_idx:
                chars.append(self.id_to_char[token_id])
        return "".join(chars)

    def save_pretrained(self, save_directory):
        """Implement a save method that writes the tokenizer data to disk."""
        os.makedirs(save_directory, exist_ok=True)
        # Save the vocab
        with open(os.path.join(save_directory, 'vocab.json'), 'w') as f:
            json.dump(self.char_vocab, f)

    def from_pretrained(self, pretrained_directory):
        """Load the tokenizer from a saved directory.

        Replaces this instance's vocabulary with the one stored in vocab.json
        and rebuilds the lookup tables so encode/decode use the loaded
        vocabulary. Returns the instance to allow chaining.
        """
        with open(os.path.join(pretrained_directory, 'vocab.json'), 'r') as f:
            self._set_vocab(json.load(f))
        return self



# Collator handed to the Trainer below; configured to return PyTorch tensors.
# NOTE(review): presumably this collator does not pad, so every dataset item is
# expected to already have the same length - confirm against the dataset file.
data_collator = DefaultDataCollator(return_tensors="pt")


In [8]:
class CharLevelDataset(Dataset):
    """Torch dataset pairing an input token sequence with its target sequence.

    Items are dicts with "input_ids" and "labels" long tensors, each truncated
    to `max_length` tokens. No padding is applied here. `special_tokens` and
    `device` are stored on the instance but not used when building items.
    """

    def __init__(self, inputs, targets, tokenizer, special_tokens, max_length=29, device='cpu'):
        self.inputs, self.targets = inputs, targets
        self.tokenizer = tokenizer
        self.special_tokens = special_tokens
        self.max_length = max_length
        self.device = device

    def __len__(self):
        return len(self.inputs)

    @staticmethod
    def _pick(collection, idx):
        """Positional lookup: pandas Series go through .iloc, anything else is indexed directly."""
        if isinstance(collection, pd.Series):
            return collection.iloc[idx]
        return collection[idx]

    def __getitem__(self, idx):
        # Fetch both raw sequences first, then encode and truncate each of them.
        sequences = [self._pick(column, idx) for column in (self.inputs, self.targets)]
        clipped = [self.tokenizer.encode(seq)[:self.max_length] for seq in sequences]

        # The target ids are exposed under the "labels" key.
        return {
            key: torch.tensor(ids, dtype=torch.long)
            for key, ids in zip(("input_ids", "labels"), clipped)
        }


In [9]:

def prepare_model_for_char_vocab(model_type, char_vocab):
    """Load a pretrained backbone and resize its token embeddings to the character vocabulary.

    Args:
        model_type: "roberta" (loads "roberta-base") or "bart" (loads "facebook/bart-base").
        char_vocab: Sized collection of vocabulary entries; only its length is used.

    Returns:
        The loaded model with its embedding matrix resized to len(char_vocab).

    Raises:
        ValueError: If `model_type` is neither "roberta" nor "bart".
    """
    # Reject unknown types up front, before any checkpoint is loaded.
    if model_type not in ("roberta", "bart"):
        raise ValueError("Unsupported model type. Choose 'roberta' or 'bart'.")

    if model_type == "roberta":
        backbone = RobertaForMaskedLM.from_pretrained("roberta-base")
    else:
        backbone = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    # One embedding row per character-level vocabulary entry.
    vocab_size = len(char_vocab)
    backbone.resize_token_embeddings(vocab_size)

    return backbone


In [10]:
# # Example input
# input_data = [["_", "_", "_"], ["a", "_", "_"], ["_", "a", "_"], ["_", "_", "b"], 
#               ["a", "a", "_"], ["a", "_", "b"], ["_", "a", "b"], ["a", "a", "b"]]

# # Target for prediction
# target_data = [["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"],
#                ["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"], ["a", "a", "b"]]


In [11]:
print("\n## LOADING DATASET")


## Load dataset [DF]
# df = pd.read_csv(f"./datasets/{isTEST}dataset.csv", sep='|') 
# print("DATASET SHAPE : ", df.shape)

# input_data  = df['input'].apply(ast.literal_eval)
# target_data = df['target'].apply(ast.literal_eval)


# # Apply parallel processing to 'input' and 'target' columns
# input_data, target_data = load_parallel_dataframe_apply(df, convert_to_list, n_workers=8)


## LOAD Dataset [json]
# Time the JSON load; isTEST selects between the small test file and the full dataset file.
start = time.time()
df = load_json_in_parallel(f"./datasets/{isTEST}dataset.json" , n_workers=8)
# The frame must provide 'input' and 'target' columns; both are consumed by the
# train/test split further down.
input_data  = df.input
target_data = df.target
# Total row count (before the train/test split); used later to cap the training batch size.
NUM_SAMPLES = df.shape[0]

end = time.time()

# Elapsed wall-clock seconds for the load.
print(f" =>TIME SPENT : {end-start}")


## LOADING DATASET

  =>LOADING ./datasets/test-dataset.json
 =>TIME SPENT : 0.006122589111328125


In [12]:
# Character-level vocabulary: the 26 lowercase letters followed by the four
# special tokens. A token's id is its position in this list.
char_vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "[PAD]", "_", "[START]", "[END]"]


char_tokenizer = CharLevelTokenizer(char_vocab, SPECIAL_TOKENS)

# Ids of the special tokens, looked up through the tokenizer's token->id table.
mask_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['mask_token']]
pad_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['pad_token']]
start_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['bos_token']]
end_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['eos_token']]


In [13]:
# Split the data into training and testing sets
# 70/30 split; the fixed random_state keeps the split reproducible across runs.
train_input_data, test_input_data, train_target_data, test_target_data = train_test_split(
    input_data, target_data, test_size=0.3, random_state=42)


# Initialize dataset
# MAX_LEN is the truncation limit applied to each sequence. `device` is only
# stored on the dataset; the items it returns are not moved to that device.
train_dataset = CharLevelDataset(train_input_data, train_target_data, char_tokenizer, SPECIAL_TOKENS, MAX_LEN, device)
test_dataset = CharLevelDataset(test_input_data, test_target_data, char_tokenizer, SPECIAL_TOKENS, MAX_LEN, device)


# Initialize dataloader
# train_dataset_loader = DataLoader(train_dataset, batch_size=64, collate_fn=char_level_data_collator)
# test_dataset_loader = DataLoader(test_dataset, batch_size=64, collate_fn=char_level_data_collator)


# for batch in train_dataset_loader:  # Correctly iterates over batches
#     print("Input IDs:", batch["input_ids"])
#     print("Target IDs:", batch["target_ids"])
#     break

## MODEL

In [14]:
# Progress banner marking the start of the model section.
print("\n==> LOADING MODEL")


==> LOADING MODEL


In [15]:
# # Load pre-trained RoBERTa model for Masked Language Modeling (MLM)
# model = RobertaForMaskedLM.from_pretrained("roberta-base")

# # Resize the model's token embeddings to match the character-level vocab size
# model.resize_token_embeddings(len(char_tokenizer.char_vocab))  # Resize for lowercase char-level tokens

# Load the backbone selected by MODEL_NAME and resize its token embeddings to
# the size of the character vocabulary.
model = prepare_model_for_char_vocab(MODEL_NAME, char_tokenizer.char_vocab)


In [None]:

# Move model to the selected device (MPS, CUDA or CPU, as chosen above)
model.to(device)

# Setup the training arguments
training_args = TrainingArguments(
    output_dir=f'./models/model-{MODEL_NAME}-EP_{EPOCHES}-{VERSION}/results', 
    # evaluation_strategy="steps", 
    evaluation_strategy="epoch", 
    num_train_epochs=EPOCHES,
    # Shrink the batch for small (test) datasets, but never below 1: with fewer
    # than 4 samples the quarter-size rule would otherwise give a batch size of 0.
    per_device_train_batch_size=max(1, min(BATCH_SIZE, NUM_SAMPLES // 4)), 
    logging_dir=f'./models/model-{MODEL_NAME}-EP_{EPOCHES}-{VERSION}/logs', 
    # Log more often on the test dataset (non-empty isTEST prefix).
    logging_steps=50 if isTEST else 10000,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=char_tokenizer,  
    # data_collator=None,  
    data_collator=data_collator, 
)


# Fine-tune the model
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.770925
2,No log,0.562441
3,No log,0.454871
4,No log,0.347995
5,No log,0.293533
6,No log,0.237677
7,No log,0.176135
8,No log,0.158903
9,No log,0.135601
10,No log,0.131436


TrainOutput(global_step=45, training_loss=0.370575926038954, metrics={'train_runtime': 13.9177, 'train_samples_per_second': 94.843, 'train_steps_per_second': 3.233, 'total_flos': 19671668929440.0, 'train_loss': 0.370575926038954, 'epoch': 15.0})

In [17]:
# Save the fine-tuned model
# Only the model is written here; the same path is rebuilt by the inference
# section to load the weights back. The tokenizer is not saved by this cell.
model.save_pretrained(f"./models/model-{MODEL_NAME}-EP_{EPOCHES}-{VERSION}/fine_tuned_{MODEL_NAME}")
print("\n==> MODEL SAVED")


==> MODEL SAVED


## INFERENCE

In [26]:
def predict_masked_characters(input_sequence, tokenizer, model, tokens, max_len=30):
    """Run the model on one masked sequence and decode its per-position predictions.

    Args:
        input_sequence: Sequence of vocabulary tokens (list, tuple or string of
            characters), with the mask token at unknown positions.
        tokenizer: Object providing encode(sequence) -> ids and decode(ids) -> str.
        model: Callable returning an object with a `.logits` tensor of shape
            (1, sequence_length, vocab_size).
        tokens: Mapping with 'bos_token', 'eos_token' and 'pad_token' entries.
        max_len: Length the wrapped sequence is padded to.

    Returns:
        The string produced by tokenizer.decode for the argmax prediction at
        every position of the padded sequence.
    """
    # Wrap the sequence in start/end markers and pad up to max_len. A negative
    # pad count yields no padding, so over-long inputs are passed through unchanged.
    pad_count = max_len - len(input_sequence) - 2
    padded_sequence = (
        [tokens['bos_token']]
        + list(input_sequence)
        + [tokens['eos_token']]
        + [tokens['pad_token']] * pad_count
    )

    # Convert the input sequence to token IDs using the tokenizer
    input_ids = tokenizer.encode(padded_sequence)

    # Build the batch on the device that holds the model weights so inference
    # also works when the model lives on GPU/MPS; fall back to the default
    # device for models that expose no parameters.
    try:
        model_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = None
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=model_device)

    # Run the model without tracking gradients
    with torch.no_grad():
        logits = model(input_tensor).logits

    # Most likely token id per position; index 0 selects the single batch row.
    predicted_ids = torch.argmax(logits, dim=-1)[0].cpu().tolist()

    # Convert predicted IDs to characters using the tokenizer
    return tokenizer.decode(predicted_ids)

In [27]:
def loadFinetunedModel(model_name, model_path=None):
    """Load a fine-tuned checkpoint for inference.

    Args:
        model_name: "roberta" or "bart"; selects the model class used for loading.
        model_path: Directory (or identifier) passed to `from_pretrained`.

    Returns:
        The loaded model, switched to evaluation mode.

    Raises:
        ValueError: If `model_name` is unsupported, or if `model_path` is empty.
    """
    # Report the two failure causes separately so a missing path is no longer
    # described as an unsupported model type.
    if model_name not in ("roberta", "bart"):
        raise ValueError("Unsupported model type. Choose 'roberta' or 'bart'.")
    if not model_path:
        raise ValueError(
            f"A valid model_path is required to load the fine-tuned '{model_name}' model."
        )

    if model_name == "roberta":
        model = RobertaForMaskedLM.from_pretrained(model_path)
    else:
        model = BartForConditionalGeneration.from_pretrained(model_path)

    # Set the model to evaluation mode for inference
    model.eval()

    return model
    

In [28]:
# Same location the training section saved the model to. Note that `model` is
# rebound here to the reloaded, eval-mode copy.
model_path = f"./models/model-{MODEL_NAME}-EP_{EPOCHES}-{VERSION}/fine_tuned_{MODEL_NAME}"
model = loadFinetunedModel(MODEL_NAME, model_path)

# Load the tokenizer (same tokenizer used during training)
# Rebuilt from the in-memory char_vocab rather than read back from disk.
char_tokenizer = CharLevelTokenizer(char_vocab, SPECIAL_TOKENS)


# Special-token ids, recomputed from the rebuilt tokenizer.
mask_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['mask_token']]
pad_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['pad_token']]
start_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['bos_token']]
end_token_id = char_tokenizer.char_to_id[SPECIAL_TOKENS['eos_token']]

In [None]:
# Smoke test of the inference path on one hand-written example.
print("\n==> TESTNG INFERENCE")

# masked_input = ["_", "a", "_", "s"]
# answer = "plays"

# Partially revealed word: "_" marks the unknown positions; `answer` is the full word.
masked_input = ["a", "a", "_", "_", "e", "_"]
answer = "aachen"


# Call the prediction function
# MAX_LEN is passed as the padding length (the function's own default is 30).
predicted_output = predict_masked_characters(masked_input, char_tokenizer, model, SPECIAL_TOKENS, MAX_LEN)

print(f"Predicted Output: {predicted_output}\nANSWER: {answer}")



==> TESTNG INFERENCE
Predicted Output: aaas
ANSWER: plays


In [None]:
# Release cached GPU memory and reset the peak-memory counters. Guarded so the
# cell also runs on machines without CUDA (the device selection above allows MPS and CPU).
if torch.cuda.is_available():
    torch.cuda.empty_cache() # clear GPU cache
    # reset_max_memory_allocated() is deprecated; reset_peak_memory_stats()
    # is its documented replacement.
    torch.cuda.reset_peak_memory_stats()

print(f"FINE-TUNED {MODEL_NAME}, GPU: {GPU_INDEX}")

print("\n\n ****ENDED SESSION !!*** \n\n")