In [1]:
import warnings
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import get_scheduler
import numpy as np
import gc
from accelerate import Accelerator
from torch.nn import DataParallel
from transformers import AutoTokenizer, LongT5ForConditionalGeneration
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq

# Silence every Python warning for the rest of the session. No category is given,
# so this also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

2024-01-27 19:54:53.101852: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-27 19:54:53.139343: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-27 19:54:53.139375: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-27 19:54:53.139398: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-27 19:54:53.147290: I tensorflow/core/platform/cpu_feature_g

In [2]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# To resume from an earlier snapshot instead of starting from the pretrained
# checkpoint, load the tokenizer/model pair from
# ./T5Snapshot/tokenizer_epoch5_loss1.60 and ./T5Snapshot/model_epoch5_loss1.60.

# Locally stored tokenizer (GPT2 BPE tokenizer trained on code).
tokenizer = AutoTokenizer.from_pretrained("T5Tokenizer/tokenizer", local_files_only=True)

# Register the special tokens one at a time, in this order: pad, eos, unk.
for token_role, token_text in (
    ("pad_token", "<pad>"),
    ("eos_token", "</s>"),
    ("unk_token", "<unk>"),
):
    tokenizer.add_special_tokens({token_role: token_text})

# Pretrained LongT5 checkpoint, wrapped in DataParallel and placed on `device`.
model = DataParallel(
    LongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")
).to(device)

# The tokenizer is not the checkpoint's own, so size the embedding matrix to
# the tokenizer's vocabulary.
model.module.resize_token_embeddings(len(tokenizer))

# Task-specific token budgets.
max_source_length = 2048  # upper bound for an assembly input sequence
max_target_length = 1024  # upper bound for a C output sequence

# One sample per line: assembly.txt holds the inputs, function.txt the targets.
with open("assembly.txt", "r", encoding="utf-8") as assembly_file:
    assembly_lines = list(assembly_file)

with open("function.txt", "r", encoding="utf-8") as functions_file:
    functions_lines = list(functions_file)

# The two files are paired line by line, so their lengths must match.
assert len(assembly_lines) == len(functions_lines), "Number of lines in assembly.txt and functions.txt must be the same."

input_sequences, output_sequences = assembly_lines, functions_lines

# 95 % train; the remaining 5 % is halved into test and validation below.
input_train, input_temp, output_train, output_temp = train_test_split(
    input_sequences,
    output_sequences,
    test_size=0.05,
    random_state=42,
)

input_test, input_val, output_test, output_val = train_test_split(
    input_temp,
    output_temp,
    test_size=0.5,
    random_state=42,
)


def _as_translation_dataset(assembly_texts, c_texts):
    """Wrap parallel assembly/C lists in a Dataset with one ``translation`` column."""
    pairs = [{"assembly": asm, "c": c_src} for asm, c_src in zip(assembly_texts, c_texts)]
    return Dataset.from_dict({"translation": pairs})


train_dataset = _as_translation_dataset(input_train, output_train)
val_dataset = _as_translation_dataset(input_val, output_val)
test_dataset = _as_translation_dataset(input_test, output_test)

# Bundle the three splits under the conventional split names.
data_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

def preprocess_function(examples):
    """Tokenize a batch of assembly/C pairs into model inputs and labels.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)`` whose
            ``"translation"`` entry is a list of ``{"assembly": ..., "c": ...}``
            dicts.

    Returns:
        The tokenizer output for the assembly inputs (padded/truncated to
        ``max_source_length``) with an added ``"labels"`` entry holding the
        target token ids (padded/truncated to ``max_target_length``), where
        every padding position is replaced by -100.
    """
    inputs = [ex["assembly"] for ex in examples["translation"]]
    targets = [ex["c"] for ex in examples["translation"]]

    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding="max_length", truncation=True
    )
    labels = tokenizer(
        targets, max_length=max_target_length, padding="max_length", truncation=True
    )

    # Targets are padded to max_target_length. Label value -100 is ignored by the
    # model's cross-entropy loss, so masking the padded positions keeps the loss
    # from being dominated by "predict <pad>" terms. The attention mask (0 on
    # padding) is used instead of comparing against pad_token_id so real tokens
    # are never masked by accident.
    model_inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return model_inputs

#print(preprocess_function(data_dict['train'][:1]))

# Tokenize every split (train / validation / test) in batches.
tokenized_datasets = data_dict.map(preprocess_function, batched=True)

print(tokenized_datasets["train"].shuffle(seed=42).select(range(10)))

# remove_columns() returns a new DatasetDict instead of editing in place, so the
# result has to be re-bound; otherwise the raw "translation" strings stay in
# every batch handed to the DataLoader.
tokenized_datasets = tokenized_datasets.remove_columns(["translation"])
tokenized_datasets.set_format("torch")

print(model.module.device)  # Print the device the model is on to verify we use GPU

# Set hyperparameters
learning_rate = 5e-5
batch_size = 2  # rule of thumb
num_epochs = 20

# Define the optimizer over the (already device-placed) model parameters
optimizer = AdamW(model.parameters(), lr=learning_rate)

# DataLoaders: shuffle the training data, keep validation order fixed.
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size, shuffle=False)

num_training_steps = num_epochs * len(train_dataloader)
# Warm-up is disabled, exactly as the scheduler was configured before. The value
# now lives in one place and is the one the scheduler actually receives; use
# e.g. int(0.1 * num_training_steps) for a 10 % warm-up.
num_warmup_steps = 0

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# NOTE(review): gradient_accumulation_steps only drives accumulation when the
# training step runs inside `with accelerator.accumulate(model):`. The training
# loop in this notebook does not use that context - confirm the intended
# effective batch size and either wrap the loop or drop this argument.
accelerator = Accelerator(gradient_accumulation_steps=3)

# Prepare everything the loops iterate over, including the validation loader,
# so both loaders are handled by the accelerator in the same way.
model, optimizer, train_dataloader, validation_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, validation_dataloader, lr_scheduler
)
# Both names refer to the prepared scheduler so code using either one steps the
# same object.
scheduler = lr_scheduler

# Per-epoch loss history; drives the live plot and the loss_data.txt dump.
train_losses = []
validation_losses = []

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1} (Training)"):
        # Forward pass. DataParallel can return one loss per replica, hence .mean().
        loss = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        ).loss.mean()

        # Backward pass and parameter update
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        # Step the scheduler returned by accelerator.prepare(), not the raw one.
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    train_losses.append(avg_loss)

    # Validation: only the forward passes need to run without gradients.
    model.eval()
    total_validation_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc=f"Epoch {epoch + 1} (Validation)"):
            validation_loss = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
            ).loss.mean()

            total_validation_loss += validation_loss.item()

    # Average validation loss for this epoch
    avg_validation_loss = total_validation_loss / len(validation_dataloader)
    validation_losses.append(avg_validation_loss)

    # Redraw the loss curves and overwrite the image and text dump on disk.
    plt.clf()
    plt.plot(train_losses, label='Training Loss', color='blue')
    plt.plot(validation_losses, label='Validation Loss', color='red')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('TrainValLoss.png')
    np.savetxt('loss_data.txt', np.column_stack((train_losses, validation_losses)), header='Train Loss, Validation Loss', delimiter=',')
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")

    # Per-epoch snapshot. save_pretrained lives on the wrapped model, not on the
    # DataParallel wrapper, hence `.module`.
    model.module.save_pretrained(f"./T5Snapshot/model_epoch{epoch + 1}")
    tokenizer.save_pretrained(f"./T5Snapshot/tokenizer_epoch{epoch + 1}")

# Save the trained model. As with the per-epoch snapshots this has to go through
# `.module`: calling save_pretrained on the DataParallel wrapper raises
# AttributeError once all epochs have finished.
model.module.save_pretrained("./T5Snapshot/model")
tokenizer.save_pretrained("./T5Snapshot/tokenizer")

Some weights of LongT5ForConditionalGeneration were not initialized from the model checkpoint at google/long-t5-local-base and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/330571 [00:00<?, ? examples/s]

Map:   0%|          | 0/8700 [00:00<?, ? examples/s]

Map:   0%|          | 0/8699 [00:00<?, ? examples/s]

Dataset({
    features: ['translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})
cuda:0


Epoch 1 (Training):   0%|          | 403/165286 [04:28<30:29:07,  1.50it/s]


KeyboardInterrupt: 

**Test Set Evaluation**

In [None]:
import Levenshtein

# Per-sample results of the test-set evaluation
predicted_c_codes = []
actual_c_codes = []
levenshtein_distances = []

# Set the model to evaluation mode
model.eval()

# After training, `model` is the DataParallel wrapper, which has no generate();
# generation has to be called on the wrapped model. getattr keeps this cell
# working when `model` is already a plain (unwrapped) model.
generation_model = getattr(model, "module", model)
generation_device = generation_model.device

with torch.no_grad():
    for assembly_sequence, reference_c_code in zip(input_test, output_test):
        # Encode the assembly sequence and move the tensors to the model's device
        encoding = tokenizer(
            assembly_sequence,
            padding="longest",
            max_length=max_source_length,
            truncation=True,
            return_tensors="pt"
        ).to(generation_device)

        # Generate the corresponding C code; pass the attention mask explicitly
        output = generation_model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            max_length=max_target_length,
            num_return_sequences=1,
        )

        # The reference lines were read with readlines() and still carry their
        # trailing newline; strip both sides so surrounding whitespace does not
        # count as an edit.
        predicted_c_code = tokenizer.decode(output[0], skip_special_tokens=True).strip()
        actual_c_code = reference_c_code.strip()

        # Character-level edit distance between prediction and reference
        levenshtein_distance = Levenshtein.distance(predicted_c_code, actual_c_code)

        predicted_c_codes.append(predicted_c_code)
        actual_c_codes.append(actual_c_code)
        levenshtein_distances.append(levenshtein_distance)

# Calculate and print performance metrics (NaN instead of a ZeroDivisionError
# when the test split is empty)
total_samples = len(predicted_c_codes)
avg_levenshtein_distance = (
    sum(levenshtein_distances) / total_samples if total_samples else float("nan")
)

print(f"Total samples: {total_samples}")
print(f"Average Levenshtein Distance: {avg_levenshtein_distance}")

**Manual Inference**

In [None]:
# Standalone inference against the epoch-1 snapshot written by the training loop.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained("./T5Snapshot/tokenizer_epoch1")
model = LongT5ForConditionalGeneration.from_pretrained(
    "./T5Snapshot/model_epoch1",
    local_files_only=True,
)
model = model.to(device)

# A single function's assembly, with instructions separated by ';'.
input_sequence = 'endbr64 ;push   rbp;mov    rbp,rsp;mov    DWORD PTR [rbp-(1)],edi;mov    DWORD PTR [rbp-(0)],esi;mov    edx,DWORD PTR [rbp-(1)];mov    eax,DWORD PTR [rbp-(0)];add    eax,edx;pop    rbp;ret'

encoded_prompt = tokenizer(input_sequence, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to(device)

# max_target_length comes from the training cell; generate() already runs
# without gradient tracking, so no explicit torch.no_grad() is needed.
outputs = model.generate(
    input_ids,
    max_length=max_target_length,
    num_return_sequences=1,
)
decoded_c_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_c_code)