In [None]:
import warnings
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm

# Silence library warnings so the notebook output stays readable.
# NOTE(review): this hides *every* warning, including deprecation notices.
warnings.filterwarnings("ignore")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained t5-small tokenizer and model; the model is moved to `device`.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Task-specific truncation limits, in tokens.
max_source_length = 2048  # cap applied to tokenized assembly inputs
max_target_length = 512  # cap applied to tokenized C targets

# Read the input sequences (one per line) from assembly.txt
with open("assembly.txt", "r", encoding="utf-8") as assembly_file:
    assembly_lines = assembly_file.readlines()

# Read the target sequences (one per line) from function.txt
with open("function.txt", "r", encoding="utf-8") as functions_file:
    functions_lines = functions_file.readlines()

# Line i of assembly.txt is paired with line i of function.txt, so the counts must match.
assert len(assembly_lines) == len(functions_lines), "Number of lines in assembly.txt and function.txt must be the same."

# Independent copies of the raw lines, used as model inputs and targets.
input_sequences = list(assembly_lines)
output_sequences = list(functions_lines)

# Tokenize the inputs: pad to the longest sequence, truncate at max_source_length.
encoding = tokenizer(
    input_sequences,
    padding="longest",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
)

# The whole tokenized dataset is moved to `device` up front.
input_ids, attention_mask = encoding.input_ids.to(device), encoding.attention_mask.to(device)

print(model.device)  # Print the device the model is on
print(input_ids.device)  # Print the device of input_ids

# Tokenize the targets: pad to the longest sequence, truncate at max_target_length.
target_encoding = tokenizer(
    output_sequences,
    padding="longest",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
)
labels = target_encoding.input_ids.to(device)

# Replace padding token ids in the labels by -100 so they are ignored by the loss
labels[labels == tokenizer.pad_token_id] = -100

# Untruncated token counts per line; consumed by the statistics cell.
assembly_lengths = [len(tokenizer.encode(seq)) for seq in assembly_lines]
c_code_lengths = [len(tokenizer.encode(seq)) for seq in functions_lines]

# Training hyperparameters
learning_rate = 5e-5
batch_size = 4
num_epochs = 20
# 10% of (epochs * samples / batch size).
# NOTE(review): this value is computed but not consumed below; no LR scheduler is created.
num_warmup_steps = 0.1 * num_epochs * len(input_sequences) / batch_size

# Optimizer over all model parameters.
# NOTE(review): `transformers.AdamW` has been deprecated upstream in favour of
# `torch.optim.AdamW`; compare default hyperparameters before switching.
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Wrap the already-tokenized tensors in a shuffled DataLoader.
train_dataset = TensorDataset(input_ids, attention_mask, labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        input_ids_batch, attention_mask_batch, labels_batch = batch

        # Forward pass: passing `labels` makes the model return the loss directly.
        loss = model(
            input_ids=input_ids_batch,
            attention_mask=attention_mask_batch,
            labels=labels_batch
        ).loss

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per batch for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")

# Save the trained model and its tokenizer
model.save_pretrained("./T5Snapshot/model")
tokenizer.save_pretrained("./T5Snapshot/tokenizer")


In [None]:
def _report_token_lengths(heading, lengths):
    """Print `heading`, then the max, min and mean of the token counts in `lengths`."""
    print(heading)
    print("Max Length:", max(lengths))
    print("Min Length:", min(lengths))
    print("Average Length:", sum(lengths) / len(lengths))


# Token-length summary for both sides of the dataset.
_report_token_lengths("Assembly Code Statistics:", assembly_lengths)
_report_token_lengths("\nC Code Statistics:", c_code_lengths)


**Inference**

In [None]:
# Reload the tokenizer and model from the snapshot directories written by the training cell,
# and place the model on the same device used for training.
tokenizer = T5Tokenizer.from_pretrained("./T5Snapshot/tokenizer")
model = T5ForConditionalGeneration.from_pretrained("./T5Snapshot/model").to(device)

input_sequence = 'endbr64 ;push   rbp;mov    rbp,rsp;mov    DWORD PTR [rbp-(1)],edi;mov    DWORD PTR [rbp-(2)],esi;jmp    OFS 14 <Gcd>;mov    eax,DWORD PTR [rbp-(1)];mov    edx,0x0;div    DWORD PTR [rbp-(2)];mov    DWORD PTR [rbp-(0)],edx;mov    eax,DWORD PTR [rbp-(2)];mov    DWORD PTR [rbp-(1)],eax;mov    eax,DWORD PTR [rbp-(0)];mov    DWORD PTR [rbp-(2)],eax;cmp    DWORD PTR [rbp-(2)],0x0;jne    OFS 6 <Gcd>;mov    eax,DWORD PTR [rbp-(1)];pop    rbp;ret    '
# Tokenize with the same truncation limit as training and move the ids to the model's device.
input_ids = tokenizer(
    input_sequence,
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
).input_ids.to(device)
# Without an explicit limit, generate() falls back to the library's short default
# length and cuts the output off; allow up to the target length used in training.
outputs = model.generate(input_ids, max_length=max_target_length)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
def align_text_files(file1, file2):
    '''Trim two text files in place so that both hold the same number of lines.

    Trailing lines are dropped from whichever file is longer, then both files
    are rewritten. The line counts of both files are printed before and after
    trimming.

    Args:
        file1: Path of the first text file (read, then overwritten).
        file2: Path of the second text file (read, then overwritten).

    Returns:
        None.
    '''
    with open(file1, 'r', encoding='utf-8') as first, open(file2, 'r', encoding='utf-8') as second:
        contents = [first.readlines(), second.readlines()]
        for rows in contents:
            print(len(rows))

    # Keep only as many lines as the shorter file has.
    shared = min(len(rows) for rows in contents)
    for rows in contents:
        del rows[shared:]

    for rows in contents:
        print(len(rows))
    with open(file1, 'w', encoding='utf-8') as first, open(file2, 'w', encoding='utf-8') as second:
        first.write(''.join(contents[0]))
        second.write(''.join(contents[1]))

# Equalize the line counts of the two dataset files in place.
# NOTE(review): the training cell asserts equal line counts on these same files,
# so this presumably has to run before training on a fresh dataset; confirm cell order.
file1_path, file2_path = "assembly.txt", "function.txt"
align_text_files(file1_path, file2_path)
