In [19]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from itertools import chain
import pandas as pd
from pathlib import Path
import torch

In [3]:
# Our data is located in the /data directory. Let's see what we're working with.
!ls ./data/causal_modeling/

CVE-2020-29583.txt xss.txt


In [4]:
# Load the file names into a list
data_path = Path("./data/causal_modeling")
file_paths = [filename for filename in data_path.glob("*.txt")]

In [5]:
# Read all the file contents into a list
file_data = list()
for filename in file_paths:
    with open(filename, "r") as f:
        data = f.read()
    file_data.append(data)

In [6]:
print(file_data[0])
print('=====================')
print(file_data[1])

CVE:   CVE-2020-29583 Summary Zyxel has released a patch for the hardcoded credential vulnerability of firewalls and AP controllers recently reported by researchers from EYE Netherlands. Users are advised to install the applicable firmware updates for optimal protection. What is the vulnerability? A hardcoded credential vulnerability was identified in the “zyfwp” user account in some Zyxel firewalls and AP controllers. The account was designed to deliver automatic firmware updates to connected access points through FTP. What versions are vulnerable—and what should you do? After a thorough investigation, we’ve identified the vulnerable products and are releasing firmware patches to address the issue, as shown in the table below. For optimal protection, we urge users to install the applicable updates. For those not listed, they are not affected. Contact your local Zyxel support team if you require further assistance or visit our  forum  for more information. Got a question or a tipoff? P

In [7]:
# Convert our list of text into a dataset using .from_dict()
dataset = Dataset.from_dict({"text": file_data})

In [8]:
# Preview the dataset
dataset["text"]

['CVE:   CVE-2020-29583 Summary Zyxel has released a patch for the hardcoded credential vulnerability of firewalls and AP controllers recently reported by researchers from EYE Netherlands. Users are advised to install the applicable firmware updates for optimal protection. What is the vulnerability? A hardcoded credential vulnerability was identified in the “zyfwp” user account in some Zyxel firewalls and AP controllers. The account was designed to deliver automatic firmware updates to connected access points through FTP. What versions are vulnerable—and what should you do? After a thorough investigation, we’ve identified the vulnerable products and are releasing firmware patches to address the issue, as shown in the table below. For optimal protection, we urge users to install the applicable updates. For those not listed, they are not affected. Contact your local Zyxel support team if you require further assistance or visit our  forum  for more information. Got a question or a tipoff?

In [9]:
# Load the tokenizer for GPT-2
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# The tokenizer does not have a pad token, so we'll specify one.
tokenizer.pad_token = tokenizer.eos_token

# Load the GPT-2 model
model = AutoModelForCausalLM.from_pretrained('gpt2')

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
# Create a tokenization function to tokenize the dataset
def tokenize_function(examples):
    output = tokenizer(examples['text'])
    return output

# Run the tokenizer over our dataset using the .map method 
# NOTE: For large datasets, this can take a while
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# We want to remove our original dataset's column names from the tokenized dataset
tokenized_dataset = tokenized_dataset.remove_columns(dataset.column_names)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1734 > 1024). Running this sequence through the model will result in indexing errors


In [11]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2
})

In [12]:
# This function was lightly modified from the HuggingFace run_clm.py
# You can find the original function at https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py
# Create a preprocessing function to group aour texts together in chunks of 1024
def group_texts(examples):
    # Specify our bock size -- 1024
    block_size = 1024
    
    # Concatenate all the texts together for each example
    concatenated_examples = dict()
    for k in examples.keys():
        concatenated_examples[k] = list(chain(*examples[k]))
        
    # Compute the total length of all the text
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    
    # We drop the small remainder of the block
    # If total_length < block_size, we return an empty dict.
    total_length = (total_length // block_size) * block_size
    
    # Split into chunks of 1024
    result = dict()
    # Loop over the keys and texts in the concatenated examples
    for k, t in concatenated_examples.items():
        # Divide each text into chunks of 1024
        chunks = list()
        for i in range(0, total_length, block_size):
            chunks.append(t[i : i + block_size])
        result[k] = chunks
    # Set the "labels" equal to the "input_ids"
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
# Chunk our datasets using the group_texts function
dataset = tokenized_dataset.map(group_texts, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
# Set up our data collator for training. Since our model is PyTorch, we need to specify return_tensors as "pt"
# A data collator is a utility to dynamically batch and format individual samples from a dataset into tensors suitable for training
# mlm=False: Disables masked language modeling (MLM), meaning it’s for causal language modeling (like GPT-style models).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

In [15]:
# Establish our training arguments
training_args = TrainingArguments(
    output_dir="finetune_gpt2",
    per_device_train_batch_size=1,
    save_strategy="no"
)

In [16]:
# Put everything into our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

In [17]:
# Run the trainer
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=6, training_loss=3.2493375142415366, metrics={'train_runtime': 13.5086, 'train_samples_per_second': 0.444, 'train_steps_per_second': 0.444, 'total_flos': 3135504384000.0, 'train_loss': 3.2493375142415366, 'epoch': 3.0})

In [23]:
# Specify an input string
input_string = "Cross-Site Scripting is a vulnerability that"

# Tokenize our input string
inputs = tokenizer(
    input_string,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
input_ids = inputs["input_ids"].to("cpu")
attention_mask = inputs["attention_mask"].to("cpu")

# Generate model output_ids
model.eval()
model.to("cpu")  # Ensure the model is on CPU for inference
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=10,
        num_return_sequences=1,
        no_repeat_ngram_size=1,
        remove_invalid_values=True,
        pad_token_id=tokenizer.eos_token_id,  # required for GPT2
    )

# Decode the output tokens to text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print our output!
print(output_text)

Cross-Site Scripting is a vulnerability that allows remote attackers to inject arbitrary web script or HTML via unspecified vectors. A denial of service (DoS
