In [1]:
!pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [2]:
!pip install accelerate -U



In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [4]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


In [6]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import torch

def prepare_data(filepath):
    data = pd.read_csv(filepath)
    def label_to_prompt(label):
        prompts = {
            'SQL Injection': 'Generate an SQL injection script.',
            'XSS': 'Generate an XSS script.',
            'Invalid Script': 'Generate a non-executable script.'
        }
        return prompts.get(label, 'Unknown label')  # Handle unknown labels
    data['prompt'] = data['Label'].apply(label_to_prompt)
    data['input'] = data['prompt'] + "  " + data['Script']
    # Drop any rows that might have NaN after processing
    data.dropna(subset=['input', 'Script'], inplace=True)
    return data[['input', 'Script']]

def tokenize_function(examples, tokenizer):
    tokenized_output = tokenizer(examples['input'], padding="max_length", truncation=True, max_length=tokenizer.model_max_length)
    tokenized_output['labels'] = tokenized_output['input_ids'].copy()  # Create labels by copying input_ids
    return tokenized_output
def main():
    # Set the device to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # Load tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Ensure the special PAD token is added correctly
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings to include PAD token

    # Load and prepare data
    data_path = '/content/drive/MyDrive/gradDS/Merged_SQL_XSS_Dataset.csv'  # Update this path as per your Colab setup
    data = prepare_data(data_path)
    dataset = Dataset.from_pandas(data)
    dataset = dataset.train_test_split(test_size=0.1)  # Splitting the data

    # Tokenization
    tokenized_datasets = dataset.map(tokenize_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=10,                     # number of training epochs
        per_device_train_batch_size=4,          # batch size for training
        per_device_eval_batch_size=8,           # batch size for evaluation
        warmup_steps=500,                       # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                      # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        do_train=True,
        do_eval=True,
        evaluation_strategy="epoch",
        save_strategy="epoch"
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    # Save the model
    model.save_pretrained('./drive/MyDrive/gradDS')  # Update the path as per your Colab setup

if __name__ == '__main__':
    main()


Using device: cuda




Map:   0%|          | 0/40144 [00:00<?, ? examples/s]

Map:   0%|          | 0/4461 [00:00<?, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 39.06 MiB is free. Process 3435 has 14.71 GiB memory in use. Of the allocated memory 14.51 GiB is allocated by PyTorch, and 73.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np
from tqdm import tqdm

# Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load and prepare the dataset
data_path = '/content/drive/MyDrive/gradDS/Merged_SQL_XSS_Dataset.csv'  # Update with the actual path in Colab
data = pd.read_csv(data_path)
data['prompt'] = data['Label'].apply(lambda x: "SQL Injection: " if x == "SQL Injection" else "XSS Attack: ")

# Define the dataset class
class PayloadDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length=256):
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            encodings = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings['input_ids']))
            self.attn_masks.append(torch.tensor(encodings['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model = model.to(device)

# Prepare the dataset for DataLoader
combined_texts = data['prompt'] + data['Script']
dataset = PayloadDataset(combined_texts, tokenizer)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(loader) * 10  # 10 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch in range(10):
    print(f"Starting epoch {epoch+1}")
    progress_bar = tqdm(loader)
    for batch in progress_bar:
        b_input_ids, b_masks = batch
        b_input_ids = b_input_ids.to(device)
        b_masks = b_masks.to(device)

        model.zero_grad()
        outputs = model(b_input_ids, labels=b_input_ids, attention_mask=b_masks)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        progress_bar.set_description(f"Epoch {epoch+1} Loss {loss.item()}")

# Save the fine-tuned model
model_save_path = '/content/drive/MyDrive/gradDS'  # Update with the desired path in Colab
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("Model training complete and saved.")