In [1]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
     token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Define the prompt template for CWE prediction task
prompt = """Below is a CVE description. Predict the corresponding CWE ID for this vulnerability.

### Description:
{}

### Response:
"""

# Function to format the dataset
def formatting_prompts_func(examples):
    descriptions = examples["Description"]  # Get descriptions from the dataset
    ground_truth = examples["GT"]  # Get ground truth CWE ID

    texts = []
    labels = []
    for description, gt in zip(descriptions, ground_truth):
        # Format the description for input and map the ground truth as the label
        text = prompt.format(description)
        texts.append(text)
        labels.append(gt)  # Set the label as the CWE ID (GT)

    return {"text": texts, "labels": labels}

# Load the dataset from TSV file
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t')  # Adjust separator if necessary
    return Dataset.from_pandas(df)

# Tokenize the dataset and set labels for the classification task
def tokenize_function(examples, tokenizer, max_seq_length):
    # Tokenize input descriptions and set labels to input_ids for classification
    model_inputs = tokenizer(examples['text'], padding=True, truncation=True, max_length=max_seq_length)
    model_inputs['labels'] = examples['labels']  # Set the labels to the ground truth CWE ID
    return model_inputs

# Path to your dataset file
train_file_path = '/content/cti-rcm.tsv'  # Adjust the path as needed
train_dataset = load_data(train_file_path)

# Create a DatasetDict (if you have multiple splits like validation, you can add them here)
dataset_dict = DatasetDict({'train': train_dataset})

# Apply the formatting function to your dataset
formatted_dataset = dataset_dict['train'].map(formatting_prompts_func, batched=True)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('unsloth/Llama-3.2-1B-bnb-4bit')

# Tokenize the dataset
max_seq_length = 512  # Define maximum sequence length
tokenized_dataset = formatted_dataset.map(lambda x: tokenize_function(x, tokenizer, max_seq_length), batched=True)

# Check the first example to ensure it's formatted correctly
print(tokenized_dataset[0])  # This will print the formatted text and label for the first example


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'URL': 'https://nvd.nist.gov/vuln/detail/CVE-2024-23848', 'Description': 'In the Linux kernel through 6.7.1, there is a use-after-free in cec_queue_msg_fh, related to drivers/media/cec/core/cec-adap.c and drivers/media/cec/core/cec-api.c.', 'Prompt': 'Analyze the following CVE description and map it to the appropriate CWE. Provide a brief justification for your choice. Ensure the last line of your response contains only the CWE ID.  CVE Description: In the Linux kernel through 6.7.1, there is a use-after-free in cec_queue_msg_fh, related to drivers/media/cec/core/cec-adap.c and drivers/media/cec/core/cec-api.c. ', 'GT': 'CWE-416', 'text': 'Below is a CVE description. Predict the corresponding CWE ID for this vulnerability.\n\n### Description:\nIn the Linux kernel through 6.7.1, there is a use-after-free in cec_queue_msg_fh, related to drivers/media/cec/core/cec-adap.c and drivers/media/cec/core/cec-api.c.\n\n### Response:\n', 'labels': 'CWE-416', 'input_ids': [128004, 128004, 128004, 

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Shuffle and split the dataset into 70% training and 30% testing examples
shuffled_dataset = formatted_dataset.shuffle(seed=42)
train_size = int(0.7 * len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(train_size))  # 70% for training
test_dataset = shuffled_dataset.select(range(train_size, len(shuffled_dataset)))  # 30% for testing

max_seq_length = 512

# Training arguments without wandb integration
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,  # Increased batch size
    gradient_accumulation_steps=1,  # Reduced for faster training
    warmup_steps=600,  # Increased warmup steps for stability
    num_train_epochs=5,  # Use epochs instead of steps for training duration
    learning_rate=1e-4,  # Adjusted learning rate
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=50,  # Reduced logging frequency
    optim="adamw_8bit",  # 8-bit optimizer
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    evaluation_strategy="steps",  # Evaluate during training
    eval_steps=50,  # Reduced evaluation frequency
    save_strategy="steps",  # Save model checkpoint during training
    save_steps=50,  # Reduced checkpoint frequency
    load_best_model_at_end=True,  # Load the best model based on eval metric
    # Removed wandb logging
    report_to="none",  # This ensures no logging to wandb
)

# Initialize the trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Use the 70% training dataset
    eval_dataset=test_dataset,  # Use the 30% testing dataset
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=6,  # Increased for faster data processing
    packing=False,  # Set to True if sequences are short to speed up
    args=training_args,
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")




Map (num_proc=6):   0%|          | 0/700 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/300 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 700 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 1
\        /    Total batch size = 2 | Total steps = 1,750
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss,Validation Loss
50,2.1026,2.048889
100,2.0841,2.00811
150,2.0453,1.970842
200,1.9392,1.940955
250,1.9335,1.912362
300,1.9227,1.893148
350,1.8631,1.867656
400,1.7503,1.852466
450,1.7444,1.823986
500,1.601,1.803527


Evaluation results: {'eval_loss': 1.6590739488601685, 'eval_runtime': 11.5844, 'eval_samples_per_second': 25.897, 'eval_steps_per_second': 3.28, 'epoch': 5.0}


In [14]:
model.save_pretrained("/content/llama1bRCM_finetuned")
tokenizer.save_pretrained("/content/llama1bRCM_finetuned")


('/content/llama1bRCM_finetuned/tokenizer_config.json',
 '/content/llama1bRCM_finetuned/special_tokens_map.json',
 '/content/llama1bRCM_finetuned/tokenizer.json')

In [15]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
api_key = os.getenv('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')

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-bnb-4bit")
base_model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-1B-bnb-4bit")

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [16]:

base_model.save_pretrained('/content/llama1bRCMBASE_model')
tokenizer.save_pretrained('/content/llama1bRCMBASE_model')

('/content/llama1bRCMBASE_model/tokenizer_config.json',
 '/content/llama1bRCMBASE_model/special_tokens_map.json',
 '/content/llama1bRCMBASE_model/tokenizer.json')

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model
base_model_directory = '/content/llama1bRCMBASE_model'
base_model = AutoModelForCausalLM.from_pretrained(base_model_directory)

# Load the LoRA adapter
lora_adapters_path = "/content/llama1bRCM_finetuned"
lora_model = PeftModel.from_pretrained(base_model, lora_adapters_path)

# Apply LoRA adapter to the base model
# Assuming PeftModel integrates the adapter with the base model
base_model_with_lora = lora_model

# Define the directory to save the combined model
combined_model_directory = '/content/llama1bRCM_finetune_version'

# Save the base model
base_model.save_pretrained(combined_model_directory)

# Save the LoRA adapter
lora_model.save_pretrained(combined_model_directory)

# Load the tokenizer from a different source if not found in base_model_directory

tokenizer = AutoTokenizer.from_pretrained(base_model_directory)
tokenizer.save_pretrained(combined_model_directory)

print(f"Combined model saved to {combined_model_directory}")

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Combined model saved to /content/llama1bRCM_finetune_version


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/llama1bRCM_finetune_version'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Combined model and tokenizer loaded successfully.


In [2]:
import pandas as pd


df1=pd.read_csv('/content/cti-rcm.tsv', sep='\t')
df1.head()

Unnamed: 0,URL,Description,Prompt,GT
0,https://nvd.nist.gov/vuln/detail/CVE-2024-23848,"In the Linux kernel through 6.7.1, there is a ...",Analyze the following CVE description and map ...,CWE-416
1,https://nvd.nist.gov/vuln/detail/CVE-2023-38738,IBM OpenPages with Watson 8.3 and 9.0 could pr...,Analyze the following CVE description and map ...,CWE-257
2,https://nvd.nist.gov/vuln/detail/CVE-2024-22137,Improper Neutralization of Input During Web Pa...,Analyze the following CVE description and map ...,CWE-79
3,https://nvd.nist.gov/vuln/detail/CVE-2024-20819,Out-of-bounds Write vulnerabilities in svc1td_...,Analyze the following CVE description and map ...,CWE-787
4,https://nvd.nist.gov/vuln/detail/CVE-2024-0585,The Essential Addons for Elementor – Best Elem...,Analyze the following CVE description and map ...,CWE-79


In [23]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Using cached transformers-4.46.2-py3-none-any.whl (10.0 MB)
Installing collected packages: transformers
Successfully installed transformers-4.46.2


In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re
import pandas as pd

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/llama1bRCM_finetune_version'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

# Define the device (GPU if available, otherwise CPU)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the selected device
model.to(device)

print("Combined model and tokenizer loaded successfully.")

def generate_gt(cve_description):
    """Generates the CWE mapping for a given CVE description using LLaMA."""
    try:
        # Ensure model and tokenizer are using the correct device
        # The obligatory prompt
        prompt = (f"Analyze the following CVE description and determine the appropriate CWE. "
                  f"Provide a justification. The last part of your response should contain only the CWE ID number.\n\n"
                  f"CVE Description: {cve_description}\n")

        # Tokenize the prompt with truncation to a max length (adjust as needed)
        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)

        # Generate response with a dynamic length limit
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=min(2000, inputs["input_ids"].shape[1] + 50),
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id
        )

        # Decode the generated text
        response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract CWE ID from response
        cwe_id_match = re.search(r'CWE-\d+', response)
        cwe_id = cwe_id_match.group(0) if cwe_id_match else None

        return {
            "full_response": response,
            "cwe_id": cwe_id
        }

    except Exception as e:
        print(f"Error generating CWE mapping: {str(e)}")
        return None

# Load the DataFrame containing the CVE descriptions
sampled_df = df1.sample(n=200, random_state=42)  # Adjust if needed

# Initialize a list to store valid results
results = []
attempted_rows = 0

# Process each CVE description until we reach 200 valid answers
for index, row in sampled_df.iterrows():
    if len(results) >= 200:
        break  # Stop once we have 200 valid entries

    cve_description = row["Description"]
    print(f"Processing CVE Description {index + 1}/{len(sampled_df)}: {cve_description}")

    # Retry logic with a maximum of 3 attempts
    for attempt in range(3):
        result = generate_gt(cve_description)
        if result and result["cwe_id"]:  # Check if CWE ID was successfully generated
            print(f"Success: {result['cwe_id']} - {result['full_response']}")
            results.append({
                "CVE Description": cve_description,
                "Justification": result["full_response"],
                "CWE_ID": result["cwe_id"]
            })
            break  # Exit retry loop on success
        else:
            print(f"Attempt {attempt + 1} failed for this description.")

        # If all attempts failed, mark as error after third attempt
        if attempt == 2 and (not result or not result["cwe_id"]):
            print("Error: Unable to process this description after 3 attempts.")
            results.append({
                "CVE Description": cve_description,
                "Justification": "Error in processing",
                "CWE_ID": "Error"
            })

    # Increment counter for rows attempted to ensure we sample enough entries
    attempted_rows += 1

    # Save intermediate results to a TSV file after each entry
    results_df = pd.DataFrame(results)
    results_df.to_csv("resultsllama1BFNcti.tsv", sep='\t', index=False)

# Final save after processing all entries
print("Finalizing and saving results...")
results_df.to_csv("resultsllama1BFNcti.tsv", sep='\t', index=False)
print("Results saved to 'resultsllama1BFNcti.tsv'.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Combined model and tokenizer loaded successfully.
Processing CVE Description 522/200: A SQL injection vulnerability exists in Novel-Plus v4.3.0-RC1 and prior. An attacker can pass specially crafted offset, limit, and sort parameters to perform SQL injection via /novel/userFeedback/list.
Attempt 1 failed for this description.
Attempt 2 failed for this description.
Attempt 3 failed for this description.
Error: Unable to process this description after 3 attempts.
Processing CVE Description 738/200: Due to a failure in validating the number of scanline samples of a OpenEXR file containing deep scanline data, Academy Software Foundation OpenEX image parsing library version 3.2.1 and prior is susceptible to a heap-based buffer overflow vulnerability. This issue was resolved as of versions v3.2.2 and v3.1.12 of the affected library.
Attempt 1 failed for this description.
Attempt 2 failed for this description.
Attempt 3 failed for this description.
Error: Unable to process this description aft

KeyboardInterrupt: 