In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
!pip install --upgrade transformers peft





In [None]:
!pip install --upgrade transformers




In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r 6= 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


My dataset preparation


In [None]:
import pandas as pd
prompt = """Below is a question followed by multiple choices. Select the correct answer from the options provided.

### Question:
{}

### Options:
A) {}
B) {}
C) {}
D) {}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    questions = examples["question"]
    option_as = examples["Option_A "]
    option_bs = examples["Option_B"]
    option_cs = examples["Option_C "]
    option_ds = examples["Option_D"]
    correct_answers = examples["Correct_Answer"]

    texts = []
    for question, option_a, option_b, option_c, option_d, correct_answer in zip(questions, option_as, option_bs, option_cs, option_ds, correct_answers):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(question, option_a, option_b, option_c, option_d, correct_answer) + EOS_TOKEN
        texts.append(text)

    return { "text" : texts }

# Assuming you've loaded your custom dataset like this
from datasets import Dataset, DatasetDict

# Load your TSV dataset
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    return Dataset.from_pandas(df)

train_file_path = '/content/drive/MyDrive/myDB.tsv'
train_dataset = load_data(train_file_path)

dataset_dict = DatasetDict({
    'train': train_dataset,
})

# Apply the formatting function to your dataset
formatted_dataset = dataset_dict['train'].map(formatting_prompts_func, batched=True)

# Check the first example to ensure it's formatted correctly
print(formatted_dataset[0])


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

{'question': 'Given a scenario where an adversary is using Command and Control (C2) over Tor network (MITRE ATT&CK T1036), which Diamond Model feature is primarily affected?', 'Option_A ': 'Adversary', 'Option_B': 'Infrastructure', 'Option_C ': 'Capability', 'Option_D': 'Victim', 'Correct_Answer': 'B', 'Explanation': "The adversary may use various capabilities like T1036 to communicate, but it's the infrastructure (Tor network) that is directly impacted.", 'text': 'Below is a question followed by multiple choices. Select the correct answer from the options provided.\n\n### Question:\nGiven a scenario where an adversary is using Command and Control (C2) over Tor network (MITRE ATT&CK T1036), which Diamond Model feature is primarily affected?\n\n### Options:\nA) Adversary\nB) Infrastructure\nC) Capability\nD) Victim\n\n### Response:\nB<|end_of_text|>'}


<a name="Train"></a>
### Train the model with my data
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

#training with the whole dataset

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Shuffle and split the dataset into 70% training and 30% testing examples
shuffled_dataset = formatted_dataset.shuffle(seed=42)
train_size = int(0.7 * len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(train_size))  # 70% for training
test_dataset = shuffled_dataset.select(range(train_size, len(shuffled_dataset)))  # 30% for testing

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Use the 70% training dataset
    eval_dataset=test_dataset,  # Use the 30% testing dataset
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=6,  # Increased for faster data processing
    packing=False,  # Set to True if sequences are short to speed up
    args=TrainingArguments(
        per_device_train_batch_size=8,  # Increased batch size
        gradient_accumulation_steps=1,  # Reduced for faster training
        warmup_steps=600,  # Increased warmup steps for stability
        num_train_epochs=3,  # Use epochs instead of steps for training duration
        learning_rate=2e-4,  # Adjusted learning rate
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,  # Reduced logging frequency
        optim="adamw_8bit",  # 8-bit optimizer
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        evaluation_strategy="steps",  # Evaluate during training
        eval_steps=50,  # Reduced evaluation frequency
        save_strategy="steps",  # Save model checkpoint during training
        save_steps=50,  # Reduced checkpoint frequency
        load_best_model_at_end=True,  # Load the best model based on eval metric
    ),
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")




Map (num_proc=6):   0%|          | 0/1750 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/750 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,750 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 657
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,0.6288,0.79506
100,0.6218,0.778871
150,0.609,0.776551
200,0.5541,0.766787
250,0.5186,0.79864
300,0.5225,0.775734
350,0.5617,0.758047
400,0.5364,0.765631
450,0.5194,0.810425
500,0.3925,0.812765


Evaluation results: {'eval_loss': 0.7303941249847412, 'eval_runtime': 96.6277, 'eval_samples_per_second': 7.762, 'eval_steps_per_second': 0.973, 'epoch': 3.0}


#**Training Summary**
**Training and Evaluation Time:**

Training took around 58 minutes and 20 seconds.
Evaluation took approximately 1 minute and 39 seconds.
Training Loss:

The training loss decreased steadily over epochs, which is a good sign. The final training loss was around 0.5.
Validation Loss:

The validation loss also decreased over time, indicating that the model is learning and generalizing well. The final validation loss was around 0.68.
Evaluation Results:

**Evaluation Loss**: 0.730
**Evaluation Runtime**: 96.6277 seconds
**Samples per Second**: 7.69
**Steps per Second**: 0.97

In [None]:
model.save_pretrained("/content/drive/MyDrive/finetuned_model/llama_finetuned")
tokenizer.save_pretrained("/content/drive/MyDrive/finetuned_model/llama_finetuned")


('/content/drive/MyDrive/finetuned_model/llama_finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/finetuned_model/llama_finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/finetuned_model/llama_finetuned/tokenizer.json')

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
api_key = os.getenv('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')

tokenizer = AutoTokenizer.from_pretrained("unsloth/meta-llama-3.1-8b")
base_model = AutoModelForCausalLM.from_pretrained("unsloth/meta-llama-3.1-8b")

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

In [None]:

base_model.save_pretrained('/content/drive/MyDrive/finetuned_model/llama_model')
tokenizer.save_pretrained('/content/drive/MyDrive/finetuned_model/llama_model')

('/content/drive/MyDrive/finetuned_model/llama_model/tokenizer_config.json',
 '/content/drive/MyDrive/finetuned_model/llama_model/special_tokens_map.json',
 '/content/drive/MyDrive/finetuned_model/llama_model/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model
base_model_directory = '/content/drive/MyDrive/finetuned_model/llama_model'
base_model = AutoModelForCausalLM.from_pretrained(base_model_directory)

# Load the LoRA adapter
lora_adapters_path = "/content/drive/MyDrive/finetuned_model/llama_finetuned"
lora_model = PeftModel.from_pretrained(base_model, lora_adapters_path)

# Apply LoRA adapter to the base model
# Assuming PeftModel integrates the adapter with the base model
base_model_with_lora = lora_model

# Define the directory to save the combined model
combined_model_directory = '/content/drive/MyDrive/finetuned_model/llama_finetune_version'

# Save the base model
base_model.save_pretrained(combined_model_directory)

# Save the LoRA adapter
lora_model.save_pretrained(combined_model_directory)

# Load the tokenizer from a different source if not found in base_model_directory

tokenizer = AutoTokenizer.from_pretrained(base_model_directory)
tokenizer.save_pretrained(combined_model_directory)

print(f"Combined model saved to {combined_model_directory}")

ModuleNotFoundError: No module named 'peft'

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model/llama_finetune_version'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/myDB.tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['question', 'Option_A', 'Option_B', 'Option_C', 'Option_D', 'Correct_Answer', 'Explanation']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process all the questions and generate responses
results = []
for idx, row in df.iterrows():
    print(f"Processing question {idx + 1}...")  # Debug print to track progress

    question = row['question']
    options = [row['Option_A'], row['Option_B'], row['Option_C'], row['Option_D']]
    correct_answer = row['Correct_Answer'].strip().upper()
    explanation = row['Explanation']

    # Prepare the prompt
    formatted_prompt = prepare_prompt(question, options)

    # Tokenize the input
    input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

    # Generate a response
    generated_text = generate_response(input_ids, attention_mask, model)

    # Extract the answer
    predicted_answer = extract_answer(generated_text)

    # Compare the predicted answer with the correct answer
    answer_match = predicted_answer == correct_answer

    # Append results
    results.append({
        'Question': question,
        'Options': options,
        'Correct Answer': correct_answer,
        'Predicted Answer': predicted_answer,
        'Match': answer_match,
        'Generated Text': generated_text,
    })

    # Save results incrementally
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/finetuned_model/mydb_FNllama_answer2.csv', index=False)

    print(f"Processed {idx + 1} questions")  # Debug print to track progress

print("All questions processed")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model/mydb_FNllama_answer2.csv'")


Processing question 1...
Processed 1 questions
Processing question 2...
Processed 2 questions
Processing question 3...
Processed 3 questions
Processing question 4...
Processed 4 questions
Processing question 5...
Processed 5 questions
Processing question 6...
Processed 6 questions
Processing question 7...
Processed 7 questions
Processing question 8...
Processed 8 questions
Processing question 9...
Processed 9 questions
Processing question 10...
Processed 10 questions
Processing question 11...
Processed 11 questions
Processing question 12...
Processed 12 questions
Processing question 13...
Processed 13 questions
Processing question 14...
Processed 14 questions
Processing question 15...
Processed 15 questions
Processing question 16...
Processed 16 questions
Processing question 17...
Processed 17 questions
Processing question 18...
Processed 18 questions
Processing question 19...
Processed 19 questions
Processing question 20...
Processed 20 questions
Processing question 21...
Processed 21

AttributeError: 'float' object has no attribute 'strip'

In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where the model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file starting from the 1623rd question
df = pd.read_csv('/content/drive/MyDrive/myDB.tsv', sep='\t', skiprows=1622, on_bad_lines='warn')

# Debugging: Print the column names and first few rows to inspect
print("Columns:", df.columns.tolist())
print("First few rows:\n", df.head())

# Map existing columns to expected labels
df.columns = [
    'question',           # The first column in your dataset
    'Option_A',           # The second column
    'Option_B',           # The third column
    'Option_C',           # The fourth column
    'Option_D',           # The fifth column
    'Correct_Answer',     # The sixth column
    'Explanation'         # The seventh column (if present)
]

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['question', 'Option_A', 'Option_B', 'Option_C', 'Option_D', 'Correct_Answer', 'Explanation']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate a response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process all the questions and generate responses
results = []
for idx, row in df.iterrows():
    print(f"Processing question {1623 + idx}...")  # Debug print to track progress

    question = row['question']
    options = [row['Option_A'], row['Option_B'], row['Option_C'], row['Option_D']]
    correct_answer = row['Correct_Answer'].strip().upper() if isinstance(row['Correct_Answer'], str) else "N/A"
    explanation = row['Explanation'] if 'Explanation' in row else ""

    # Prepare the prompt
    formatted_prompt = prepare_prompt(question, options)

    # Tokenize the input
    input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

    # Generate a response
    generated_text = generate_response(input_ids, attention_mask, model)

    # Extract the answer
    predicted_answer = extract_answer(generated_text)

    # Compare the predicted answer with the correct answer
    answer_match = predicted_answer == correct_answer

    # Append results
    results.append({
        'Question': question,
        'Options': options,
        'Correct Answer': correct_answer,
        'Predicted Answer': predicted_answer,
        'Match': answer_match,
        'Generated Text': generated_text,
    })

    # Save results incrementally
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/finetuned_model/mydb_FNllama_answer_1623_onwards.csv', index=False)

    print(f"Processed {1623 + idx} questions")  # Debug print to track progress

print("All questions processed")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model/mydb_FNllama_answer_1623_onwards.csv'")


Columns: ['Adversaries may add additional commands to an existing login hook. What does this require?', 'Administrator permissions', 'User permissions', 'Root permissions', 'Guest permissions', 'A', 'Unnamed: 6']
First few rows:
   Adversaries may add additional commands to an existing login hook. What does this require?  \
0  Adversaries may modify the com.apple.loginwind...                                           
1  Adversaries may use login hooks to execute mal...                                           
2  Adversaries may use login hooks to execute mal...                                           
3  What is the deprecated mechanism for login hoo...                                           
4  What access control should be applied to logon...                                           

        Administrator permissions   User permissions       Root permissions  \
0                        defaults              chmod              chowngrep   
1                           T1086  

In [None]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/finetuned_model/mydb_FNllama_answer2.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 1622
Number of Correct Predictions: 1081
Accuracy Percentage: 66.65%


In [None]:
print("Execution completed. Ending the session.")
import os
os.kill(os.getpid(), 9)

#fine tuning using cti-dataset


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd
prompt = """Below is a question followed by multiple choices. Select the correct answer from the options provided.

### Question:
{}

### Options:
A) {}
B) {}
C) {}
D) {}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    # Extract columns from the dataset
    questions = examples["Question"]
    option_as = examples["Option A"]
    option_bs = examples["Option B"]
    option_cs = examples["Option C"]
    option_ds = examples["Option D"]
    correct_answers = examples["GT"]

    # Format each entry in the dataset
    texts = []
    for question, option_a, option_b, option_c, option_d, correct_answer in zip(questions, option_as, option_bs, option_cs, option_ds, correct_answers):
        # Format the prompt and add the EOS_TOKEN
        text = prompt.format(question, option_a, option_b, option_c, option_d, correct_answer) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

# Assuming you've loaded your custom dataset like this
from datasets import Dataset, DatasetDict

# Load your TSV dataset
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    return Dataset.from_pandas(df)

train_file_path = '/content/drive/MyDrive/cti-mcq.tsv'
train_dataset = load_data(train_file_path)

dataset_dict = DatasetDict({
    'train': train_dataset,
})

# Apply the formatting function to your dataset
formatted_dataset = dataset_dict['train'].map(formatting_prompts_func, batched=True)

# Check the first example to ensure it's formatted correctly
print(formatted_dataset[0])


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

{'URL': 'https://attack.mitre.org/techniques/T1548/', 'Question': "Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?", 'Option A': 'Audit', 'Option B': 'Execution Prevention', 'Option C': 'Operating System Configuration', 'Option D': 'User Account Control', 'Prompt': "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.  **Question:** Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?  **Options:** A) Audit B) Execution Prevention C) Operating System Configuration D) User Account Control  **Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional te

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Shuffle and split the dataset into 70% training and 30% testing examples
shuffled_dataset = formatted_dataset.shuffle(seed=42)
train_size = int(0.7 * len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(train_size))  # 70% for training
test_dataset = shuffled_dataset.select(range(train_size, len(shuffled_dataset)))  # 30% for testing

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Use the 70% training dataset
    eval_dataset=test_dataset,  # Use the 30% testing dataset
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=6,  # Increased for faster data processing
    packing=False,  # Set to True if sequences are short to speed up
    args=TrainingArguments(
        per_device_train_batch_size=8,  # Increased batch size
        gradient_accumulation_steps=1,  # Reduced for faster training
        warmup_steps=500,  # Increased warmup steps for stability
        num_train_epochs=3,  # Use epochs instead of steps for training duration
        learning_rate=2e-4,  # Adjusted learning rate
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,  # Reduced logging frequency
        optim="adamw_8bit",  # 8-bit optimizer
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        evaluation_strategy="steps",  # Evaluate during training
        eval_steps=50,  # Reduced evaluation frequency
        save_strategy="steps",  # Save model checkpoint during training
        save_steps=50,  # Reduced checkpoint frequency
        load_best_model_at_end=True,  # Load the best model based on eval metric
    ),
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")




Map (num_proc=6):   0%|          | 0/1750 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/750 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,750 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 657
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,2.3622,1.728521
100,1.5789,1.442423
150,1.3628,1.310072
200,1.318,1.267033
250,1.2544,1.25764
300,1.1833,1.233139
350,1.2004,1.212712
400,1.1838,1.202244
450,1.1085,1.231889
500,1.0017,1.229176


Evaluation results: {'eval_loss': 1.1866949796676636, 'eval_runtime': 90.9364, 'eval_samples_per_second': 8.248, 'eval_steps_per_second': 1.034, 'epoch': 3.0}


In [None]:
model.save_pretrained("/content/drive/MyDrive/finetuned_model_cti/llama_finetuned_cti")
tokenizer.save_pretrained("/content/drive/MyDrive/finetuned_model_cti/llama_finetuned_cti")

('/content/drive/MyDrive/finetuned_model_cti/llama_finetuned_cti/tokenizer_config.json',
 '/content/drive/MyDrive/finetuned_model_cti/llama_finetuned_cti/special_tokens_map.json',
 '/content/drive/MyDrive/finetuned_model_cti/llama_finetuned_cti/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model
base_model_directory = '/content/drive/MyDrive/finetuned_model/llama_model'
base_model = AutoModelForCausalLM.from_pretrained(base_model_directory)

# Load the LoRA adapter
lora_adapters_path = "/content/drive/MyDrive/finetuned_model_cti/llama_finetuned_cti"
lora_model = PeftModel.from_pretrained(base_model, lora_adapters_path)

# Apply LoRA adapter to the base model
# Assuming PeftModel integrates the adapter with the base model
base_model_with_lora = lora_model

# Define the directory to save the combined model
combined_model_directory = '/content/drive/MyDrive/finetuned_model_cti/full_version'

# Save the base model
base_model.save_pretrained(combined_model_directory)

# Save the LoRA adapter
lora_model.save_pretrained(combined_model_directory)

# Load the tokenizer from a different source if not found in base_model_directory

tokenizer = AutoTokenizer.from_pretrained(base_model_directory)
tokenizer.save_pretrained(combined_model_directory)

print(f"Combined model saved to {combined_model_directory}")

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Combined model saved to /content/drive/MyDrive/finetuned_model_cti/full_version


#retesting the mcq responding (cti_mcQ)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model_cti/full_version'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/cti-mcq.tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['Question', 'Option A', 'Option B', 'Option C', 'Option D', 'GT', 'Prompt']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")


# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the questions and generate responses
results = []
num_processed = 0
for idx, row in df.iterrows():
    if idx >= 100:
        break

    print(f"Processing question {idx + 1}...")  # Debug print to track progress

    question = row['Question']
    options = [row['Option A'], row['Option B'], row['Option C'], row['Option D']]
    correct_answer = row['GT'].strip().upper()
    explanation = row.get('Prompt', 'No explanation provided')

    # Prepare the prompt
    formatted_prompt = prepare_prompt(question, options)

    # Tokenize the input
    input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

    # Generate a response
    generated_text = generate_response(input_ids, attention_mask, model)

    # Extract the answer
    predicted_answer = extract_answer(generated_text)

    # Compare the predicted answer with the correct answer
    answer_match = predicted_answer == correct_answer

    # Append results
    results.append({
        'Question': question,
        'Options': options,
        'Correct Answer': correct_answer,
        'Explanation': explanation,
        'Predicted Answer': predicted_answer,
        'Match': answer_match,
        'Formatted Prompt': formatted_prompt,
        'Generated Text': generated_text,
    })

    num_processed += 1
    # Save results incrementally
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer.csv', index=False)

    print(f"Processed {num_processed} out of 100 questions")  # Debug print to track progress

print(f"Processed {num_processed} questions")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer.csv'")


Processing question 1...
Processed 1 out of 100 questions
Processing question 2...
Processed 2 out of 100 questions
Processing question 3...
Processed 3 out of 100 questions
Processing question 4...
Processed 4 out of 100 questions
Processing question 5...
Processed 5 out of 100 questions
Processing question 6...
Processed 6 out of 100 questions
Processing question 7...
Processed 7 out of 100 questions
Processing question 8...
Processed 8 out of 100 questions
Processing question 9...
Processed 9 out of 100 questions
Processing question 10...
Processed 10 out of 100 questions
Processing question 11...
Processed 11 out of 100 questions
Processing question 12...
Processed 12 out of 100 questions
Processing question 13...
Processed 13 out of 100 questions
Processing question 14...
Processed 14 out of 100 questions
Processing question 15...
Processed 15 out of 100 questions
Processing question 16...
Processed 16 out of 100 questions
Processing question 17...
Processed 17 out of 100 question

In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/cti-mcq.tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['Question', 'Option A', 'Option B', 'Option C', 'Option D', 'GT', 'Prompt']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the questions and generate responses
def process_questions(df, num_questions_to_process=2500):
    results = []
    num_processed = 0

    for idx, row in df.iterrows():
        if num_processed >= num_questions_to_process:
            break

        print(f"Processing question {num_processed + 1}...")  # Debug print to track progress

        question = row['Question']
        options = [row['Option A'], row['Option B'], row['Option C'], row['Option D']]
        correct_answer = row['GT'].strip().upper()
        explanation = row.get('Prompt', 'No explanation provided')

        # Prepare the prompt
        formatted_prompt = prepare_prompt(question, options)

        # Tokenize the input
        input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

        # Generate a response
        generated_text = generate_response(input_ids, attention_mask, model)

        # Extract the answer
        predicted_answer = extract_answer(generated_text)

        # Compare the predicted answer with the correct answer
        answer_match = predicted_answer == correct_answer

        # Append results
        results.append({
            'Question': question,
            'Options': options,
            'Correct Answer': correct_answer,
            'Explanation': explanation,
            'Predicted Answer': predicted_answer,
            'Match': answer_match,
            'Formatted Prompt': formatted_prompt,
            'Generated Text': generated_text,
        })

        num_processed += 1

        # Save results incrementally
        results_df = pd.DataFrame(results)
        results_df.to_csv('/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer.csv', index=False)

        print(f"Processed {num_processed} questions")  # Debug print to track progress

    return results_df


# Process the dataset
results_df = process_questions(df)

print(f"Processed {len(results_df)} questions")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer.csv'")


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Processing question 2...
Processed 2 questions
Processing question 3...
Processed 3 questions
Processing question 4...
Processed 4 questions
Processing question 5...
Processed 5 questions
Processing question 6...
Processed 6 questions
Processing question 7...
Processed 7 questions
Processing question 8...
Processed 8 questions
Processing question 9...
Processed 9 questions
Processing question 10...
Processed 10 questions
Processing question 11...
Processed 11 questions
Processing question 12...
Processed 12 questions
Processing question 13...
Processed 13 questions
Processing question 14...
Processed 14 questions
Processing question 15...
Processed 15 questions
Processing question 16...
Processed 16 questions
Processing question 17...
Processed 17 questions
Processing question 18...
Processed 18 questions
Processing question 19...
Processed 19 questions
Processing question 20...
Processed 20 que

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 2500
Number of Correct Predictions: 1815
Accuracy Percentage: 72.60%


In [None]:
print("Execution completed. Ending the session.")
import os
os.kill(os.getpid(), 9)

#testing the cti-finetuned llama on my dataset

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model_cti/full_version'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Combined model and tokenizer loaded successfully.


In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where the model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/myDB.tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['question', 'Option_A', 'Option_B', 'Option_C', 'Option_D', 'Correct_Answer']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the questions and generate responses
def process_questions(df, num_questions_to_process=2500):
    results = []
    num_processed = 0

    for idx, row in df.iterrows():
        if num_processed >= num_questions_to_process:
            break

        print(f"Processing question {num_processed + 1}...")  # Debug print to track progress

        question = row['question']
        options = [row['Option_A'], row['Option_B'], row['Option_C'], row['Option_D']]
        correct_answer = row['Correct_Answer'].strip().upper()

        # Prepare the prompt
        formatted_prompt = prepare_prompt(question, options)

        # Tokenize the input
        input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

        # Generate a response
        generated_text = generate_response(input_ids, attention_mask, model)

        # Extract the answer
        predicted_answer = extract_answer(generated_text)

        # Compare the predicted answer with the correct answer
        answer_match = predicted_answer == correct_answer

        # Append results
        results.append({
            'Question': question,
            'Options': options,
            'Correct Answer': correct_answer,
            'Predicted Answer': predicted_answer,
            'Match': answer_match,

        })

        num_processed += 1

        # Save results incrementally
        results_df = pd.DataFrame(results)
        results_df.to_csv('/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer_mydb.csv', index=False)

        print(f"Processed {num_processed} questions")  # Debug print to track progress

    return results_df


# Process the dataset
results_df = process_questions(df)

print(f"Processed {len(results_df)} questions")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer_mydb.csv'")


Processing question 1...
Processed 1 questions
Processing question 2...
Processed 2 questions
Processing question 3...
Processed 3 questions
Processing question 4...
Processed 4 questions
Processing question 5...
Processed 5 questions
Processing question 6...
Processed 6 questions
Processing question 7...
Processed 7 questions
Processing question 8...
Processed 8 questions
Processing question 9...
Processed 9 questions
Processing question 10...
Processed 10 questions
Processing question 11...
Processed 11 questions
Processing question 12...
Processed 12 questions
Processing question 13...
Processed 13 questions
Processing question 14...
Processed 14 questions
Processing question 15...
Processed 15 questions
Processing question 16...
Processed 16 questions
Processing question 17...
Processed 17 questions
Processing question 18...
Processed 18 questions
Processing question 19...
Processed 19 questions
Processing question 20...
Processed 20 questions
Processing question 21...
Processed 21

AttributeError: 'float' object has no attribute 'strip'

In [None]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer_mydb.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 1622
Number of Correct Predictions: 821
Accuracy Percentage: 50.62%


In [None]:
print("Execution completed. Ending the session.")
import os
os.kill(os.getpid(), 9)

#Refinetunhing for better results


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd
from datasets import Dataset

# The prompt format
prompt = """You are a cybersecurity expert specializing in cyber threat intelligence.
You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset.
Your task is to choose the best option among the four provided.

### Question:
{}

### Options:
A) {}
B) {}
C) {}
D) {}

### Response:
{}"""

# Define EOS_TOKEN (usually "<|endoftext|>" or a similar token, depending on the tokenizer)
EOS_TOKEN = tokenizer.eos_token  # Ensure that the tokenizer is loaded before this

# Function to format the prompts
def formatting_prompts_func(examples):
    questions = examples["Question"]
    option_as = examples["Option A"]
    option_bs = examples["Option B"]
    option_cs = examples["Option C"]
    option_ds = examples["Option D"]
    correct_answers = examples["GT"]

    texts = []
    for question, option_a, option_b, option_c, option_d, correct_answer in zip(questions, option_as, option_bs, option_cs, option_ds, correct_answers):
        # Format the prompt with the provided question, options, and correct answer
        text = prompt.format(question, option_a, option_b, option_c, option_d, correct_answer) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

# Function to load data from a TSV file and convert it to a Dataset
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    return Dataset.from_pandas(df)

# Load the dataset
train_file_path = '/content/drive/MyDrive/cti-mcq.tsv'
train_dataset = load_data(train_file_path)

# Apply the formatting function to your dataset
formatted_dataset = train_dataset.map(formatting_prompts_func, batched=True)

# Check the first example to ensure it's formatted correctly
print(formatted_dataset[0]['text'])


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

You are a cybersecurity expert specializing in cyber threat intelligence.
You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset.
Your task is to choose the best option among the four provided.

### Question:
Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?

### Options:
A) Audit
B) Execution Prevention
C) Operating System Configuration
D) User Account Control

### Response:
B<|end_of_text|>


In [None]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Installing collected packages: transformers
Successfully installed transformers-4.44.2


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from sklearn.model_selection import KFold
import numpy as np

# Define cross-validation setup
kf = KFold(n_splits=4, shuffle=True, random_state=42)
max_seq_length = 512
eval_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(formatted_dataset)):
    print(f"Fold {fold + 1}/{kf.get_n_splits()}")

    train_dataset = formatted_dataset.select(train_idx)
    val_dataset = formatted_dataset.select(val_idx)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=8,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=8,
            gradient_accumulation_steps=1,
            warmup_steps=500,
            num_train_epochs=3,
            learning_rate=2e-4,
            fp16=True,  # Use fp16 since T4 supports it

            logging_steps=50,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir=f"outputs/fold_{fold+1}",
            evaluation_strategy="steps",
            eval_steps=50,
            save_strategy="steps",
            save_steps=50,
            load_best_model_at_end=True,
        ),
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the validation dataset
    fold_eval_results = trainer.evaluate()
    eval_results.append(fold_eval_results)
    print(f"Evaluation results for fold {fold + 1}: {fold_eval_results}")

# Average the results across all folds
avg_eval_results = {key: np.mean([result[key] for result in eval_results]) for key in eval_results[0].keys()}
print(f"Average evaluation results: {avg_eval_results}")


Fold 1/4




Map (num_proc=8):   0%|          | 0/1875 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/625 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,875 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 705
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,2.1547,1.418234
100,1.1576,1.053543
150,1.0403,0.959335
200,0.9647,0.93038
250,0.9203,0.925627
300,0.8796,0.904725
350,0.8773,0.896453
400,0.8779,0.887594
450,0.8725,0.882055
500,0.7763,0.904802


Evaluation results for fold 1: {'eval_loss': 0.8690705299377441, 'eval_runtime': 77.0875, 'eval_samples_per_second': 8.108, 'eval_steps_per_second': 1.025, 'epoch': 3.0}
Fold 2/4




Map (num_proc=8):   0%|          | 0/1875 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/625 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,875 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 705
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,0.6545,0.535772
100,0.6648,0.531452
150,0.6405,0.537744
200,0.6606,0.54348
250,0.6277,0.550361
300,0.5618,0.566265
350,0.5967,0.57547
400,0.6004,0.594226
450,0.5973,0.60871
500,0.5224,0.655375


Evaluation results for fold 2: {'eval_loss': 0.5314515829086304, 'eval_runtime': 77.4388, 'eval_samples_per_second': 8.071, 'eval_steps_per_second': 1.02, 'epoch': 3.0}
Fold 3/4




Map (num_proc=8):   0%|          | 0/1875 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/625 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,875 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 705
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,0.6461,0.503186
100,0.6064,0.504206
150,0.6502,0.514966
200,0.6378,0.523881
250,0.613,0.534915
300,0.546,0.549774
350,0.5669,0.559861
400,0.5963,0.57536
450,0.6146,0.588251
500,0.5028,0.623631


Evaluation results for fold 3: {'eval_loss': 0.5031859278678894, 'eval_runtime': 77.2117, 'eval_samples_per_second': 8.095, 'eval_steps_per_second': 1.023, 'epoch': 3.0}
Fold 4/4




Map (num_proc=8):   0%|          | 0/1875 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/625 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,875 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 705
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,0.6321,0.511772
100,0.6128,0.510146
150,0.6275,0.520486
200,0.6208,0.533369
250,0.6096,0.550002
300,0.5442,0.552949
350,0.5742,0.568654
400,0.5676,0.58419
450,0.6111,0.598305
500,0.493,0.658886


Evaluation results for fold 4: {'eval_loss': 0.510145902633667, 'eval_runtime': 78.0002, 'eval_samples_per_second': 8.013, 'eval_steps_per_second': 1.013, 'epoch': 3.0}
Average evaluation results: {'eval_loss': 0.6034634858369827, 'eval_runtime': 77.43455, 'eval_samples_per_second': 8.07175, 'eval_steps_per_second': 1.0202499999999999, 'epoch': 3.0}


In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("/content/outputs/fold_3/checkpoint-50")


`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [None]:
model.save_pretrained("/content/drive/MyDrive/finetuned_model_cti/Untitled Folder")
tokenizer.save_pretrained("/content/drive/MyDrive/finetuned_model_cti/Untitled Folder")




('/content/drive/MyDrive/finetuned_model_cti/Untitled Folder/tokenizer_config.json',
 '/content/drive/MyDrive/finetuned_model_cti/Untitled Folder/special_tokens_map.json',
 '/content/drive/MyDrive/finetuned_model_cti/Untitled Folder/tokenizer.json')

In [None]:
!ls /content/drive/MyDrive/finetuned_model_cti/fn2/checkpoint-50


adapter_config.json	   README.md	  special_tokens_map.json  trainer_state.json
adapter_model.safetensors  rng_state.pth  tokenizer_config.json    training_args.bin
optimizer.pt		   scheduler.pt   tokenizer.json


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model
base_model_directory = '/content/drive/MyDrive/finetuned_model/llama_model'
base_model = AutoModelForCausalLM.from_pretrained(base_model_directory)

# Load the LoRA adapter
lora_adapters_path = "/content/drive/MyDrive/finetuned_model_cti/fn2/checkpoint-50"
lora_model = PeftModel.from_pretrained(base_model, lora_adapters_path)

# Apply LoRA adapter to the base model
# Assuming PeftModel integrates the adapter with the base model
base_model_with_lora = lora_model

# Define the directory to save the combined model
combined_model_directory = '/content/drive/MyDrive/finetuned_model_cti/fn2/fn2_model'

# Save the base model
base_model.save_pretrained(combined_model_directory)

# Save the LoRA adapter
lora_model.save_pretrained(combined_model_directory)

# Load the tokenizer from a different source if not found in base_model_directory

tokenizer = AutoTokenizer.from_pretrained(base_model_directory)
tokenizer.save_pretrained(combined_model_directory)

print(f"Combined model saved to {combined_model_directory}")

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Combined model saved to /content/drive/MyDrive/finetuned_model_cti/fn2/fn2_model


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model_cti/fn2/fn2_model'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Combined model and tokenizer loaded successfully.


In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where the model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/myDB.tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['question', 'Option_A', 'Option_B', 'Option_C', 'Option_D', 'Correct_Answer']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Sample 250 random questions
df_sampled = df.sample(n=250, random_state=42)

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the questions and generate responses
def process_questions(df, num_questions_to_process=250):
    results = []
    num_processed = 0

    for idx, row in df.iterrows():
        if num_processed >= num_questions_to_process:
            break

        print(f"Processing question {num_processed + 1}...")  # Debug print to track progress

        question = row['question']
        options = [row['Option_A'], row['Option_B'], row['Option_C'], row['Option_D']]
        correct_answer = row['Correct_Answer'].strip().upper()

        # Prepare the prompt
        formatted_prompt = prepare_prompt(question, options)

        # Tokenize the input
        input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

        # Generate a response
        generated_text = generate_response(input_ids, attention_mask, model)

        # Extract the answer
        predicted_answer = extract_answer(generated_text)

        # Compare the predicted answer with the correct answer
        answer_match = predicted_answer == correct_answer

        # Append results
        results.append({
            'Question': question,
            'Options': options,
            'Correct Answer': correct_answer,
            'Predicted Answer': predicted_answer,
            'Match': answer_match,
        })

        num_processed += 1

        # Save results incrementally
        results_df = pd.DataFrame(results)
        results_df.to_csv('/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer_mydb.csv', index=False)

        print(f"Processed {num_processed} questions")  # Debug print to track progress

    return results_df

# Process the sampled dataset
results_df = process_questions(df_sampled)

print(f"Processed {len(results_df)} questions")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer_mydb.csv'")


Processing question 1...
Processed 1 questions
Processing question 2...
Processed 2 questions
Processing question 3...
Processed 3 questions
Processing question 4...
Processed 4 questions
Processing question 5...
Processed 5 questions
Processing question 6...
Processed 6 questions
Processing question 7...
Processed 7 questions
Processing question 8...
Processed 8 questions
Processing question 9...
Processed 9 questions
Processing question 10...
Processed 10 questions
Processing question 11...
Processed 11 questions
Processing question 12...
Processed 12 questions
Processing question 13...
Processed 13 questions
Processing question 14...
Processed 14 questions
Processing question 15...
Processed 15 questions
Processing question 16...
Processed 16 questions
Processing question 17...
Processed 17 questions
Processing question 18...
Processed 18 questions
Processing question 19...
Processed 19 questions
Processing question 20...
Processed 20 questions
Processing question 21...
Processed 21

In [None]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/finetuned_model_cti/cti_FNllama_answer_mydb.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 250
Number of Correct Predictions: 136
Accuracy Percentage: 54.40%


In [None]:
print("Execution completed. Ending the session.")
import os
os.kill(os.getpid(), 9)

#TESTING MY MODEL WITH THE CTI DATABASE


LOADING THE MODEL

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model/llama_finetune_version'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Combined model and tokenizer loaded successfully.


In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/cti-mcq.tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['URL', 'Question', 'Option A', 'Option B', 'Option C', 'Option D', 'Prompt', 'GT']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Sample 250 random questions
df_sample = df.sample(n=250, random_state=42)  # Adjust random_state for different samples

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the sampled questions and generate responses
results = []
for idx, row in df_sample.iterrows():
    print(f"Processing question {idx + 1}...")  # Debug print to track progress

    question = row['Question']
    options = [row['Option A'], row['Option B'], row['Option C'], row['Option D']]
    correct_answer = row['GT'].strip().upper()
    prompt = row['Prompt']  # If you want to use the existing prompt from the dataset

    # Prepare the prompt
    formatted_prompt = prepare_prompt(question, options)

    # Tokenize the input
    input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

    # Generate a response
    generated_text = generate_response(input_ids, attention_mask, model)

    # Extract the answer
    predicted_answer = extract_answer(generated_text)

    # Compare the predicted answer with the correct answer
    answer_match = predicted_answer == correct_answer

    # Append results
    results.append({
        'Question': question,
        'Options': options,
        'Correct Answer': correct_answer,
        'Predicted Answer': predicted_answer,
        'Match': answer_match,
        'Generated Text': generated_text,
    })

    # Save results incrementally
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/finetuned_model/Testing_mymodel_with_cti2.csv', index=False)

    print(f"Processed {idx + 1} questions")  # Debug print to track progress

print("All questions processed")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model/Testing_mymodel_with_cti2.csv'")


Processing question 1448...
Processed 1448 questions
Processing question 1115...
Processed 1115 questions
Processing question 1065...
Processed 1065 questions
Processing question 2288...
Processed 2288 questions
Processing question 1538...
Processed 1538 questions
Processing question 669...
Processed 669 questions
Processing question 1584...
Processed 1584 questions
Processing question 2405...
Processed 2405 questions
Processing question 498...
Processed 498 questions
Processing question 2481...
Processed 2481 questions
Processing question 472...
Processed 472 questions
Processing question 247...
Processed 247 questions
Processing question 924...
Processed 924 questions
Processing question 1132...
Processed 1132 questions
Processing question 671...
Processed 671 questions
Processing question 917...
Processed 917 questions
Processing question 1415...
Processed 1415 questions
Processing question 1607...
Processed 1607 questions
Processing question 403...
Processed 403 questions
Processin

In [None]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/finetuned_model/Testing_mymodel_with_cti2.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 250
Number of Correct Predictions: 115
Accuracy Percentage: 46.00%


In [None]:
print("Execution completed. Ending the session.")
import os
os.kill(os.getpid(), 9)

##test another dataset

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model_cti/fn2/fn2_model'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Combined model and tokenizer loaded successfully.


In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/output_questions (2).tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['Question', 'Option A', 'Option B', 'Option C', 'Option D', 'Correct Answer', 'Explanation']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Sample 100 random questions
df_sample = df.sample(n=250, random_state=42)  # Adjust random_state for different samples

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the sampled questions and generate responses
results = []
for idx, row in df_sample.iterrows():
    print(f"Processing question {idx + 1}...")  # Debug print to track progress

    question = row['Question']
    options = [row['Option A'], row['Option B'], row['Option C'], row['Option D']]
    correct_answer = row['Correct Answer'].strip().upper()
    explanation = row['Explanation']

    # Prepare the prompt
    formatted_prompt = prepare_prompt(question, options)

    # Tokenize the input
    input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

    # Generate a response
    generated_text = generate_response(input_ids, attention_mask, model)

    # Extract the answer
    predicted_answer = extract_answer(generated_text)

    # Compare the predicted answer with the correct answer
    answer_match = predicted_answer == correct_answer

    # Append results
    results.append({
        'Question': question,
        'Options': options,
        'Correct Answer': correct_answer,
        'Predicted Answer': predicted_answer,
        'Match': answer_match,
        'Explanation': explanation,
        'Generated Text': generated_text,
    })

    # Save results incrementally
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/finetuned_model/Testing_mymodel_with_cti_100.csv', index=False)

    print(f"Processed {idx + 1} questions")  # Debug print to track progress

print("All questions processed")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model/Testing_mymodel_with_cti_100.csv'")


Processing question 2197...
Processed 2197 questions
Processing question 411...
Processed 411 questions
Processing question 1390...
Processed 1390 questions
Processing question 889...
Processed 889 questions
Processing question 1773...
Processed 1773 questions
Processing question 57...
Processed 57 questions
Processing question 45...
Processed 45 questions
Processing question 1107...
Processed 1107 questions
Processing question 369...
Processed 369 questions
Processing question 1739...
Processed 1739 questions
Processing question 2244...
Processed 2244 questions
Processing question 195...
Processed 195 questions
Processing question 1966...
Processed 1966 questions
Processing question 1809...
Processed 1809 questions
Processing question 1674...
Processed 1674 questions
Processing question 1617...
Processed 1617 questions
Processing question 2000...
Processed 2000 questions
Processing question 1463...
Processed 1463 questions
Processing question 2001...
Processed 2001 questions
Processin

AttributeError: 'float' object has no attribute 'strip'

In [None]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/finetuned_model/Testing_mymodel_with_cti_100.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 146
Number of Correct Predictions: 66
Accuracy Percentage: 45.21%


##testing my model with this dataset

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model/llama_finetune_version'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Combined model and tokenizer loaded successfully.


In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/output_questions (2).tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['Question', 'Option A', 'Option B', 'Option C', 'Option D', 'Correct Answer', 'Explanation']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Sample 100 random questions
df_sample = df.sample(n=250, random_state=42)  # Adjust random_state for different samples

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the sampled questions and generate responses
results = []
for idx, row in df_sample.iterrows():
    print(f"Processing question {idx + 1}...")  # Debug print to track progress

    question = row['Question']
    options = [row['Option A'], row['Option B'], row['Option C'], row['Option D']]
    correct_answer = row['Correct Answer'].strip().upper()
    explanation = row['Explanation']

    # Prepare the prompt
    formatted_prompt = prepare_prompt(question, options)

    # Tokenize the input
    input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

    # Generate a response
    generated_text = generate_response(input_ids, attention_mask, model)

    # Extract the answer
    predicted_answer = extract_answer(generated_text)

    # Compare the predicted answer with the correct answer
    answer_match = predicted_answer == correct_answer

    # Append results
    results.append({
        'Question': question,
        'Options': options,
        'Correct Answer': correct_answer,
        'Predicted Answer': predicted_answer,
        'Match': answer_match,

    })

    # Save results incrementally
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/finetuned_model/Testing_mymodel_250.csv', index=False)

    print(f"Processed {idx + 1} questions")  # Debug print to track progress

print("All questions processed")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model/Testing_mymodel_250.csv'")


Processing question 2197...
Processed 2197 questions
Processing question 411...
Processed 411 questions
Processing question 1390...
Processed 1390 questions
Processing question 889...
Processed 889 questions
Processing question 1773...
Processed 1773 questions
Processing question 57...
Processed 57 questions
Processing question 45...
Processed 45 questions
Processing question 1107...
Processed 1107 questions
Processing question 369...
Processed 369 questions
Processing question 1739...
Processed 1739 questions
Processing question 2244...
Processed 2244 questions
Processing question 195...
Processed 195 questions
Processing question 1966...
Processed 1966 questions
Processing question 1809...
Processed 1809 questions
Processing question 1674...
Processed 1674 questions
Processing question 1617...
Processed 1617 questions
Processing question 2000...
Processed 2000 questions
Processing question 1463...
Processed 1463 questions
Processing question 2001...
Processed 2001 questions
Processin

AttributeError: 'float' object has no attribute 'strip'

In [None]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('/content/drive/MyDrive/finetuned_model/Testing_mymodel_250.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 146
Number of Correct Predictions: 75
Accuracy Percentage: 51.37%


# finetuning


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd
prompt = """You are a cybersecurity expert specializing in cyber threat intelligence. Below is a question followed by multiple choices. Select the correct answer from the options provided.

### Question:
{}

### Options:
A) {}
B) {}
C) {}
D) {}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    questions = examples["question"]
    option_as = examples["Option_A "]
    option_bs = examples["Option_B"]
    option_cs = examples["Option_C "]
    option_ds = examples["Option_D"]
    correct_answers = examples["Correct_Answer"]

    texts = []
    for question, option_a, option_b, option_c, option_d, correct_answer in zip(questions, option_as, option_bs, option_cs, option_ds, correct_answers):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(question, option_a, option_b, option_c, option_d, correct_answer) + EOS_TOKEN
        texts.append(text)

    return { "text" : texts }

# Assuming you've loaded your custom dataset like this
from datasets import Dataset, DatasetDict

# Load your TSV dataset
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    return Dataset.from_pandas(df)

train_file_path = '/content/drive/MyDrive/myDB.tsv'
train_dataset = load_data(train_file_path)

dataset_dict = DatasetDict({
    'train': train_dataset,
})

# Apply the formatting function to your dataset
formatted_dataset = dataset_dict['train'].map(formatting_prompts_func, batched=True)

# Check the first example to ensure it's formatted correctly
print(formatted_dataset[0])


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

{'question': 'Given a scenario where an adversary is using Command and Control (C2) over Tor network (MITRE ATT&CK T1036), which Diamond Model feature is primarily affected?', 'Option_A ': 'Adversary', 'Option_B': 'Infrastructure', 'Option_C ': 'Capability', 'Option_D': 'Victim', 'Correct_Answer': 'B', 'Explanation': "The adversary may use various capabilities like T1036 to communicate, but it's the infrastructure (Tor network) that is directly impacted.", 'text': 'You are a cybersecurity expert specializing in cyber threat intelligence. Below is a question followed by multiple choices. Select the correct answer from the options provided.\n\n### Question:\nGiven a scenario where an adversary is using Command and Control (C2) over Tor network (MITRE ATT&CK T1036), which Diamond Model feature is primarily affected?\n\n### Options:\nA) Adversary\nB) Infrastructure\nC) Capability\nD) Victim\n\n### Response:\nB<|end_of_text|>'}


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Shuffle and split the dataset into 70% training and 30% testing examples
shuffled_dataset = formatted_dataset.shuffle(seed=42)
train_size = int(0.7 * len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(train_size))  # 70% for training
test_dataset = shuffled_dataset.select(range(train_size, len(shuffled_dataset)))  # 30% for testing

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Use the 70% training dataset
    eval_dataset=test_dataset,  # Use the 30% testing dataset
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=6,  # Increased for faster data processing
    packing=False,  # Set to True if sequences are short to speed up
    args=TrainingArguments(
        per_device_train_batch_size=8,  # Increased batch size
        gradient_accumulation_steps=1,  # Reduced for faster training
        warmup_steps=300,  # Increased warmup steps for stability
        num_train_epochs=3,  # Use epochs instead of steps for training duration
        learning_rate=1e-4,  # Adjusted learning rate
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,  # Reduced logging frequency
        optim="adamw_8bit",  # 8-bit optimizer
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        evaluation_strategy="steps",  # Evaluate during training
        eval_steps=50,  # Reduced evaluation frequency
        save_strategy="steps",  # Save model checkpoint during training
        save_steps=50,  # Reduced checkpoint frequency
        load_best_model_at_end=True,  # Load the best model based on eval metric
    ),
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")




Map (num_proc=6):   0%|          | 0/1750 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/750 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,750 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 657
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,1.9597,1.41698
100,1.1207,1.009516
150,0.9849,0.899981
200,0.8587,0.852218
250,0.7786,0.810153


Step,Training Loss,Validation Loss
50,1.9597,1.41698
100,1.1207,1.009516
150,0.9849,0.899981
200,0.8587,0.852218
250,0.7786,0.810153
300,0.7476,0.774964
350,0.7576,0.730721
400,0.6673,0.70039
450,0.63,0.708974
500,0.4752,0.67471


Evaluation results: {'eval_loss': 0.6419416069984436, 'eval_runtime': 96.9134, 'eval_samples_per_second': 7.739, 'eval_steps_per_second': 0.97, 'epoch': 3.0}


In [None]:
model.save_pretrained("/content/drive/MyDrive/finetuned_model/llama_0.64")
tokenizer.save_pretrained("/content/drive/MyDrive/finetuned_model/llama_0.64")

('/content/drive/MyDrive/finetuned_model/llama_0.64/tokenizer_config.json',
 '/content/drive/MyDrive/finetuned_model/llama_0.64/special_tokens_map.json',
 '/content/drive/MyDrive/finetuned_model/llama_0.64/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model
base_model_directory = '/content/drive/MyDrive/finetuned_model/llama_model'
base_model = AutoModelForCausalLM.from_pretrained(base_model_directory)

# Load the LoRA adapter
lora_adapters_path = "/content/drive/MyDrive/finetuned_model/llama_0.64"
lora_model = PeftModel.from_pretrained(base_model, lora_adapters_path)

# Apply LoRA adapter to the base model
# Assuming PeftModel integrates the adapter with the base model
base_model_with_lora = lora_model

# Define the directory to save the combined model
combined_model_directory = '/content/drive/MyDrive/finetuned_model/full_version_llama_0.64'

# Save the base model
base_model.save_pretrained(combined_model_directory)

# Save the LoRA adapter
lora_model.save_pretrained(combined_model_directory)

# Load the tokenizer from a different source if not found in base_model_directory

tokenizer = AutoTokenizer.from_pretrained(base_model_directory)
tokenizer.save_pretrained(combined_model_directory)

print(f"Combined model saved to {combined_model_directory}")

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Combined model saved to /content/drive/MyDrive/finetuned_model/full_version_llama_0.64


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory where your combined model (base model + LoRA adapter) is saved
combined_model_directory = '/content/drive/MyDrive/finetuned_model/full_version_llama_0.64'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(combined_model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(combined_model_directory)

print("Combined model and tokenizer loaded successfully.")


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Combined model and tokenizer loaded successfully.


In [None]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Installing collected packages: transformers
Successfully installed transformers-4.44.2


In [None]:
import os
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Move model to the same device as input_ids (ensure this matches the device where model is loaded)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset from the TSV file
df = pd.read_csv('/content/drive/MyDrive/cti-mcq.tsv', sep='\t', on_bad_lines='warn')

# Trim any leading/trailing whitespace from column names
df.columns = df.columns.str.strip()

# Ensure necessary columns are present
required_columns = ['URL', 'Question', 'Option A', 'Option B', 'Option C', 'Option D', 'Prompt', 'GT']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {', '.join(missing_columns)}")

# Sample 250 random questions
df_sample = df.sample(n=250, random_state=42)  # Adjust random_state for different samples

# Function to prepare the prompt for the model
def prepare_prompt(question, options):
    options_text = "\n".join([f"Option {chr(65+i)}: {opt}" for i, opt in enumerate(options)])
    formatted_prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. "
        "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. "
        "Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D.\n"
        f"**Question:** {question}\n**Options:**\n{options_text}\n**Important:** The last line of your answer should contain only the single letter corresponding to the best option, with no additional text.\n"
        "**Answer:**"
    )
    return formatted_prompt

# Function to tokenize input
def tokenize_input(formatted_prompt, tokenizer):
    encoding = tokenizer(formatted_prompt, return_tensors='pt')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device) if 'attention_mask' in encoding else None
    return input_ids, attention_mask

# Function to generate response from the model
def generate_response(input_ids, attention_mask, model, max_length=512):
    response = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    return generated_text

# Function to extract the answer from generated text
def extract_answer(generated_text):
    # Find the last line that contains only a single uppercase letter
    match = re.findall(r'\b[A-D]\b', generated_text)
    return match[-1] if match else "No valid answer found"

# Process the sampled questions and generate responses
results = []
for idx, row in df_sample.iterrows():
    print(f"Processing question {idx + 1}...")  # Debug print to track progress

    question = row['Question']
    options = [row['Option A'], row['Option B'], row['Option C'], row['Option D']]
    correct_answer = row['GT'].strip().upper()
    prompt = row['Prompt']  # If you want to use the existing prompt from the dataset

    # Prepare the prompt
    formatted_prompt = prepare_prompt(question, options)

    # Tokenize the input
    input_ids, attention_mask = tokenize_input(formatted_prompt, tokenizer)

    # Generate a response
    generated_text = generate_response(input_ids, attention_mask, model)

    # Extract the answer
    predicted_answer = extract_answer(generated_text)

    # Compare the predicted answer with the correct answer
    answer_match = predicted_answer == correct_answer

    # Append results
    results.append({
        'Question': question,
        'Options': options,
        'Correct Answer': correct_answer,
        'Predicted Answer': predicted_answer,
        'Match': answer_match,
    })

    # Save results incrementally
    results_df = pd.DataFrame(results)
    results_df.to_csv('/content/drive/MyDrive/finetuned_model/Testing_mymodel0.64_with_cti.csv', index=False)

    print(f"Processed {idx + 1} questions")  # Debug print to track progress

print("All questions processed")
print("Predicted answers saved to '/content/drive/MyDrive/finetuned_model/Testing_mymodel0.64_with_cti.csv'")


Processing question 1448...
Processed 1448 questions
Processing question 1115...
Processed 1115 questions
Processing question 1065...
Processed 1065 questions
Processing question 2288...
Processed 2288 questions
Processing question 1538...
Processed 1538 questions
Processing question 669...
Processed 669 questions
Processing question 1584...
Processed 1584 questions
Processing question 2405...
Processed 2405 questions
Processing question 498...
Processed 498 questions
Processing question 2481...
Processed 2481 questions
Processing question 472...
Processed 472 questions
Processing question 247...
Processed 247 questions
Processing question 924...
Processed 924 questions
Processing question 1132...
Processed 1132 questions
Processing question 671...
Processed 671 questions
Processing question 917...
Processed 917 questions
Processing question 1415...
Processed 1415 questions
Processing question 1607...
Processed 1607 questions
Processing question 403...
Processed 403 questions
Processin

In [1]:
import pandas as pd

# Load the results from the CSV file
results_df = pd.read_csv('C:/Users/ACER/Desktop/stage 2024/mcq/fine tuning/my data/Testing_mymodel0.64_with_cti.csv')

# Calculate the number of correct predictions
correct_predictions = results_df['Match'].sum()

# Calculate the total number of questions processed
total_questions = len(results_df)

# Calculate the accuracy percentage
accuracy_percentage = (correct_predictions / total_questions) * 100

# Print the results
print(f"Total Questions Processed: {total_questions}")
print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Accuracy Percentage: {accuracy_percentage:.2f}%")


Total Questions Processed: 250
Number of Correct Predictions: 122
Accuracy Percentage: 48.80%


this is the best finetuned model on my generated dataset 


In [None]:
print("Execution completed. Ending the session.")
import os
os.kill(os.getpid(), 9)

fine tuning another time


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
import pandas as pd
prompt = """You are a cybersecurity expert specializing in cyber threat intelligence. Below is a question followed by multiple choices. Select the correct answer from the options provided.

### Question:
{}

### Options:
A) {}
B) {}
C) {}
D) {}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    questions = examples["question"]
    option_as = examples["Option_A "]
    option_bs = examples["Option_B"]
    option_cs = examples["Option_C "]
    option_ds = examples["Option_D"]
    correct_answers = examples["Correct_Answer"]

    texts = []
    for question, option_a, option_b, option_c, option_d, correct_answer in zip(questions, option_as, option_bs, option_cs, option_ds, correct_answers):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(question, option_a, option_b, option_c, option_d, correct_answer) + EOS_TOKEN
        texts.append(text)

    return { "text" : texts }

# Assuming you've loaded your custom dataset like this
from datasets import Dataset, DatasetDict

# Load your TSV dataset
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t')
    return Dataset.from_pandas(df)

train_file_path = '/content/drive/MyDrive/myDB.tsv'
train_dataset = load_data(train_file_path)

dataset_dict = DatasetDict({
    'train': train_dataset,
})

# Apply the formatting function to your dataset
formatted_dataset = dataset_dict['train'].map(formatting_prompts_func, batched=True)

# Check the first example to ensure it's formatted correctly
print(formatted_dataset[0])


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Shuffle and split the dataset into 70% training and 30% testing examples
shuffled_dataset = formatted_dataset.shuffle(seed=42)
train_size = int(0.7 * len(shuffled_dataset))
train_dataset = shuffled_dataset.select(range(train_size))  # 70% for training
test_dataset = shuffled_dataset.select(range(train_size, len(shuffled_dataset)))  # 30% for testing

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Use the 70% training dataset
    eval_dataset=test_dataset,  # Use the 30% testing dataset
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=6,  # Increased for faster data processing
    packing=False,  # Set to True if sequences are short to speed up
    args=TrainingArguments(
        per_device_train_batch_size=8,  # Increased batch size
        gradient_accumulation_steps=1,  # Reduced for faster training
        warmup_steps=300,  # Increased warmup steps for stability
        num_train_epochs=3,  # Use epochs instead of steps for training duration
        learning_rate=1e-4,  # Adjusted learning rate
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,  # Reduced logging frequency
        optim="adamw_8bit",  # 8-bit optimizer
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        evaluation_strategy="steps",  # Evaluate during training
        eval_steps=50,  # Reduced evaluation frequency
        save_strategy="steps",  # Save model checkpoint during training
        save_steps=50,  # Reduced checkpoint frequency
        load_best_model_at_end=True,  # Load the best model based on eval metric
    ),
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
