# DetectLLaMA

## Training the model

In [None]:
!pip install accelerate peft bitsandbytes transformers trl
#Installing Dependencies

In [2]:
# Installing More Dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
#These are the model paths and should point to the model. You will also need the full Llama 3 8B Instruct model in the same folder as the notebook
model_id = "./Meta-Llama-3-8B-Instruct"
#model_id = "./llama38B-Fine-tunedPhishv7/model"
output_model="llama38B-Fine-tunedPhishv9"
model_output_folder = "llama38B-Fine-tunedPhishv9/model"

In [5]:
#load in the dataset
df = pd.read_csv("./Phishing_Email.csv")
df = df[['Email Text', 'Email Type']].rename(columns={'Email Text': 'text', 'Email Type': 'label'})
df = df[['label', 'text']]
df.isnull().sum()
df = df.dropna()
df.duplicated().sum()
df = df.drop_duplicates(keep='first')
df.duplicated().sum()


0

In [16]:
train, test = train_test_split(df, test_size=1000, random_state=42)
train.to_csv("./PhishTrain.csv", index=False)
test.to_csv("./PhishTest.csv", index=False)

In [6]:
train = pd.read_csv("./PhishTrain.csv")
test = pd.read_csv("./PhishTest.csv")

In [7]:
#format the data
training_data = train.apply(lambda row: {"prompt": str(row["text"]), "response": f"This is a {str(row['label'])}"}, axis=1).tolist()

In [8]:
#this puts system prompts before the emails to help with learning
def prepare_train_data(data):
    # Convert the data to a Pandas DataFrame
    data_df = pd.DataFrame(data)

    # Create a new column called "text"
    data_df["text"] = data_df[["prompt", "response"]].apply(lambda x: "<|im_start|>system\n You are an expert in identifying phishing emails. The user will provide you an email and you should response 'This is a Phishing Email' if the email is a Phishing Email or 'This is a Safe Email' if it is a Safe Email depending on what the email is. You should only respond with this.<|im_end|>\n <|im_start|>user\n Today I want you to help me decide if this email is a phishing email. Here is the email" + x["prompt"] + " <|im_end|>\n<|im_start|>assistant\n" + x["response"] + "<|im_end|>\n", axis=1)

    # Create a new Dataset from the DataFrame
    data = Dataset.from_pandas(data_df)

    return data

In [9]:
#show that the training and testing data are varied
print(train['label'].value_counts(normalize=True))
print(test['label'].value_counts(normalize=True))

label
Safe Email        0.625952
Phishing Email    0.374048
Name: proportion, dtype: float64
label
Safe Email        0.628
Phishing Email    0.372
Name: proportion, dtype: float64


In [10]:
data = prepare_train_data(training_data)

In [10]:
def get_model_and_tokenizer(model_id):

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map="auto"
    )
    model.config.use_cache=False
    model.config.pretraining_tp=1
    return model, tokenizer

In [11]:
#get the model and tokenizers
model, tokenizer = get_model_and_tokenizer(model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
#these are the training arguments
peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

In [13]:
training_arguments = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=16,
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=15,
        
        fp16=True,
        push_to_hub=False
    )
#max_steps=250,

In [14]:
trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=1024
    )


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/16537 [00:00<?, ? examples/s]

  super().__init__(


In [15]:
print(model_id)
print(model_output_folder)
print(output_model)

./llama38B-Fine-tunedPhishv7/model
llama38B-Fine-tunedPhishv9/model
llama38B-Fine-tunedPhishv9


In [16]:
#train the model
trainer.train()

Step,Training Loss
10,2.5179
20,2.106
30,1.9076
40,1.882
50,1.8901
60,1.7873
70,1.8688
80,1.8079
90,1.8854
100,1.7791


TrainOutput(global_step=3870, training_loss=1.5449937660256714, metrics={'train_runtime': 28930.6377, 'train_samples_per_second': 8.574, 'train_steps_per_second': 0.134, 'total_flos': 8.617293829184176e+18, 'train_loss': 1.5449937660256714, 'epoch': 14.974607013301089})

In [17]:
#save the model
trainer.model.save_pretrained(model_output_folder)
tokenizer.save_pretrained(model_output_folder)

('llama38B-Fine-tunedPhishv9/model/tokenizer_config.json',
 'llama38B-Fine-tunedPhishv9/model/special_tokens_map.json',
 'llama38B-Fine-tunedPhishv9/model/tokenizer.json')

## Evaluate the Model

You may need to restart to fit the model in memory

In [1]:
# Installing More Dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

Now we download our fine-tuned model directly from huggingface:


In [21]:
model_id="./llama38B-Fine-tunedPhishv7/model"

In [10]:
def get_model_and_tokenizer(model_id):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_id, quantization_config=bnb_config, device_map="auto"
  )
  model.config.use_cache=False
  model.config.pretraining_tp=1
  return model, tokenizer

In [11]:
model, tokenizer = get_model_and_tokenizer(model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
#format the data
testing_data = test.apply(lambda row: {"prompt": str(row["text"]), "response": f"This is a {str(row['label'])}"}, axis=1).tolist()

In [13]:
#this formats the test data
import pandas as pd

def prepare_test_data(data):
    # Convert the data to a Pandas DataFrame
    data_df = pd.DataFrame(data)

    # Create a new column called "text"
    data_df["text"] = data_df[["prompt", "response"]].apply(lambda x: "<|im_start|>system\n You are an expert in identifying phishing emails. The user will provide you an email and you should response 'This is a Phishing Email' if the email is a Phishing Email or 'This is a Safe Email' if it is a Safe Email depending on what the email is. You should only respond with this.<|im_end|>\n <|im_start|>user\n Today I want you to help me decide if this email is a phishing email. Here is the email" + x["prompt"] + " <|im_end|>\n <|im_start|> assistant:", axis=1)

    # Create a new Dataset from the DataFrame
    data = Dataset.from_pandas(data_df)

    return data

In [14]:
testing = prepare_test_data(testing_data)

In [15]:
from tqdm import tqdm
from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline
#this goest through the test set and runs it through the model
def predict(test, model, tokenizer):
    y_pred = []
    ans = []
    categories = ["Phishing", "Safe"]
    
    for i in tqdm(range(len(test))):
        prompt = test[i]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=15, 
                        temperature=0.1)
        
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("assistant:")[-1].strip()
        ans.append(answer)
        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred, ans

y_pred, ans = predict(testing['text'], model, tokenizer)

 51%|████████████████████▍                   | 512/1000 [02:52<02:13,  3.66it/s]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
100%|███████████████████████████████████████| 1000/1000 [05:32<00:00,  3.00it/s]


In [20]:
#this gets the ground truth of each email
y_true = []
for i in tqdm(range(len(testing))):
    t = testing[i]["response"]
    if t == "This is a Safe Email":
        y_true.append('Safe')
    else:
        y_true.append('Phishing')
        

100%|████████████████████████████████████| 1000/1000 [00:00<00:00, 25243.62it/s]


In [21]:
#this prints out how many were mispredicted and whether it was a hallucination or not
nones = []
phish = []
safe = []
for i in range(0,1000):
    if y_pred[i] == "none":
        nones.append(ans[i])
    elif y_pred[i] == "Phishing" and y_true[i] != "Phishing":
        phish.append(ans[i])
        
    elif y_pred[i] == "Safe" and y_true[i] != "Safe":
        safe.append(ans[i])


print("Mispredicts for Safe emails: " + str(len(phish)))
print("Mispredicts for Phishing emails: " + str(len(safe)))
print("Hallucinations: " + str(len(nones)))


Mispredicts for Safe emails: 339
Mispredicts for Phishing emails: 8
Hallucinations: 3


In [22]:
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
#this evalutates the model
def evaluate(y_true, y_pred):
    labels = ["Phishing", "Safe"]
    mapping = {label: idx for idx, label in enumerate(labels)}
    
    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data
    
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [19]:
evaluate(y_true, y_pred)

Accuracy: 0.650
Accuracy for label Phishing: 0.976
Accuracy for label Safe: 0.457

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.52      0.98      0.68       372
        Safe       0.97      0.46      0.62       628

   micro avg       0.65      0.65      0.65      1000
   macro avg       0.74      0.72      0.65      1000
weighted avg       0.80      0.65      0.64      1000


Confusion Matrix:
[[363   8]
 [339 287]]


In [22]:
evaluate(y_true, y_pred)

Accuracy: 0.815
Accuracy for label Phishing: 0.939
Accuracy for label Safe: 0.750

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.91      0.94      0.93       344
        Safe       0.98      0.75      0.85       656

   micro avg       0.95      0.81      0.88      1000
   macro avg       0.95      0.84      0.89      1000
weighted avg       0.96      0.81      0.88      1000


Confusion Matrix:
[[323   8]
 [ 31 492]]


In [34]:
evaluate(y_true, y_pred)

Accuracy: 0.798
Accuracy for label Phishing: 0.914
Accuracy for label Safe: 0.729

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.95      0.91      0.93       372
        Safe       0.97      0.73      0.83       628

   micro avg       0.96      0.80      0.87      1000
   macro avg       0.96      0.82      0.88      1000
weighted avg       0.96      0.80      0.87      1000


Confusion Matrix:
[[340  14]
 [ 19 458]]


In [39]:
evaluate(y_true, y_pred)

Accuracy: 0.799
Accuracy for label Phishing: 0.936
Accuracy for label Safe: 0.727

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.92      0.94      0.93       344
        Safe       0.98      0.73      0.83       656

   micro avg       0.95      0.80      0.87      1000
   macro avg       0.95      0.83      0.88      1000
weighted avg       0.96      0.80      0.87      1000


Confusion Matrix:
[[322  10]
 [ 29 477]]


In [18]:
evaluate(y_true, y_pred)

Accuracy: 0.788
Accuracy for label Phishing: 0.910
Accuracy for label Safe: 0.724

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.97      0.91      0.94       344
        Safe       0.98      0.72      0.83       656

   micro avg       0.97      0.79      0.87      1000
   macro avg       0.97      0.82      0.89      1000
weighted avg       0.97      0.79      0.87      1000


Confusion Matrix:
[[313  11]
 [ 10 475]]


In [146]:
evaluate(y_true, y_pred) #11/30 second attempt

Accuracy: 0.789
Accuracy for label Phishing: 0.924
Accuracy for label Safe: 0.718

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.96      0.92      0.94       344
        Safe       0.98      0.72      0.83       656

   micro avg       0.98      0.79      0.87      1000
   macro avg       0.97      0.82      0.89      1000
weighted avg       0.98      0.79      0.87      1000


Confusion Matrix:
[[318   8]
 [ 12 471]]


In [135]:
evaluate(y_true, y_pred) #11/30 first attempt

Accuracy: 0.808
Accuracy for label Phishing: 0.922
Accuracy for label Safe: 0.748

Classification Report:
              precision    recall  f1-score   support

    Phishing       0.97      0.92      0.94       344
        Safe       0.98      0.75      0.85       656

   micro avg       0.98      0.81      0.88      1000
   macro avg       0.97      0.83      0.90      1000
weighted avg       0.98      0.81      0.88      1000


Confusion Matrix:
[[317   9]
 [ 11 491]]
