<a href="https://colab.research.google.com/github/orionhunts-ai/new_models_datasets/blob/main/dataset_pre_llama_mini_cyberdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#. Critical Setup Steps

In [83]:
%pip install loguru uuid
import os
import sys
import random
import uuid
from loguru import logger
from google.colab import drive, userdata
drive.mount('/content/drive')


# Define the directory for logs
LOG_DIR = "./logs"
os.makedirs(LOG_DIR, exist_ok=True)
ran_1 = random.randint(0,20)
ran_2 = random.randint(0,20)
id=str(uuid.uuid4())[ran_1:ran_2]

# Define the log file path
LOG_FILE_PATH = os.path.join(LOG_DIR, f"app_{__name__}.log")

# Remove any default handlers
logger.remove()

# Add a console handler with colorized output
logger.add(
    sys.stderr,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
           "<level>{level: <8}</level> | "
           "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
           "<level>{message}</level>",
    level="DEBUG",
    colorize=True,
)

# Add a file handler with colorized output preserved in the log file
logger.add(
    LOG_FILE_PATH,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
           "<level>{level: <8}</level> | "
           "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
           "<level>{message}</level>",
    level="DEBUG",
    colorize=True,
    rotation="10 MB",  # Rotate the log file after 10 MB
    retention="10 days",  # Retain log files for 10 days
    compression="zip"  # Compress the rotated log files
)

# Example log messages for testing
def main():
    logger.debug("This is a debug message.")
    logger.info("This is an info message.")
    logger.warning("This is a warning message.")
    logger.error("This is an error message.")
    logger.critical("This is a critical message.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Fine Tuning OpenAI GPT-4o-Mini For (Free) Agentic

*   476 Training Examples
*   Cyber focused to assist with labelling threats

Cyber 👾

  *  Training (and reporting) on Google Colab
   for access to their high powered CUDA and  
  Leveraging
    * fine tuning on dataset ```"swaption2009/cyber-threat-intelligence-custom-data”```
    * Open AI offering this mini version of the already efficient gpt4o.
        * OAI claims that the mini is almost as performant (but it's 20x cheaper)
    * Sampling from the full set for those that are most relevant to Cyber Security Analysts.***

    * Aside from traditional and evolving Evaluations I will also deploy a number of the finely tuned models in a Microsoft Autogen agentic environment to see how they perform on basic analysis on a database.

    * ```Red Panda``` (a high performance streaming data alternative to ```Kafka``` will be used)



##  Data and Tool Preparation
**Summary:**

This study explores the fine-tuning of the Phi-3-small-instruct model (7.39 billion parameters) by using Daniel So's Unsloth for a Cyber Threat Intelligence (CTI) task using methods like Parameter-Efficient Fine-Tuning (PEFT), Low-Rank Adaptation (LoRA), and Quantized Low-Rank Adaptation (QLoRA). It aims to evaluate performance degradation, model collaboration in agentic environments, and the potential influence of GPT-4. Synthetic data from gretel.ai was also utilized to supplement the fine-tuning process and enhance data diversity and robustness.

In [84]:
# @title Install Core Libraries { run: "auto", display-mode: "form" }

%pip install -qqq transformers==4.42.4 datasets accelerate arrow pyarrow
%pip install -qqq torch requests joblib uuid tqdm
%pip install -qqq huggingface_hub wandb
%pip -qqq install -U  pyarrow
###LOGURU CONFIG### ALEX NOTE: DEPENDENCY BUT THIS IS NOT PROD



In [85]:
job_type="finetuning"
pretrained = "gpt4o-mini"

if job_type == "finetuning":
  ##### WHAT ARE YOU FINE TUNING #####
  ## FINE TUNING MODEL

  pretrained = "gpt4o-mini"
  project_name = f"orion-{job_type}-{pretrained}-0x0"

project_name = f"orion-{job_type}-general-0x0"

BASE_URL = userdata.get('__BASE_URL__')
data_content=f"{BASE_URL}'{project_name}/datasets/'"
model_content=f"{BASE_URL}/{project_name}/models/"

####

In [86]:
##### WHAT ARE YOU FINE TUNING #####
from google.colab import userdata
import huggingface_hub
import wandb
from huggingface_hub import notebook_login
hf_hub_key = userdata.get("HF_HUB_WRITE")
!export HF_HUB_WRITE=hf_hub_key
wandb_api = userdata.get("WANDB_API_KEY")
!export WANDB_API_KEY=wandb_api
#huggingface_hub.login(token=hf_hub_key, add_to_git_credential=True, write_permission=True)
#notebook_login(new_session=False, write_permission=True)
!wandb login $WANDB_API_KEY
!huggingface-cli login --token=$HF_HUB_WRITE
from tqdm.notebook import trange, tqdm
from tqdm import tqdm_notebook, tqdm_pandas, tqdm
import pandas as pd
import numpy as np
from typing import List, Dict, Union, Optional
import torch
import arrow
import pyarrow
import matplotlib.pyplot as plt
import joblib
tqdm.pandas()

domain = "cyber"

model_name = f'orion-{pretrained}-0x0-{id}'
dataset_name=f'orion-{domain}-dataset-0x0-{id}'
####PROJECT####
#### FINE TUNE

OLD_DATASET = "swaption2009/cyber-threat-intelligence-custom-data"
os.environ["WANDB_MODE"] = "online"
os.environ["WANDB_PROJECT"] = project_name
os.environ["WANDB_NOTEBOOK_NAME"] = f"{os.path.dirname(os.path.abspath(os.path.expanduser(__name__)))}"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

[34m[1mwandb[0m: Currently logged in as: [33morionhunts[0m ([33msynavatelabs[0m). Use [1m`wandb login --relogin`[0m to force relogin
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 51, in main
    service.run()
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/commands/user.py", line 98, in run
    login(token=self.args.token, add_to_git_credential=self.args.add_to_git_credential)
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/_login.py", line 111, in login
    _login(token, add_to_git_credential=add_to_git_credential, write_permission=write_permission)
  Fil

In [87]:
# Construct the project stats string
_project_stats = f"""
#### {project_name.upper()} STATS ######
PWD: {os.getcwd()}
BASE_URL: {BASE_URL}
DATA_URL: ./{project_name}/datasets/
MODEL_URL: ./{project_name}/models/
PRETRAINED: {pretrained}
PROJECT NAME: {project_name}
DATA_OR_MODEL: {userdata.get('DATA_OR_MODEL')}
"""
logger.info(_project_stats)

[32m2024-07-29 08:47:11[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 12>[0m:[36m12[0m - [1m
#### ORION-FINETUNING-GENERAL-0X0 STATS ######
PWD: /content/drive/MyDrive/repos/datasets🤗
BASE_URL: /content/drive/MyDrive/repos/datasets🤗/
DATA_URL: ./orion-finetuning-general-0x0/datasets/
MODEL_URL: ./orion-finetuning-general-0x0/models/
PRETRAINED: gpt4o-mini
PROJECT NAME: orion-finetuning-general-0x0
DATA_OR_MODEL: DATA
[0m


In [88]:
def wandb_run(name, project_type, project=project_name):
  try:
    if wandb.run is None and os.environ["WANDB_MODE"] != "disabled":
      wandb.init(project=project, job_type=project_type, name=name)
      project_init = wandb.init(project=project_name, job_type= "data_preparation" ,dir=f"{BASE_URL}{project_name}/")
      logger.info(f"###WANDB_RUN:{wandb.run.name}###")
    #elif os.environ["WANDB_MODE"] == "disabled":
      logger.info(f"##### WANDB DISABLED ### ")

  except Exception as e:
    logger.error(e, exc_info=True)



In [89]:
####PROJECT DEFINITION######

model_prov = {"openai": f"gpt4o-mini",
               "mistral": f"mistral-7b",
               "orion": f"gpt-4o-mini"}

model_prov=model_prov.get("openai")



if wandb.run is None:
  run = wandb.init(project=project_name, job_type=job_type, dir=f"{BASE_URL}{project_name}")
  logger.info("run {} is running", {wandb.run.name})


# WandB Helpers


In [90]:
### LOG TRAINING ARTEFACT ###
def init_run(project_name=project_name, job_type="training"):
  if wandb.run is None:
    new_run = f'run-{random.random():.2f}'
    print(new_run)
    wandb.init(project_name, new_run, job_type="training")
  else:
    wandb.run.finish()
    new_run = f'run-{random.random():.2f}'
    wandb.init(project_name, new_run, job_type="training")


def log_model_artifact(artifact_type, project=project_name, model_name=model_name, run=wandb.run):
  if wandb.run != None:
    wandb.run.finish()






In [91]:
# @title DatasetLog


def preprocess_and_log(steps):

    with wandb.init(project=project_name, job_type=job_type) as run:

        processed_data = wandb.Artifact(
            dataset_name, type="dataset",
            description="Cyber Threat Intelligence Dataset",
            metadata=steps)

        # ✔️ declare which artifact we'll be using
        raw_data_artifact = run.use_artifact(f'{df_final}:latest')
        Download = False
        if Download==True:
          raw_dataset = raw_data_artifact.download()

        for split in ["training", "validation", "test"]:
            raw_split = read(raw_dataset, split)
            processed_dataset = preprocess(raw_split, **steps)

            with processed_data.new_file(split + ".pt", mode="wb") as file:
                x, y = processed_dataset.tensors
                torch.save((x, y), file)

        run.log_artifact(processed_data)

In [92]:
# Make a repo on HF
def hf_repo(name=project_name, token=userdata.get("HF_HUB_WRITE")):
  new_repot = False
  if new_repo == False or huggingface_hub.repo_exists(repo_id=name):
    pass
    logger.info(f"Repo {name} already exists")
  else:
    huggingface_hub.create_repo(repo_id=name)
    logger.info(f"Created repo {name}")


In [93]:
class WandBArtifact():
  def __init__(self, artifact_name, data, run=wandb.run, type=Union["data", "model","table"]):
    self.data=data
    self.artifact_name = artifact_name
    self.run = run
    self.artifact = artifact
    self.type = type
    self.out_path = out_path

    if self.type == "data":
      artifact = wandb.Table(dataframe=artifact_name, columns=[col for col in data.columns])

    # Add the table to an Artifact to increase the row
    # limit to 200000 and make it easier to reuse
    new_artifact = wandb.Artifact(f"{project}_{artifact_name}-{self.type}", type=self.type)
    artifact.add(data, artifact_name)

    # log the raw csv file within an artifact to preserve our data
    artifact.add_file(f"{BASE_URL}/{model_name}.csv")

    # Start a W&B run to log data
    run = f"{random.randint(0,50)}-run"
    if wandb.run != run:
      wandb.init(project=project_name, job_type="data_preparation", name=run)
    # Log the table to visualize with a run...
    run.log({"artifact_name": f"{artifact_name}"})
    run.log({"job_type": f"{job_type}"})
    run.save()

    # and Log as an Artifact to increase the available row limit!
    run.log_artifact(new_artifact)
    run.save()

    logger.info(f"{artifact} MADE\n")

# PRE-LOADING DATA

In [94]:
def load_and_log():

    # 🚀 start a run, with a type to label it and a project it can call home
    with wandb.init(project=project_name, job_type=job_type) as run:

        datasets = load()  # separate code for loading the datasets
        names = ["training", "validation", "test"]

        # 🏺 create our Artifact
        raw_data = wandb.Artifact(
            dataset_name, type="dataset",
            description="Cyber Threat Intelligence",
            metadata={"source": "orion@synavate.tech",
                      "sizes": [len(dataset) for dataset in datasets]})

        for name, data in zip(names, datasets):
            # 🐣 Store a new file in the artifact, and write something into its contents.
            with raw_data.new_file(name + ".pt", mode="wb") as file:
                x, y = data.tensors
                torch.save((x, y), file)

        # ✍️ Save the artifact to W&B.
        run.log_artifact(raw_data)



In [95]:
#from transformers import Trainer, TrainingArguments

In [96]:
from datasets import load_dataset
from datasets import load_dataset
import pandas as pd
import pyarrow

try:
  ds = load_dataset("synavate/cti-dataset-noanalysis")
  df_train = ds["train"].to_pandas()
  df_test = ds["test"].to_pandas()
except Exception as e:
  logger.error(e, exc_info=True)
  splits = {'train': 'data/train-00000-of-00001-a5f719ecb6a1a571.parquet', 'test': 'data/test-00000-of-00001-40c2079e33707fe3.parquet'}
  df_train = pd.read_parquet("hf://datasets/synavate/cti-dataset-noanalysis/" + splits['train'])
  df_test = pd.read_parquet("hf://datasets/synavate/cti-dataset-noanalysis/" + splits['test'])

# Data as Artifact Artifact

In [97]:
#export raw processed data CSV
os.chdir(BASE_URL)
out_path = os.path.join(BASE_URL,"/" "output/", project_name)


In [98]:
df_train["scenario_outcome"][0]

'Scenario: Credential theft has been known to be a key part of the Shamoon 2 attacks.!!$$!!Outcome: The threat actor in the Shamoon 2 attacks is using credential theft as a key part of their attack pattern.'

# Pre-Training DataLoading (Saved Data)


In [99]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cpu'

# Sentiment Analysis


In [100]:
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig


In [101]:
def preprocess_text(text):
    new_text = []
    for t in text[t].split("!!$$!!"):
        t = 'scenario' if t.startswith("Scenario:") and len(t) > 1 else t
        df_sentiment["scenario"][t] = new_text
        t = 'outcome' if t.startswith("Outcome:") else t
        df_sentiment["outcome"][t] = new_text
        new_text.append(t)
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        text["scores"]=scores


def rank_scores(scores):
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = config.id2label[ranking[i]]
        s = scores[ranking[i]]
        scores["ranking"] = (f"{i+1}) {l} {np.round(float(s), 4)}")

In [102]:
df_sentiment = df_train.copy()
wandb.init(project=project_name, job_type="sentiment", dir=f"{BASE_URL}{project_name}/")
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [103]:
model.save_pretrained(save_directory=out_path, push_to_hub=True, token=userdata.get("HF_FINE"))
tokenizer.save_pretrained(save_directory=out_path)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

In [158]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def sentiment_task_score(text, classifier=classifier):
  score = 0.0
  output = classifier(text)
  # Access the score from the dictionary in the list
  score = output[0]["score"]
  label = output[0]["label"]
  # Create a dictionary to store the results
  result = {}
  if score >= 0.7:
    score = score
    label = "Positive"
    result["scores"]= score
    result["labels"]= label
  elif score >= 0.5 and score < 0.7:
    score = score
    label = "Neutral"
    result["scores"]=score
    result["labels"]=label
  elif score <= 0.5 and score <= 0.7:
    score = score
    label="Neutral"
    result["scores"]=score
    result["labels"]=label
  elif score <= 0.5:
    score = score
    label = "Negative"
    result["scores"]=score
    result["labels"]=label
  else:
    result["scores"]=score
    result["labels"]=label
  # Return the dictionary with the results
  return result

df_sentiment = df_sentiment["scenario_outcome"].progress_apply(sentiment_task_score)
# Convert the results to a new DataFrame

df_results = pd.DataFrame(df_sentiment.tolist())
# Optionally, join the results with the original DataFrame
df_sentiment = pd.concat([df_sentiment, df_results], axis=1)






100%|██████████| 380/380 [04:50<00:00,  1.31it/s]


In [159]:
df_sentiment.head()
#df_sentiment.drop("__index_level_0__", inplace=True, axis=1)



Unnamed: 0,scenario_outcome,scores,labels
0,"{'scores': 0.8235331773757935, 'labels': 'Posi...",0.823533,Positive
1,"{'scores': 0.7905411720275879, 'labels': 'Posi...",0.790541,Positive
2,"{'scores': 0.5350706577301025, 'labels': 'Neut...",0.535071,Neutral
3,"{'scores': 0.5951200723648071, 'labels': 'Neut...",0.59512,Neutral
4,"{'scores': 0.5776233673095703, 'labels': 'Neut...",0.577623,Neutral


In [163]:
df_export=df_sentiment.copy()
os.mkdir(f"{BASE_URL}/{project_name}/")
df_export.to_csv(f"{BASE_URL}/{project_name}/orion-dataset-sentiment-analysis.csv")
df_export=push_to_hub("synavate/{project_name}-sentiment-OxO")


NameError: name 'push_to_hub' is not defined

In [6]:
#%pip install -qqq datasets huggingface_hub
from google.colab import userdata
BASE_URL=userdata.get('__BASE_URL__')
job_type="sentiment-analysis"
project_name = f"orion-{job_type}-general-0x0"
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder, Repository
import shutil

In [68]:
df_full=pd.read_csv('/content/drive/MyDrive/repos/datasets🤗/cti-dataset-noanalysis/output/dataset_name-3ffdb458--x00x.csv')
df_full.head()

Unnamed: 0.1,Unnamed: 0,text,diagnosis,solutions,text_pr,diagnosis_pr,solutions_pr,scenario_outcome
0,0,A cybersquatting domain save-russia[.]today is...,The diagnosis is a cyber attack that involves ...,1. Implementing DNS filtering to block access ...,"['cybersquatting', 'domain', 'today', 'launchi...","['diagnosis', 'cyber', 'attack', 'involves', '...","['implementing', 'dns', 'filtering', 'block', ...",Scenario: A cybersquatting domain save-russia[...
1,1,"Like the Android Maikspy, it first sends a not...",The diagnosis is that the entity identified as...,1. Implementing a robust anti-malware software...,"['like', 'android', 'maikspy', 'first', 'sends...","['diagnosis', 'entity', 'identified', 'softwar...","['implementing', 'robust', 'software', 'detect...","Scenario: Like the Android Maikspy, it first s..."
2,2,While analyzing the technical details of this ...,Diagnosis: APT37/Reaper/Group 123 is responsib...,1. Implementing advanced threat detection tech...,"['analyzing', 'technical', 'details', 'malware...","['diagnosis', '123', 'responsible', 'malware',...","['implementing', 'advanced', 'threat', 'detect...",Scenario: While analyzing the technical detail...
3,3,(Note that Flash has been declared end-of-life...,The diagnosis is a malware infection. The enti...,1. Implementing a robust antivirus software th...,"['note', 'flash', 'declared', 'adobe', 'since'...","['diagnosis', 'malware', 'infection', 'entity'...","['implementing', 'robust', 'antivirus', 'softw...",Scenario: (Note that Flash has been declared e...
4,4,Figure 21. Connection of Maikspy variants to 1...,The diagnosis is that Maikspy malware variants...,1. Implementing a robust firewall system that ...,"['figure', 'connection', 'maikspy', 'variants'...","['diagnosis', 'maikspy', 'malware', 'variants'...","['implementing', 'robust', 'firewall', 'system...",Scenario: Figure 21. Connection of Maikspy var...


In [173]:
df_complete=df_full.copy()
df_complete['labels']=df_export["labels"]
df_complete.shape
df_complete.to_csv("/content/drive/MyDrive/repos/datasets🤗/sentiment/full-sentiment-analysis-orion-cti-cyber-data-0x0.csv")

#ds_train, ds_test = df_completetrain_test_split(df_complete, test_size=0.2, random_state=42)
#df_complete.push_to_hub("synavate/orion-sentiment-analysis-cti-cyber-data-0x0")


In [198]:
### OpenAI VERSION ###
#df_to_ds = pd.read_csv("/content/drive/MyDrive/repos/datasets🤗/sentiment/full-sentiment-analysis-orion-cti-cyber-data-0x0.csv"")
#df_to_ds.drop(columns=["Unnamed: 0.1"], inplace=True)
#df_to_ds.to_csv("/content/drive/MyDrive/repos/datasets🤗/sentiment/full-sentiment-analysis-orion-cti-cyber-data-0x0.csv")
#ds = Dataset.from_pandas(df_to_ds)
#ds_train=ds.train_test_split(train_size=0.8)
sentiment_analysis_dataset = ds_train.copy()
DataDict = DatasetDict(sentiment_analysis_dataset)
DataDict.push_to_hub("synavate/v1.0-orion-sentiment-analysis-cti-cyber-data-0x0", token=userdata.get("HF_FINE"))




Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/synavate/v1.0-orion-sentiment-analysis-cti-cyber-data-0x0/commit/7dc754f3d8bb8d2ddecac132093f78f078c4b7e9', commit_message='Upload dataset', commit_description='', oid='7dc754f3d8bb8d2ddecac132093f78f078c4b7e9', pr_url=None, pr_revision=None, pr_num=None)

In [188]:
ds_train, ds_split = Dataset.train_test_split(train_size=0.8,test_size=0.2)
ds_train[0]

't'

DatasetDict({
    train: train
    test: test
})

In [64]:
df_export.head()

Unnamed: 0.1,Unnamed: 0,scenario_outcome,scores,labels,scenario_w_outcome
0,0,"{'scores': 0.8235331773757935, 'labels': 'Posi...",0.823533,Positive,"{'scores': 0.8235331773757935, 'labels': 'Posi..."
1,1,"{'scores': 0.7905411720275879, 'labels': 'Posi...",0.790541,Positive,"{'scores': 0.7905411720275879, 'labels': 'Posi..."
2,2,"{'scores': 0.5350706577301025, 'labels': 'Neut...",0.535071,Neutral,"{'scores': 0.5350706577301025, 'labels': 'Neut..."
3,3,"{'scores': 0.5951200723648071, 'labels': 'Neut...",0.59512,Neutral,"{'scores': 0.5951200723648071, 'labels': 'Neut..."
4,4,"{'scores': 0.5776233673095703, 'labels': 'Neut...",0.577623,Neutral,"{'scores': 0.5776233673095703, 'labels': 'Neut..."


(380, 4)

In [15]:
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder, Repository
import shutil

# Save the dataset locally

HF_WRITE=userdata.get("HF_FINE")
!export HF_WRITE=HF_FINE
api = HfApi(token=HF_WRITE)
# Authenticate and Initialize HfApi


# Verify the token works
user_info = api.whoami()
print(user_info)

{'type': 'user', 'id': '65a986f62ed95c799fcaee9f', 'name': 'synavate', 'fullname': 'Synavate Labs', 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/65a986f62ed95c799fcaee9f/jgKEP9AIlmwtaSlHDxWtF.jpeg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'hf-fine', 'role': 'fineGrained', 'createdAt': '2024-07-29T10:10:21.744Z', 'fineGrained': {'canReadGatedRepos': True, 'global': ['inference.serverless.write', 'discussion.write', 'post.write'], 'scoped': [{'entity': {'_id': '663f9b9f4ba1e60baea58207', 'type': 'dataset'}, 'permissions': ['repo.content.read', 'discussion.write', 'repo.write']}, {'entity': {'_id': '6698fc4c4b8f77ee8393d7a3', 'type': 'model'}, 'permissions': ['repo.content.read', 'discussion.write', 'repo.write']}, {'entity': {'_id': '66a24ba513bb4260edef8fb1', 'type': 'model'}, 'permissions': ['repo.content.read', 'discussion.write', 'repo.write']}, {'entity': {'_id': '66a430

In [None]:
from huggingface_hub import HfApi
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
# Install necessary libraries
# Make sure you are logged into the Hugging Face CLI
# Install necessary libraries
#!pip install huggingface-hub datasets
#!huggingface-cli login --token
# Initialize HfApi
api = HfApi(user_agent="synavate",token=userdata.get('HF_FINE'))
#--token $HF_WRITE --add-to-git-credential


# Define your repository name and the namespace (usually your username)
repo_name = "cyber-orion-oai-chat-format-0x0"
namespace = "synavate"



# Create the dataset repository on the Hugging Face Hub
#api.create_repo(repo_id=f"{namespace}/{repo_name}", repo_type="dataset")



# Convert to pandas DataFrames
#df_train = pd.DataFrame(df_export[0:72])
#df_test = pd.DataFrame(df_export[72:])



In [169]:
df_out = df_oai.copy()
df_out = df_oai.drop(["Unnamed: 0"], axis=1)
df_out.to_csv("/content/drive/MyDrive/repos/datasets🤗/orion-finetuning-general-0x0/cyber-orion-oai-chat-format-0x0.csv")
df_out.shape

(70, 10)

In [137]:
from datasets import Dataset, DatasetDict
df = pd.read_csv("/content/drive/MyDrive/repos/datasets🤗/orion-finetuning-general-0x0/cyber-orion-oai-chat-format-0x0.csv")
dataset_ds = Dataset.from_pandas(df)
dataset_ds

Dataset({
    features: ['Unnamed: 0', 'text', 'diagnosis', 'solutions', 'text_pr', 'diagnosis_pr', 'solutions_pr ', 'scenario_outcome', 'scores', 'labels', 'prompt'],
    num_rows: 70
})

In [168]:
# Convert pandas DataFrames to Hugging Face Datasets
#%pip install datasets
from datasets import Dataset, DatasetDict
df_out.shape
#dataset_ds = Dataset.from_pandas(df_out)
#ds_split['test'] = dataset_ds.train_test_split(test_size=0.2)
#ds_split['train'] = dataset_ds.train_test_split(train_size=0.8)
#ds_split

(70, 10)

In [None]:


'''
( cache_dir: Optional = Nonedataset_name: Optional = Noneconfig_name: Optional = Nonehash: Optional = Nonebase_path: Optional = Noneinfo: Optional = Nonefeatures: Optional = Nonetoken: Union =
Noneuse_auth_token = 'deprecated'repo_id: Optional = Nonedata_files: Union = Nonedata_dir: Optional = Nonestorage_options: Optional = Nonewriter_batch_size:
Optional = Nonename = 'deprecated'**config_kwargs )
'''

# Convert pandas DataFrames to Hugging Face Datasets
#dataset_train = dataset_ds[71::]
#dataset_test = dataset_ds(df_out[0:70])
#dataset_train

local_path='/content/drive/MyDrive/repos/datasets🤗/orion-finetuning-general-0x0/'
repo_path='cyber-orion-oai-chat-format-0x0'

#dataset_dict.save_to_disk(dataset_dict_path=local_path)

# Push the dataset to the Hugging Face Hub
dataset_dict.push_to_hub(commit_message="Initial", token=userdata.get("HF_FINE"),repo_id=repo_name)
'''
# Clone the repository to your local machine
repo = Repository(local_dir=local_path, git_email='core@synavate.tech', clone_from="/synavate/sentiment-cti-analysis-cyber", repo_type="dataset", token='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')

# Copy the dataset files to the repository directory
shutil.copytree(src='/content/drive/MyDrive/repos/datasets🤗/orion-finetuning-general-0x0', dst=f"/synavate/sentiment-cti-analysis-cyber", dirs_exist_ok=True)

# Commit and push the files to the Hugging Face Hub
repo.push_to_hub(commit_message="Initial commit of my dataset")
'''


In [130]:


# Load a sample dataset


# Display the structure of the dataset
print(dataset_ds)

# Split the dataset into train and test (80% train, 20% test)
ds_split['test'] = dataset_ds.train_test_split(test_size=0.2)
ds_split['train'] = dataset_ds.train_test_split(train_size=0.8)

# Further split the train split into train and validation (80% train, 20% validation)
#train_valid_split = train_test_split['train'].train_test_split(test_size=0.25)




Dataset({
    features: ['text', 'diagnosis', 'solutions', 'text_pr', 'diagnosis_pr', 'solutions_pr ', 'scenario_outcome', 'scores', 'labels', 'prompt'],
    num_rows: 70
})


In [131]:
ds_split

DatasetDict({
    train: DatasetDict({
        train: Dataset({
            features: ['text', 'diagnosis', 'solutions', 'text_pr', 'diagnosis_pr', 'solutions_pr ', 'scenario_outcome', 'scores', 'labels', 'prompt'],
            num_rows: 56
        })
        test: Dataset({
            features: ['text', 'diagnosis', 'solutions', 'text_pr', 'diagnosis_pr', 'solutions_pr ', 'scenario_outcome', 'scores', 'labels', 'prompt'],
            num_rows: 14
        })
    })
    test: DatasetDict({
        train: Dataset({
            features: ['text', 'diagnosis', 'solutions', 'text_pr', 'diagnosis_pr', 'solutions_pr ', 'scenario_outcome', 'scores', 'labels', 'prompt'],
            num_rows: 56
        })
        test: Dataset({
            features: ['text', 'diagnosis', 'solutions', 'text_pr', 'diagnosis_pr', 'solutions_pr ', 'scenario_outcome', 'scores', 'labels', 'prompt'],
            num_rows: 14
        })
    })
})

In [None]:
# Combine the splits into a DatasetDict
ds_final = DatasetDict({
    'train': ds_split['train'],
    'test': ds_split['test']
})
ds_final

In [126]:
local_path="/content/drive/MyDrive/repos/datasets🤗"
repo_name="cyber-cti-orion-openai-prompt-0x0"
ds_split.push_to_hub(repo_id=repo_name, commit_message="Initial commit of my dataset", token=userdata.get("HF_FINE"))

TypeError: Values in `DatasetDict` should be of type `Dataset` but got type '<class 'datasets.dataset_dict.DatasetDict'>'

In [62]:
local_path="/content/drive/MyDrive/repos/datasets🤗"
repo_name="cyber-cti-orion-openai-prompt-0x0"
# Clone the repository to your local machine
repo = Repository(local_dir=local_path, git_email="core@synavate.tech", repo_type="dataset", token=userdata.get("HF_FINE"))

# Copy the dataset files to the repository directory
#shutil.copytree(d="/synavate/sentiment-cti-analysis-cyber", dirs_exist_ok=True)

# Commit and push the files to the Hugging Face Hub
repo.push_to_hub(commit_message="Initial commit of my dataset")


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.


In [173]:
!git config --global credential.helper store

In [147]:
df_sentiment.head()

Unnamed: 0,id,text,diagnosis,solutions,text_pr,diagnosis_pr,solutions_pr,scenario_outcome,analysis,rating,scores,labels,output
0,238,Credential theft has been known to be a key pa...,The threat actor in the Shamoon 2 attacks is u...,1. Implementing multi-factor authentication (M...,"['credential', 'theft', 'known', 'key', 'part'...","['threat', 'actor', 'shamoon', '2', 'attacks',...","['implementing', 'authentication', 'mfa', 'pre...",Scenario: Credential theft has been known to b...,,,,,
1,204,"While still not seen in the wild, similar expl...",The entity (SOFTWARE) has two vulnerabilities ...,1. Patch Management: The entity can implement ...,"['still', 'seen', 'wild', 'similar', 'exploits...","['entity', 'software', 'two', 'vulnerabilities...","['patch', 'management', 'entity', 'implement',...","Scenario: While still not seen in the wild, si...",,,,,
2,230,We found that Tropic Trooper’s latest activiti...,"Diagnosis: Tropic Trooper, a known threat acto...",1. Implement strict USB usage policies and pro...,"['found', 'tropic', 'trooper', 'latest', 'acti...","['diagnosis', 'tropic', 'trooper', 'known', 't...","['implement', 'strict', 'usb', 'usage', 'polic...",Scenario: We found that Tropic Trooper’s lates...,,,,,
3,397,"Stage1.exe purports to be ransomware, as it ov...",The diagnosis is a coordinated ransomware atta...,1. Regularly back up critical data and store i...,"['purports', 'ransomware', 'overwrites', 'targ...","['diagnosis', 'coordinated', 'ransomware', 'at...","['regularly', 'back', 'critical', 'data', 'sto...",Scenario: Stage1.exe purports to be ransomware...,,,,,
4,23,BIOPASS RAT Loader Backdoor.Win64.BIOPASS.A ...,The diagnosis is that the entity flash.exe has...,1. Use antivirus software to detect and remove...,"['biopass', 'rat', 'loader', '3195c355aa564ea6...","['diagnosis', 'entity', 'infected', 'biopass',...","['use', 'antivirus', 'software', 'detect', 're...",Scenario: BIOPASS RAT Loader Backdoor.Win64.B...,,,,,


In [167]:
df_senthub = df_sentiment.copy()
#df_senthub.to_csv(f"{BASE_URL}/{project_name}/df_senthub.csv")
#df_senthub.push_to_hub("synavate/orion-cti-sentiment-analysis-x0x")

NameError: name 'df_sentiment' is not defined

In [126]:
unique = df_sentiment["output"].describe()
unique


count          380
unique           3
top       Negative
freq           201
Name: output, dtype: object

In [166]:
df_final_sentiment=df_sentiment.copy()
#dir = f"{BASE_URL}/{project_name}/"
#os.mkdir(dir)
#df_final_sentiment.to_csv(f"{dir}/df_final_sentiment.csv")
df_final_sentiment.shape

NameError: name 'df_sentiment' is not defined

In [47]:
text = df_sentiment["scenario_outcome"].apply(preprocess_text)
logger.info(f"### {text} processed ###")
scores = df_sentiment["scores"].apply(rank_scores)
logger.info(f"### {scores} processed ###")
#text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

UnboundLocalError: local variable 't' referenced before assignment

In [None]:

import uuid
#uuid for files
id = str(uuid.uuid4())[0:6]
dir = f"{BASE_URL}{project_name}/models/"
# Initialize W&B run for sentiment job
training_run = wandb.init(project=project_name, name=f"Sentiment_Analysis_{id}",
                 job_type="sentiment",dir=dir)


# Define model and tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
#####
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
#classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
# Define paths and names

project = project_name
model_id = f"synOrion-cyber-{id}"
model_name = model_name
model_path = dir

# Save the model
#check_point = classifier.save_pretrained(model_name)
#wandb.save(check_point)

# Initialize a new W&B run to store the model


# Create a new artifact and add the model file to it
artifact = wandb.Artifact(name=f'{model_name}', type="model")
training_run.save()


In [1]:
df_train

NameError: name 'df_train' is not defined

In [None]:
# Function to predict sentiment
import scipy
import numpy as np
from transformers import pipeline


logger.info(device)
pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
# Add DF for sentimenet analysis
def predict_sentiment(text=None, model=model, classifier=pipeline):
  classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
  model.to(device)
  if text is not None:
      labels = ["Negative", "Neutral", "Positive"]
      from scipy.special import softmax
      encoded_input = tokenizer(text, return_tensors='pt', truncation=True).to(device)
      # Run the model
      #with torch.no_grad(
      output = model(**encoded_input)
      # Extract the sentiment scores
      scores = output[0][0].detach().numpy()
      scores = softmax(scores)
      # Truncate the text to the maximum length the model can handle
      result = classifier(scores)
      #result = output(scores)
      ranking = np.argsort(result)
      ranking = ranking[::-1]
      for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")
        labels = text["labels"][i] = l
        score = ["score"][i] = s
        text["labels"][i] = l
        text["scores"][i] = s

      return text.head()

## Apply to copied DF
sentiment = df_sentiment['scenario_outcome'].progress_apply(predict_sentiment)
#score = df_sentiment["scenario_outcome"].progress_apply(predict_sentiment)
#df_sentiment['sentiment'] = sentiment[0]
#df_sentiment['score'] = sentiment[1]


In [164]:
  #logger.error(e, exc_info=True)
  #artifact = wandb.Artifact(name="chat structure", type="dataset")
  #pre_data.add(df_chat, "df_scenario_outcome")
  #pre_data.log_artifact(artifact)
  #wandb.run.save()
  df_chat.shape

(70, 10)

In [None]:
# @title Model Registry { run: "auto" }
#huggingface_hub.login(token=HF_HUB, add_to_git_credential=True, write_permission=True)
# Start a new W&B run
run_name = f"{project}-save_model"

def check_run(run_name):
  if wandb.run is None:
    wandb.init(project=project, job_type="model", name=run_name)
  else:
    wandb.run.finish()
    wandb.init(project=project, job_type="model", name=run_name)

In [83]:
#%pip install -qqq wandb
#%pip install -qqq matplotlib
import matplotlib as plt
import wandb
run = wandb.init(project=project_name, job_type="model", name="Model-Reg")
data_table = wandb.Table(dataframe=df_complete)
run.log({"data_table": data_table})

table_plot = run.plot_table(data_table=data_table,fields=["scenario_outcome","solutions"], vega_spec_name={project_name})
run.save()
plt.plot(table_plot)
run.log({f"table_pot": f"{wandb.Graph(table_plot)}"})
wandb.save()

VBox(children=(Label(value='2.253 MB of 2.264 MB uploaded\r'), FloatProgress(value=0.9951488185616866, max=1.0…

AttributeError: module 'matplotlib' has no attribute 'plot'

# Final Processing Before Fine Tuning
GPU requires the data & model to be on the GPU (or at least the same device if not GPU) REF: Mac torch.backends.mps.available() rather than cuda

In [None]:
### ENCODE CATEGORICAL VARIABLES
def preprocess_dataframe(dataframe):
    """
    Preprocess the DataFrame to ensure all data is numerical.
    Convert non-numeric columns to numerical data or drop them.
    """
    for x in dataframe.columns:
        if type(dataframe[x]) == list:
            dataframe[x] = dataframe[x].astype('category')
            dataframe[x] = dataframe[x].cat.codes
    # Convert categorical columns to numerical data
    df_encoded= pd.get_dummies(dataframe)
    logger.info("Converted categorical columns to numerical")

    # Ensure all data is float32
    df_encoded = df_encoded.astype(np.float32)
    logger.info("Converted all data to float32")

    return df_encoded


In [None]:
# @title Evals
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Example ground truth labels and model predictions
y_true = y_true
y_pred = y_pred  # Predicted labels

# Calculate Precision
precision = precision_score(y_true, y_pred)
#print(f'Precision: {precision:.2f}')

# Calculate Recall
recall = recall_score(y_true, y_pred)
#print(f'Recall: {recall:.2f}')

# Calculate F1 Score
f1 = f1_score(y_true, y_pred)


# Calculate AUC-ROC
auc_roc = roc_auc_score(y_true, y_pred)


# Calculate Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)


Precision: 0.75
Recall: 0.60
F1 Score: 0.67
AUC-ROC: 0.70
Confusion Matrix:
[[4 1]
 [2 3]]


In [163]:
#df_export=pd.read_csv(f"/content/drive/MyDrive/repos/datasets🤗/orion-finetuning-general-0x0/orion-dataset-sentiment-analysis-oai-prompt.csv")
#df_export.shape
#os.makedirs("/content/drive/MyDrive/repos/datasets🤗/sentiment/")
df_export.to_csv("/content/drive/MyDrive/repos/datasets🤗/orion-dataset-sentiment-analysis-oai-prompt-0x0.csv")
#df_export.head()
df_from_pandas = pd.read_csv("/content/drive/MyDrive/repos/datasets🤗/orion-dataset-sentiment-analysis-oai-prompt-0x0.csv")
df_from_pandas.shape

(70, 13)

In [153]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/synavate/sentiment-cti-analysis-cyber/" + splits["train"])
df_chat = df_train.copy()
df_chat.shape

(70, 10)

In [152]:
df_

(70, 12)

In [91]:

# Define the row_to_chat function
def row_to_chat(row, index=1):
    return {
        "messages": [
            {
                "role": "system",
                "content": "You are a senior intelligence officer. Assess the {scenario!!$$!!outcome!!$$!!solutions!!$$!!sentiment label} which determines the type of attack {solutions} provided. Then you will rate the solutions {0: low 1: medium 2: high effectiveness}. Your answers will be in JSONl format."
            },
            {
                "role": "user",
                "content": row["scenario_outcome"] + " !!$$!! " + row["solutions"] + " !!$$!! " + row["labels"]
            },
            {
                "role": "assistant",
                "content": "My Analysis based on the cyber data given is: {}"
            }
        ]
    }

# Apply the function to each row and create a new column 'Analysis'


In [145]:
df_oai = df_chat.copy()
df_oai.drop(columns=["Unnamed: 0"], axis=1)
df_oai['prompt'] = df_oai.apply(row_to_chat, axis=1)
df_oai.to_csv("/content/drive/MyDrive/repos/datasets🤗/cti-dataset-noanalysis/output/orion-cti-cyber-data-openai-prompt-0x0.csv")


In [151]:
ds_oai = Dataset.from_pandas(df_oai)
print(ds_oai)
ds_train, ds_test = ds_oai.train_test_split(train_size=0.8, test_size=0.2)


#df_oai.push_to_hub("synavate/orion-cti-cyber-data-openai-prompt-0x0")

Dataset({
    features: ['Unnamed: 0', 'text', 'diagnosis', 'solutions', 'text_pr', 'diagnosis_pr', 'solutions_pr ', 'scenario_outcome', 'scores', 'labels', 'prompt'],
    num_rows: 70
})


In [87]:

df_oai["prompt"] = df_oai["scenario_outcome"].progress_apply(row_to_chat)
df_oai.head()

AttributeError: 'DataFrame' object has no attribute 'progress_apply'

In [None]:
df_openai = df_chat.copy()
for x in df_openai:
  df_openai.progress_apply(row_to_chat)
  print(f"### {df_chat[x]} processed ###")
df_chat.head()

## Encoding for Fine Tuning

In [None]:
### ENCODE NOW
df_encoded = df_pp.copy()

df_encoded["enc_solutions"] = df_encoded["solutions_pr"].progress_apply(preprocess_text)
df_encoded.head()

In [None]:
"""
Utility function to convert a dataframe to a PyTorch tensor.
- More important with large datasets to be on the GPU
"""
# Set device to GPU if available, otherwise CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
logger.info(f"Data is on: {device}")


def to_numpy(dataframe):
    """
    Convert a DataFrame to a numpy array with float32 dtype.
    """
    try:
        df_numpy = dataframe.to_numpy(dtype=np.float32)
        logger.info("Data is numpy array")
        logger.info(f"Shape of numpy array: {df_numpy.shape}")
        return df_numpy
    except Exception as e:
        logger.error(f'Error converting DataFrame to numpy array: {e}', exc_info=True)

def df_to_tensor(df_numpy, device=device):
    """
    Convert a numpy array to a PyTorch tensor and move it to the specified device.
    """
    try:
        df_tensors = torch.tensor(df_numpy, dtype=torch.float32)
        logger.info("Data is PyTorch tensor with dtype torch.float32")
        df_tensors = df_tensors.to(device)
        logger.info(f"Data is on {device}")
        return df_tensors
    except Exception as e:
        logger.error(f'Error converting numpy array to PyTorch tensor: {e}', exc_info=True)


In [None]:
# Convert DataFrame to numpy array
df_numpy = to_numpy(df_scenario_outcome)
logger.debug(df_numpy)

# Convert numpy array to PyTorch tensor
df_tensor = df_to_tensor(df_numpy)
logger.debug(df_tensor)

# Logging the shape of the tensor
logger.info(f"Shape of tensor: {df_tensor.shape}")

# Verify the type of df_tensor
logger.info(f"Type of df_tensor: {type(df_tensor)}")


In [None]:
'''Sentiment Analysis:
Added in some meta data to match the Scenario as outlined in the initial text column mapping it
to a scenario, and outcome. Then asking for the sentiment of the solutions'''

import wandb
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoConfig

# Install necessary packages
!pip install -qqq transformers torch accelerate

# Initialize W&B run for sentiment job
training_run = wandb.init(project=project, name=f"Sentiment_Analysis_{id}",
                 job_type="sentiment",dir="/content/drive/MyDrive/models_datasets/models")


# Define model and tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
#####

model = AutoModelForSequenceClassification.from_pretrained(MODEL, output_hidden_states=True)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
wandb.log({"object": classifier})
# Define paths and names
project = project
model_id = "sentiment_model"
model_name = f"{project}-cyber{model_id}"
model_path = f"./content/drive/MyDrive/models_datasets/models/"

# Save the model
check_point = classifier.save_pretrained(model_name)
wandb.save(check_point)

# Initialize a new W&B run to store the model


# Create a new artifact and add the model file to it
artifact = wandb.Artifact(name=f'{model_name}', type="model")
training_run.save()


In [None]:
def metrics_log(save_model: bool = False):
      wandb.run.name.log({"acc": acc})

    # Model File
    if os.path.exists(f"{BASE_URL}/{project}/models/") == False:
      os.mkdir(f"{BASE_URL}/{project}/models/")
      run.link_model(path=f"{BASE_URL}/{project}/models/{model_name}.h5", registered_model_name=model_name)
      run.save()
      with open(f"{BASE_URL}/{project}/models/{model_name}.h5", "w") as f:
        f.write("Model: " + str(random.random()))
      run.finish()

In [None]:
import matplotlib.pyplot as plt

df_sentiment = df_scenario_outcome.copy()
df_sentiment.drop(["text", "diagnosis", "solutions"], axis=1)
print(df_sentiment.shape)

#Make a data table from a dataframe
data_table = wandb.Table(dataframe=df_sentiment)
table_plot = wandb.plot_table(data_table=data_table,fields=["text","diagnosis","solutions"], vega_spec_name={project})
wandb.run.save()
plt.show(table_plot)
run.log({f"table_pot": f"{wandb.Graph(table_plot)}"})

In [None]:
# prompt: Visualize in Weights and biases form everything until here

# Assuming 'run' is your active wandb run object

# Log summary metrics
wandb.run.summary["num_samples"] = len(df_scenario_outcome)
wandb.run.summary["num_unique_diagnoses"] = df_scenario_outcome["diagnosis"].nunique()
# Add other relevant summary stats

# Log the preprocessed dataframe as a table
preprocessed_table = wandb.Table(dataframe=df_pp,
                                 columns=["scenario_outcome", "solutions_pr"])
wandb.run.log({"preprocessed_data": preprocessed_table})

# Log the sentiment analysis results (assuming you have a df_sentiment with results)
sentiment_table = wandb.Table(dataframe=df_sentiment,
                               columns=["scenario_outcome", "solutions_pr", "sentiment"])
wandb.run.log({"sentiment_analysis_results": sentiment_table})

# Log the table plot you created earlier
wandb.run.log({"data_visualization": wandb.Image(table_plot)})

# Optionally log histograms of text lengths, word frequencies, etc.
# ...

# Finish the run
wandb.run.save()


In [None]:
run.link_model(
    path=model_path,
    registered_model_name=f"{model_name}",
    name="4o-mini-cyber",
    aliases=["gpt4omini, finetune"],
)

### NER for enriching the data more ###

In [None]:

artifact = wandb.Artifact(name=f'{model_name}', type="model")
artifact.add_file(local_path="./content/drive/MyDrive/models_datasets/models/", name=f'{local-path}-NER')
ner_model = 'dslim/bert-base-NER'
ner_tokenizer = AutoTokenizer.from_pretrained(ner_model)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_model)

# Log the artifact to W&B
run.log_artifact(artifact)
huggingface_hub.save_pretrained_torch(model, model_name)
# Finish the W&B run
run.save()




In [None]:
import matplotlib.pyplot as plt

df_sentiment = df_scenario_outcome.copy()
df_sentiment.drop(["text", "diagnosis", "solutions"], axis=1)
print(df_sentiment.shape)

#Make a data table from a dataframe
data_table = wandb.Table(dataframe=df_sentiment)
table_plot = wandb.plot_table(data_table=data_table,fields=["text","diagnosis","solutions"], vega_spec_name={project})
wandb.run.save()
plt.show(table_plot)
run.log({f"table_pot": f"{wandb.Graph(table_plot)}"})

In [None]:
### Text-Diagnosis Concatenation & HotEncoder Target
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# FIX THIS
df_encoded = df_tokenized.copy()
df_encoded['text_diagnosis'] = df_text_diagnosis['text_processed'] + df_text_diagnosis['diagnosis_processed']

df_encoded.head()

# Simple Word to Vec Model

In [None]:
# Train a Word2Vec model (example)
df_2vec =
apply_word2Vec[column for column in columns]
model = Word2Vec(df_domains['text_processed'], min_count=1)

In [None]:
# Apply the function to each column
df_w2v = df_domains.copy()
df_w2v['text_processed'] = df_domains['text'].apply(preprocess_text)
df_w2v['diagnosis_processed'] = df_domains['diagnosis'].apply(preprocess_text)
df_w2v['solutions_processed'] = df_domains['solutions'].apply(preprocess_text)

# OpenAI Embeddings with Gpt4o-Mini finely tuned and using the small OAI # Embeddings

In [None]:
client = OpenAI()


In [None]:
df_analysis = df_gpt.copy()
x = 0
y = random.randint(0,400)
row_numbers = []
while x < 40:
  row_numbers.append(y)
  x += 1
  y += random.randint(0,100)
print(row_numbers)
f"{row_numbers}. AND {len(row_numbers)}"
#df_analysis.drop(rows=row_numbers)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
file_path="/content/drive/MyDrive/repos/datasets🤗/cti-dataset-noanalysis/output/dataset_name-3ffdb458--x00x.csv"
df_gpt = pd.read_csv(file_path)
df_gpt.head()

# Humanloop for prompt experiments

In [None]:
#df_analysis = df_gpt.copy()


'''for x in range(6000):
    row_to_drop = random.randint(0, len(df_analysis) - 1)
    df_analysis.drop(index=row_to_drop, inplace=True)
    print(f"### {x} ### Row dropped: {row_to_drop}")'''


df_completions= df_analysis.copy()
df_completions.drop(columns=['Unnamed: 0'])
df_completions["id"] = ""
df_completions=df_completions[0:50]
df_complete=df_completions.copy()
#df_complete = df_completions["id"][0:50]
df_complete.head()
#df_complete.shape
df_complete["id"] = range(0,len(df_complete["id"]))
df_complete.shape

NameError: name 'df_analysis' is not defined

In [None]:
%pip install --upgrade humanloop
import os
from humanloop import Humanloop

HUMANLOOP_API_KEY = userdata.get("HUMANLOOP_API_KEY")
df_analys

humanloop = Humanloop(
    api_key=HUMANLOOP_API_KEY,
)

response = humanloop.chat_deployed(
    project="cti-2009-data",
    inputs={},
    messages=[{ "role": "user", "content": "Tell a joke" }],
    provider_api_keys={
        "openai": "OPENAI_KEY_HERE"
    }
)

print(response.data[0].output)


In [224]:
df_smol.columns
df_smol.drop(columns=["Unnamed: 0"], inplace=True)

In [253]:
!pip install --upgrade openai
!openai migrate

Retrieving Grit CLI metadata from https://api.keygen.sh/v1/accounts/custodian-dev/artifacts/marzano-linux-x64
Fetching release URL from: https://api.keygen.sh//v1/accounts/custodian-dev/artifacts/marzano-linux-x64
Fetching release URL from: https://api.keygen.sh//v1/accounts/custodian-dev/artifacts/gouda-linux-x64
Fetching release URL from: https://api.keygen.sh//v1/accounts/custodian-dev/artifacts/workflow_runner-linux-x64
Fetching release URL from: https://api.keygen.sh//v1/accounts/custodian-dev/artifacts/cli-linux-x64
Fetching release URL from: https://api.keygen.sh//v1/accounts/custodian-dev/artifacts/timekeeper-linux-x64
ERROR (code: 200) - Too many params for text: expected maximum 1


In [256]:

import random
import pandas as pd
import json
# from wandb.integration.openai import WandbLogger
from google.colab import userdata

# Set your OpenAI API key

from openai import OpenAI

client = OpenAI(
  api_key=userdata.get('OPENAI_API_KEY')

)

def analyze_scenario(row):
    scenario = row["scenario_outcome"]
    solution = row["solutions"]
    scores = row["scores"]
    labels = row["labels"]
    user_content = f"Scenario: {scenario} !!$!! Solution: {solution} !!$!! Sentiment Score: {scores}:{labels}"

    messages = [
        {"role": "system", "content": """You are a Senior Cyber threat analyst. You will be given scenario examples in the form Scenario: {} !!$$!! Solutions: {}.
        Prepare your analysis and advise 0 bad solution and 1 good solution. Think step by step and explain your analysis. Output in JSON and a Python Dictionary.
        Example: {{
              "id": int,
              "analysis": str,
              "rating": 0, 1 or 2
              "next_steps": list
        }}
        """},
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": "Here is my concise Analysis formatted as a JSON with Metadata: {}"}
    ]

    try:
        response = client.completions.create(
            model="gpt-4o-mini",
            prompt=messages,
            max_tokens=300,
            temperature=0.1,
            top_p=1,
            frequency_penalty=0.3,
            presence_penalty=0
        )


        # Extract JSON from the assistant's response
        try:
            analysis = response.choices[0].text

            return analysis
        except json.JSONDecodeError:
            print("Error: Invalid JSON format in the assistant's response.")
            return None

    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return None



# Use WandbLogger for logging
#wandb_logger = WandbLogger()



#wandb_logger.log({"analysis": df_analyse.to_dict()})

#print(df_analyse)




AttributeError: 'DataFrame' object has no attribute 'push_to_hub'

In [257]:
# Assuming df_openai is already defined and loaded
df_backup = df_openai.copy()
df_smol = df_openai.head(50).copy()
df_smol.drop(columns=["Unnamed: 0"], inplace=True)
df_smol["analysis"] = ""

# Apply the function and store the results
df_smol["analysis"] = df_smol.apply(analyze_scenario, axis=1)

# Check the results
df_smol.head()

Error during OpenAI API call: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error during OpenAI API call: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Error during OpenAI API call: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 

KeyboardInterrupt: 

In [246]:
df_smol.head()

Unnamed: 0,text,diagnosis,solutions,text_pr,diagnosis_pr,solutions_pr,scenario_outcome,scores,labels,analysis
0,A cybersquatting domain save-russia[.]today is...,The diagnosis is a cyber attack that involves ...,1. Implementing DNS filtering to block access ...,"['cybersquatting', 'domain', 'today', 'launchi...","['diagnosis', 'cyber', 'attack', 'involves', '...","['implementing', 'dns', 'filtering', 'block', ...",Scenario: A cybersquatting domain save-russia[...,0.823533,Positive,
1,"Like the Android Maikspy, it first sends a not...",The diagnosis is that the entity identified as...,1. Implementing a robust anti-malware software...,"['like', 'android', 'maikspy', 'first', 'sends...","['diagnosis', 'entity', 'identified', 'softwar...","['implementing', 'robust', 'software', 'detect...","Scenario: Like the Android Maikspy, it first s...",0.790541,Positive,
2,While analyzing the technical details of this ...,Diagnosis: APT37/Reaper/Group 123 is responsib...,1. Implementing advanced threat detection tech...,"['analyzing', 'technical', 'details', 'malware...","['diagnosis', '123', 'responsible', 'malware',...","['implementing', 'advanced', 'threat', 'detect...",Scenario: While analyzing the technical detail...,0.535071,Neutral,
3,(Note that Flash has been declared end-of-life...,The diagnosis is a malware infection. The enti...,1. Implementing a robust antivirus software th...,"['note', 'flash', 'declared', 'adobe', 'since'...","['diagnosis', 'malware', 'infection', 'entity'...","['implementing', 'robust', 'antivirus', 'softw...",Scenario: (Note that Flash has been declared e...,0.59512,Neutral,
4,Figure 21. Connection of Maikspy variants to 1...,The diagnosis is that Maikspy malware variants...,1. Implementing a robust firewall system that ...,"['figure', 'connection', 'maikspy', 'variants'...","['diagnosis', 'maikspy', 'malware', 'variants'...","['implementing', 'robust', 'firewall', 'system...",Scenario: Figure 21. Connection of Maikspy var...,0.577623,Neutral,


In [276]:
### OpenAI VERSION ###
#df_to_ds = pd.read_csv("/content/drive/MyDrive/repos/datasets🤗/sentiment/full-sentiment-analysis-orion-cti-cyber-data-0x0.csv")
#df_to_ds.drop(columns=["Unnamed: 0.1"], inplace=True)
#df_openai = df_to_ds.copy()
df_openai.shape

#df_openai["Analysis"]


#### TO DATASET
#df_backup.to_csv("/content/drive/MyDrive/repos/datasets🤗/sentiment/openai-prompts-sentiment-analysis-orion-cti-cyber-data-V2-0x0.csv")
df_backup.head()
#df_llama = df_backup.drop(columns=["Unnamed: 0", "text", "diagnosis", "solutions", "text_pr", "diagnosis_pr"], axis=1)
#os.mkdir("/content/drive/MyDrive/repos/datasets🤗/llama_analysis")
#df_llama.to_csv("/content/drive/MyDrive/repos/datasets🤗/llama_analysis/base-llama3.1-orion-cti-cyber-data-V2-0x0.csv")df_llama = df_llama.to_csv(path="/content/drive/MyDrive/repos/datasets🤗/smol-llama_analysis/base-llama3.1-orion-mindata-cyber-data-V2-0x0.csvcolumns=["scenario_outcome","scores", "labels"], inplace=True)
#df_llama.to_csv("/content/drive/MyDrive/repos/datasets🤗/llama_analysis/smol-llama3.1-orion-mindata-cyber-data-V2-0x0.csv")
ds = Dataset.from_pandas(df_llama)
ds_train=ds.train_test_split(train_size=0.8)
llama_dataset = ds_train.copy()
DataDict = DatasetDict(llama_dataset)
DataDict.push_to_hub("smol-llama3.1-orion-mindata-cyber-data-V2-0x0", token=userdata.get("HF_FINE"))


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/synavate/smol-llama3.1-orion-mindata-cyber-data-V2-0x0/commit/96143d9a3a64fc279c7899a28ec9963f97ec53a1', commit_message='Upload dataset', commit_description='', oid='96143d9a3a64fc279c7899a28ec9963f97ec53a1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
#
Analyse scenarious

In [None]:
df_int = df_filter.copy()
df_int["id"] = df_int.apply(lambda x: range(len((df_int["id"][x].astype(int)))), axis=1)
df_int.head()


In [None]:
'''df_filter = df_gpt.copy()
df_response = df_filter.copy()
df_gpt_filter = df_filter["id"].iloc[row_numbers].copy()
df_gpt_filter.head()

In [None]:
df_gpt["analysis"] = ""
df_gpt.head()
df_backup = df_gpt.copy()

In [None]:
df_gpt["analysis"] = ""
new_column_order = ["id", "text", "diagnosis", "solutions", "text_pr", "diagnosis_pr", "solutions_pr"]

# Create the new DataFrame with the specified column order
df_new = df_old[new_column_order]

# Print the new DataFrame
print("\nNew DataFrame with specified column order:")
print(df_new)
df_nlp.head()


In [None]:
# Example: Creating a new 'id' column based on existing 'text' column

df_data["id"] = df_data.apply(lambda x: int(range(len(df_data["scenario_outcome"]))), axis=1)
df_data["id"].head()

In [None]:
embedded_data_oai = df_encoded.copy()
embedded_data_oai['text_embedding'] = embedded_data_oai[''].progress_apply(get_openai_embedding)

In [None]:
%pip install seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# Start a new run for visualizations
viz_run = wandb.init(project="morpheus_cyber_gpt-4o-mini", job_type="visualization")

# --- Distribution of Sentiment ---
plt.figure(figsize=(8, 6))
sns.countplot(data=df_sentiment, x='sentiment')
plt.title('Distribution of Sentiment')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
# Log the plot to W&B
viz_run.log({"sentiment_distribution": wandb.Image(plt)})
plt.show()

# --- Word Cloud of Text ---
from wordcloud import WordCloud
text_corpus = ' '.join(df_scenario_outcome['text_pr'].astype(str).tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_corpus)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Text Data')
# Log the plot to W&B
viz_run.log({"text_wordcloud": wandb.Image(plt)})
plt.show()

# --- Word Cloud of Solutions ---
solutions_corpus = ' '.join(df_scenario_outcome['solutions_pr '].astype(str).tolist())
wordcloud_solutions = WordCloud(width=800, height=400, background_color='white').generate(solutions_corpus)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_solutions, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Solutions Data')
# Log the plot to W&B
viz_run.log({"solutions_wordcloud": wandb.Image(plt)})
plt.show()

# --- Length Distribution of Text ---
plt.figure(figsize=(8, 6))
sns.histplot(df_scenario_outcome['text_pr'].str.len(), bins=30)
plt.title('Distribution of Text Length')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
# Log the plot to W&B
viz_run.log({"text_length_distribution": wandb.Image(plt)})
plt.show()

# --- Correlation Heatmap (if applicable) ---
# If you have numerical features, you can create a correlation heatmap
# Example:
# numeric_features = df_scenario_outcome[['column1', 'column2']]  # Replace with actual numerical columns
# correlation_matrix = numeric_features.corr()
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# plt.title('Correlation Heatmap')
# viz_run.log({"correlation_heatmap": wandb.Image(plt)})
# plt.show()

# --- Create an artifact and save the visualizations ---
artifact = wandb.Artifact(name="pre_finetuning_visualizations", type="visualizations")
# Add any files you want to include in the artifact (e.g., images, data files)
# artifact.add_file("path/to/your/file.png")

# Log the artifact to W&B
viz_run.log_artifact(artifact)

# Finish the visualization run
viz_run.finish()


# Preparing to Train
1. Isolate important columns.


In [None]:
!pip install openai
import openai
openai.api_key = userdata.get("OPENAI_API_KEY")
from wandb.integration.openai.fine_tuning import WandbLogger
data=f"{BASE_URL}/{project}/{model_name}.jsonl"

from openai import OpenAI
client = OpenAI()

client.files.create(
  file=open("mydata.jsonl", "rb"),
  purpose="fine-tune"
)

# Finetuning logic
id = uuid.uuidv4()
if FINETUNE_JOB_ID == True:
  WandbLogger.sync(project=fine_tune_job_id=FINETUNE_JOB_ID)


WandbLogger.sync(entity="orion-agents-org")

# Fine Tuning Using Different Approaches
1. Open AI gpt-4o mini with small embeddings from OAI
2. Open AI gpt-4o mini with Word2Vec

1. Word2Vec model with Sentence Transformers

** After we have the models we will train them

In [None]:
### CHOOSE MODEL TO FINE TUNE
model = "gpt4o-mini"
###

prompt = f"{model}_prompt"

model_prompt = """
"messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions. 0 is bad, 1 is OK, 2, is GOOD"},
            {"role": "user", "content": {"scenario_outcome": "A ransomware attack encrypted critical files. Diagnosis: The attack vector was a phishing email. Solutions: 1. Isolate infected systems, 2. Pay the ransom, 3. Restore from backups."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Not advisable, 3. Good."}
"""



"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

messages: {
### {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."}
{}}
{}
### {"role": "user", "content": {"scenario_outcome": "A ransomware attack encrypted critical files. Diagnosis: The attack vector was a phishing email. Solutions: 1. Isolate infected systems, 2. Pay the ransom, 3. Restore from backups."}},
{}
###  {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Not advisable, 3. Good."}
{}}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("swaption2009/cyber-threat-intelligence-custom-data", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [None]:
# prompt: train test split from sklearn 0.1 size , random_stat42

from sklearn.model_selection import train_test_split
train, test = train_test_split(df_train, test_size=0.1, random_state=42)


In [None]:
### Need to concat the features.
#System Messages : 1 Assistant
```json
[
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "A ransomware attack encrypted critical files. Diagnosis: The attack vector was a phishing email. Solutions: 1. Isolate infected systems, 2. Pay the ransom, 3. Restore from backups."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Not advisable, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "Data breach exposing customer information. Diagnosis: Misconfigured cloud storage. Solutions: 1. Notify affected customers, 2. Implement stricter access controls, 3. Ignore the breach."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Bad."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "Unauthorized access to internal network. Diagnosis: Weak password policy. Solutions: 1. Change all passwords, 2. Implement MFA, 3. Monitor network traffic."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "DDoS attack causing service disruption. Diagnosis: Insufficient network defenses. Solutions: 1. Increase bandwidth, 2. Implement rate limiting, 3. Deploy DDoS protection service."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Partial, 2. Good, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "Malware infection on multiple devices. Diagnosis: Lack of antivirus software. Solutions: 1. Install antivirus software, 2. Perform a full system scan, 3. Disconnect infected devices from the network."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "Phishing attack leading to credential theft. Diagnosis: Lack of user training. Solutions: 1. Conduct phishing awareness training, 2. Change compromised credentials, 3. Implement email filtering."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "SQL injection attack compromising database. Diagnosis: Lack of input validation. Solutions: 1. Implement input validation, 2. Use parameterized queries, 3. Perform regular security audits."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "Unauthorized access to sensitive data. Diagnosis: Inadequate access controls. Solutions: 1. Restrict access to sensitive data, 2. Implement role-based access control, 3. Regularly review access logs."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "Insider threat leaking confidential information. Diagnosis: Lack of monitoring. Solutions: 1. Implement user activity monitoring, 2. Conduct background checks, 3. Establish a whistleblower policy."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Good."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "For each scenario, there is a scenario with a diagnosis and solutions. Decide if they are good solutions."},
            {"role": "user", "content": {"scenario_outcome": "Zero-day exploit used in an attack. Diagnosis: Outdated software. Solutions: 1. Apply patches promptly, 2. Use intrusion detection systems, 3. Maintain an incident response plan."}},
            {"role": "assistant", "content": "By my assessment, the solutions were: 1. Good, 2. Good, 3. Good."}
        ]
    }
]
```


In [None]:
'''System Messages : Multiple Assistants
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris", "weight": 0}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already.", "weight": 1}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "William Shakespeare", "weight": 0}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?", "weight": 1}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "384,400 kilometers", "weight": 0}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters.", "weight": 1}]}

In [None]:
import json

#Define Template
system_messages = [{sys}]
prefix = {'messages':{"role:"system", "content": "Here are a variety of solutions to cyber problems. Analyze and give a binary 0 for no and 1 for yes.'}}}
postfix =

with open("./text_sql.json", "w") as f:
    json.dump(template, f)

In [None]:
# DF to JSON Serialized
df_to_json = df_domain.to_json('./text_sql.json', orient='records')

In [None]:
run = wandb.init(project="Cyber-Phi-Small-8k-instruct", job_type="dataset")
artifact = wandb.Artifact(name="df_to_json", type="dataset")
run.log_artifact(artifact)

run.finish()



---



## Model Training
1. Tokenize with TikToken
2. @ 4bit for improved speed traded off for lower precision calculation on weights.

In [None]:
run2 = wandb.init(project="Cyber-Phi-Small-8k-instruct", job_type="train")
run3 = wandb.init(project="gpt4o-mini", job_type="train")

In [None]:
# Load model directly from HuggingFace
%pip install -qq tiktoken einops
%pip install  -q torch==2.2.2+cu121 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 torchtext==0.15.2 torchdata==0.6.1 --extra-index-url https://download.pytorch.org/whl/cu121 -U
_logger.info(device)
from unsloth import FastLanguageModel
import torch
import tiktoken
import einops
from transformers import AutoModelForCausalLM

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-small-8k-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    #token = os.getenv("WANDB_API_KEY"), # use one if using gated models like meta-llama/Llama-2-7b-hf
)

### Extract Entities for Graph
1. Make new dataFrame wth text an relations broken down, then labelled with Node or Relationship.


In [None]:
import numpy as np
df_Graph = df_train.copy()
def graph_df(text):
  columns = text.unique()
  graph_df=pd.DataFrame(columns=columns)
  return graph_df
print(df_Graph.head())
graph_df = graph_df(df_Graph['entities'])
graph_df.shape
new_df = pd.DataFrame(columns=graph_df[0::])
new_df.head()
for k, v in enumerate(new_df.index):
  print(f'k is {k} and v is {v}')
  print(v)
  #new_df[f"{v}"] = df_Graph['entities'][k].split(',')
  #print(new_df[f"{v}]"])
  #print(graph_df.head())

#print(graph_df.describe)
#print(graph_df.head())
#print(np.array_split(values=graph_df,)

In [None]:
def load_and_log():

    # 🚀 start a run, with a type to label it and a project it can call home
    with wandb.init(project="artifacts-data-models", job_type="load-data") as run:

        datasets = load()  # separate code for loading the datasets
        names = ["training", "validation", "test"]

        # 🏺 create our Artifact
        raw_data = wandb.Artifact(
            "cyber-phi", type="dataset",
            description="Cyber-Phi",
            metadata={"source": "torchvision.datasets.MNIST",
                      "sizes": [len(dataset) for dataset in datasets]})

        for name, data in zip(names, datasets):
            # 🐣 Store a new file in the artifact, and write something into its contents.
            with raw_data.new_file(name + ".pt", mode="wb") as file:
                x, y = data.tensors
                torch.save((x, y), file)

        # ✍️ Save the artifact to W&B.
        run.log_artifact(raw_data)

load_and_log()



---
###APPENDIX A

### 🤗Fine-Tuning Techniques: 🤗

**PEFT** (Parameter-Efficient Fine-Tuning): Fine-tunes pre-trained models by adjusting only a small subset of parameters, reducing computational costs.

**LoRA** (Low-Rank Adaptation): Enhances transformer models by injecting and training low-rank matrices within each layer, minimizing the number of trainable parameters.

**QLoRA** (Quantized Low-Rank Adaptation): Combines low-rank adaptation with weight quantization to achieve efficient fine-tuning with reduced memory and computational requirements.

**Full Fine-Tuning:** Updates all parameters of the pre-trained model, offering high flexibility at the cost of increased computational resources.

**Distillation:** Trains a smaller model to mimic the behavior of a larger pre-trained model, optimizing efficiency while maintaining performance.