# Pipeline 2
+ Knowledge base
+ Vector database
+ RAG

In [None]:
# Mount Google Drive at /content/drive; the later cells read the project files
# from paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
# Load the pipeline configuration (paths, Weaviate and OpenAI credentials are
# read with os.getenv / os.environ further down) from the project's .env file.
# override=True makes values from the file win over variables already set.
from dotenv import load_dotenv
load_dotenv("/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/.env", override=True)

True

## Knowledge base
This component loads the knowledge base: it parses the RDF (Turtle) file and exports its triples to a JSON file.

In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/0.knowledge_base
!pip install -r requirements.txt

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/0.knowledge_base


In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files


In [None]:
import rdflib
import json
import os

def knowledge_base():
    """Parse the RDF knowledge base and export its triples as JSON.

    Reads the Turtle file named by the ``input_path`` environment variable,
    converts every triple to a ``(subject, predicate, object)`` tuple of
    strings, prints how many were extracted and writes the list to the file
    named by the ``output_path`` environment variable (UTF-8, indented JSON).

    Raises:
        ValueError: if ``input_path`` or ``output_path`` is not set.
    """
    def parse_rdf(file_path):
        """Return every triple of the Turtle file at ``file_path`` as string tuples."""
        g = rdflib.Graph()
        try:
            g.parse(file_path, format='ttl')
        except rdflib.exceptions.ParserError as e:
            # Best effort: report the problem and export whatever was parsed.
            # NOTE(review): the Turtle parser may raise a different exception
            # type for syntax errors - confirm this handler actually fires.
            print(f"Errore di parsing: {e}")
        return [(str(s), str(p), str(o)) for s, p, o in g]

    # Validate the configuration up front so a missing variable produces a
    # clear message instead of an obscure failure after the (slow) parse.
    input_path = os.getenv("input_path")
    output_path = os.getenv("output_path")
    missing = [
        name
        for name, value in (("input_path", input_path), ("output_path", output_path))
        if not value
    ]
    if missing:
        raise ValueError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    rdf_triples = parse_rdf(input_path)
    print(f'{len(rdf_triples)} triples extracted')
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(rdf_triples, f, ensure_ascii=False, indent=4)
# Run the knowledge-base export when this cell is executed as the main module.
if __name__ == '__main__':
  knowledge_base()

31310 triples extracted


In [None]:
# Load a triples export for inspection. The notebook writes its JSON exports
# as UTF-8 with ensure_ascii=False, so read with the same encoding instead of
# relying on the platform default.
with open("/content/triples.json", "r", encoding="utf-8") as file:
    triples = json.load(file)

In [None]:
# Print every loaded triple, one [subject, predicate, object] list per line.
# NOTE(review): this dumps the whole list; consider a slice such as
# triples[:10] when the file is large.
for triple in triples:
  print(triple)

['http://example.org/entities/MailDatabase', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://example.org/d3f/Database']
['http://example.org/entities/IMAPServer', 'http://example.org/d3f/reads', 'http://example.org/entities/MailDatabase']
['http://example.org/entities/EmailClient', 'http://example.org/d3f/SendsEmail', 'http://example.org/entities/SMTPServer']
['http://example.org/entities/EmailClient', 'http://www.w3.org/2000/01/rdf-schema#label', 'Email Client']
['http://example.org/entities/IMAPServer', 'http://www.w3.org/2000/01/rdf-schema#label', 'IMAP Server']
['http://example.org/entities/MailDatabase', 'http://www.w3.org/2000/01/rdf-schema#label', 'Mail Database']
['http://example.org/entities/SMTPServer', 'http://example.org/d3f/writes', 'http://example.org/entities/MailDatabase']
['http://example.org/entities/SMTPServer', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://example.org/d3f/MailServer']
['http://example.org/entities/IMAPServer', 'http://example

## Vector Database

In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/1.vector_database
!pip install -r requirements.txt

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/1.vector_database
Collecting git+https://github.com/pykeen/pykeen.git (from -r requirements.txt (line 3))
  Cloning https://github.com/pykeen/pykeen.git to /tmp/pip-req-build-w8h2dca2
  Running command git clone --filter=blob:none --quiet https://github.com/pykeen/pykeen.git /tmp/pip-req-build-w8h2dca2
  Resolved https://github.com/pykeen/pykeen.git to commit 8b4d4811bb70a9765f4c5dd45011cdd87749502a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting click_default_group (from pykeen==1.11.1.dev0->-r requirements.txt (line 3))
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting optuna>=2.0.0 (from pykeen==1.11.1.dev0->-r requirements.txt (line 3))
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting more_click (from pykeen==1.11.1.dev0->-r requi

In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files


In [None]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, Property, DataType
import re
import os
import json

def extract_name(url):
    """Return the local name of a URI: the text after its last ``#`` or ``/``.

    Triple components are not always URIs: literal objects (labels,
    definitions, comments) may contain ``/`` or ``#`` themselves. Splitting
    those would silently truncate the text, so any value that does not look
    like a URI (no scheme prefix, or containing whitespace) is returned
    unchanged.

    Args:
        url: a URI or literal value as a string.

    Returns:
        The last path/fragment segment for URIs, otherwise ``url`` itself.
    """
    # URI shape: "<scheme>:" followed by characters without any whitespace.
    if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+.\-]*:\S*', url):
        return url
    # Ignore trailing separators so "http://host/ns/" yields "ns", not "".
    trimmed = url.rstrip('#/')
    return re.split(r'[#/]', trimmed)[-1]

def vector_database():
    """Upload the exported triples to a Weaviate Cloud collection named "Triple".

    Reads the JSON triples file named by the ``output_path`` environment
    variable, connects to the cluster given by ``WEAVIATE_URL`` /
    ``WEAVIATE_API_KEY`` (passing ``OPENAI_API_TOKEN`` as the OpenAI key for
    the ``text2vec_openai`` vectorizer), creates the "Triple" collection and
    inserts one object per triple. Each object stores the text
    ``subject--predicate--object`` in both its ``triple`` and ``source``
    properties.

    Raises:
        KeyError: if ``OPENAI_API_TOKEN`` is not set.
    """
    # Read the local input first so a missing or invalid file fails before
    # any remote state (the collection) has been created.
    with open(os.getenv("output_path"), "r", encoding="utf-8") as file:
        triples = json.load(file)

    weaviate_url = os.getenv("WEAVIATE_URL")
    weaviate_api_key = os.getenv("WEAVIATE_API_KEY")

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
        headers={
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_TOKEN"]
        }
    )

    try:
        print(client.is_ready())

        # NOTE(review): creation presumably fails if "Triple" already exists
        # (e.g. on a re-run); delete the collection first in that case.
        client.collections.create(
            "Triple",
            vectorizer_config=Configure.Vectorizer.text2vec_openai(),
            properties=[
                Property(name="triple", data_type=DataType.TEXT),
                Property(name="source", data_type=DataType.TEXT)
            ]
        )

        # Look the collection up once and keep a single batch context open for
        # the whole upload; opening one per triple sends every object as its
        # own batch of one.
        collection = client.collections.get("Triple")

        with collection.batch.dynamic() as batch:
            for index, triple in enumerate(triples, start=1):
                subject = extract_name(triple[0])
                predicate = extract_name(triple[1])
                object_ = extract_name(triple[2])
                source = subject + '--' + predicate + '--' + object_

                # Payload to upload
                data_object = {
                    "triple": source,
                    "source": source
                }

                batch.add_object(properties=data_object)

                # Progress message; the object is queued here and sent by the
                # batcher, so this does not confirm a successful upload.
                print(f"Caricata tripla {index} di {len(triples)}: {data_object}")

        # Surface objects the server rejected instead of leaving them only in
        # the client's error log.
        failed_objects = collection.batch.failed_objects
        if failed_objects:
            print(f"{len(failed_objects)} objects failed to upload; first failure: {failed_objects[0]}")
    finally:
        # Always release the connection, even when the upload fails midway.
        client.close()

# Run the vector-database upload when this cell is executed as the main module.
if __name__ == '__main__':
    vector_database()


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Caricata tripla 7438 di 31310: {'triple': 'T1594--definition--002)). These sites may also have details highlighting business operations and relationships.(Citation: Comparitech Leak)', 'source': 'T1594--definition--002)). These sites may also have details highlighting business operations and relationships.(Citation: Comparitech Leak)'}
Caricata tripla 7439 di 31310: {'triple': 'CWE-606--cwe-id--CWE-606', 'source': 'CWE-606--cwe-id--CWE-606'}
Caricata tripla 7440 di 31310: {'triple': 'CWE-531--subClassOf--CWE-540', 'source': 'CWE-531--subClassOf--CWE-540'}
Caricata tripla 7441 di 31310: {'triple': 'CWE-655--label--Insufficient Psychological Acceptability', 'source': 'CWE-655--label--Insufficient Psychological Acceptability'}
Caricata tripla 7442 di 31310: {'triple': 'CWE-1245--label--Improper Finite State Machines (FSMs) in Hardware Logic', 'source': 'CWE-1245--label--Improper Finite State Machines (FSMs) in Hardware Logic'

ERROR:weaviate-client:{'message': 'Failed to send 1 objects in a batch of 1. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}


Caricata tripla 12438 di 31310: {'triple': 'CWE-119--type--NamedIndividual', 'source': 'CWE-119--type--NamedIndividual'}
Caricata tripla 12439 di 31310: {'triple': 'PacketLog--type--Class', 'source': 'PacketLog--type--Class'}
Caricata tripla 12440 di 31310: {'triple': 'CCI-002009_v2022-04-05--definition--The information system accepts Personal Identity Verification (PIV) credentials from other federal agencies.', 'source': 'CCI-002009_v2022-04-05--definition--The information system accepts Personal Identity Verification (PIV) credentials from other federal agencies.'}
Caricata tripla 12441 di 31310: {'triple': 'NIST_SP_800-53_R5_AC-7_4--member-of--NIST_SP_800-53_R5', 'source': 'NIST_SP_800-53_R5_AC-7_4--member-of--NIST_SP_800-53_R5'}
Caricata tripla 12442 di 31310: {'triple': 'Reference-CAR-2021-02-001%3AWebshell-IndicativeProcessTree_MITRE--kb-organization--MITRE', 'source': 'Reference-CAR-2021-02-001%3AWebshell-IndicativeProcessTree_MITRE--kb-organization--MITRE'}
Caricata tripla 124



[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Caricata tripla 26311 di 31310: {'triple': 'T1212--subClassOf--CredentialAccessTechnique', 'source': 'T1212--subClassOf--CredentialAccessTechnique'}
Caricata tripla 26312 di 31310: {'triple': 'MessageTransferAgent--type--Class', 'source': 'MessageTransferAgent--type--Class'}
Caricata tripla 26313 di 31310: {'triple': 'CWE-328--type--Class', 'source': 'CWE-328--type--Class'}
Caricata tripla 26314 di 31310: {'triple': 'LinuxCloneArgumentCLONE_THREAD--subClassOf--OSAPICreateThread', 'source': 'LinuxCloneArgumentCLONE_THREAD--subClassOf--OSAPICreateThread'}
Caricata tripla 26315 di 31310: {'triple': 'NIST_SP_800-53_R5_AC-2_7--type--NamedIndividual', 'source': 'NIST_SP_800-53_R5_AC-2_7--type--NamedIndividual'}
Caricata tripla 26316 di 31310: {'triple': 'T1499--subClassOf--ImpactTechnique', 'source': 'T1499--subClassOf--ImpactTechnique'}
Caricata tripla 26317 di 31310: {'triple': 'CCI-001211_v2022-04-05--label--CCI-001211', 'sou

## RAG

In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/2.rag
!cat requirements.txt

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/2.rag
langchain
langchain-community
weaviate-client
langchain-openai
langchain-weaviate

In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/2.rag
!pip install -r requirements.txt

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/2.rag
Collecting langchain-community (from -r requirements.txt (line 2))
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting weaviate-client (from -r requirements.txt (line 3))
  Downloading weaviate_client-4.10.2-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain-openai (from -r requirements.txt (line 4))
  Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-weaviate (from -r requirements.txt (line 5))
  Downloading langchain_weaviate-0.0.3-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community->-r requirements.txt (line 2))
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community->-r requirements.txt (line 2))
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain (from -r requirements.txt (line 1)

In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files


In [None]:
import os
from weaviate.classes.query import MetadataQuery
from langchain_openai import ChatOpenAI
import weaviate
from weaviate.classes.init import Auth
import textwrap

def rag(questions=None):
    """Compare retrieval-augmented answers with plain LLM answers.

    Connects to the Weaviate cluster given by ``WEAVIATE_URL`` /
    ``WEAVIATE_API_KEY``. For each question it prints the 4 nearest objects of
    the "Triple" collection with their distances, then the answer of
    ``gpt-3.5-turbo`` when given those triples as context, then the answer of
    the same model without context.

    Args:
        questions: optional iterable of question strings. Defaults to the five
            built-in demo questions.

    Raises:
        KeyError: if ``OPENAI_API_TOKEN`` is not set.
    """
    if questions is None:
        questions = [
            "What does the T1115 technique read?",
            "What is the definition of PacketLog?",
            "What offensive techniques may modify Windows Registry Key?",
            "What techniques may access the Mail Server?",
            "What is the definition of EmailRemoval?",
        ]

    weaviate_url = os.getenv("WEAVIATE_URL")
    weaviate_api_key = os.getenv("WEAVIATE_API_KEY")

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
        headers={
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_TOKEN"]
        }
    )

    try:
        print(client.is_ready())

        # Build the collection handle and the chat model once; both are
        # reused for every question.
        collection = client.collections.get("Triple")
        llm = ChatOpenAI(
            temperature=0,
            api_key=os.getenv("OPENAI_API_TOKEN"),
            model_name="gpt-3.5-turbo"
        )

        def similarity_search(question: str):
            """Return the 4 stored objects nearest to ``question``, with distances."""
            response = collection.query.near_text(
                query=question,
                limit=4,
                return_metadata=MetadataQuery(distance=True)
            )
            return response.objects

        def generate_RAG_answer(question: str, context: str):
            """Ask the model to answer ``question`` using ``context``."""
            prompt = f"Answer the question based on the context: \n\nContext: {context}\n\nQuestion: {question}"
            return llm.invoke(prompt).content

        def generate_LLM_answer(question: str):
            """Ask the model to answer ``question`` without any context."""
            return llm.invoke(question).content

        def pretty_print_response(question: str):
            """Print the retrieved triples, the RAG answer and the plain LLM answer."""
            print(f"Question: {question}")
            print("=" * 90)
            print("Similarity Search Results:")
            # Retrieve the most similar objects
            hits = similarity_search(question)
            for o in hits:
                print(o.properties)
                print(o.metadata.distance)
            print("=" * 90)
            # Give the model only the retrieved triple texts, one per line,
            # rather than the repr of the raw result objects.
            context = "\n".join(str(o.properties["triple"]) for o in hits)
            answer = generate_RAG_answer(question, context)
            print("=" * 90)
            print("RAG Answer:")
            print(textwrap.fill(answer, 60))
            print("=" * 90)
            print("LLM Answer:")
            print(textwrap.fill(generate_LLM_answer(question), 60))
            print("=" * 90)

        for position, question in enumerate(questions):
            if position:
                # Two blank lines between consecutive questions.
                print("")
                print("")
            pretty_print_response(question)
    finally:
        # Always release the connection, even if a query or LLM call fails.
        client.close()




# Run the RAG comparison when this cell is executed as the main module.
if __name__ == "__main__":
    rag()

True
Question: What does the T1115 technique read?
Similarity Search Results:
{'triple': 'T1114.001--reads--Email', 'source': 'T1114.001--reads--Email'}
0.39078670740127563
{'triple': 'T1115--reads--Clipboard', 'source': 'T1115--reads--Clipboard'}
0.4021607041358948
{'triple': 'T1100--comment--This technique has been revoked by T1505.003', 'source': 'T1100--comment--This technique has been revoked by T1505.003'}
0.42233288288116455
{'triple': 'T1150--comment--This technique has been revoked by T1547.011', 'source': 'T1150--comment--This technique has been revoked by T1547.011'}
0.43406248092651367
RAG Answer:
The T1115 technique reads Clipboard.
LLM Answer:
The T1115 technique is a Windows registry modification
technique that involves reading the Windows registry to
gather information about the system and its configuration.
This technique is commonly used by threat actors to gather
information for reconnaissance purposes.


Question: What is the definition of PacketLog?
Similarity Sear

In [None]:
import os
from weaviate.classes.query import MetadataQuery
from langchain_openai import ChatOpenAI
import weaviate
from weaviate.classes.init import Auth
import textwrap

def rag(questions=None):
    """Compare retrieval-augmented answers with plain LLM answers (architecture cluster).

    Connects to the Weaviate cluster given by ``WEAVIATE_URL_ARCH`` /
    ``WEAVIATE_API_KEY_ARCH``. For each question it prints the 4 nearest
    objects of the "Triple" collection with their distances, then the answer
    of ``gpt-3.5-turbo`` when given those triples as context, then the answer
    of the same model without context.

    Args:
        questions: optional iterable of question strings. Defaults to the
            three built-in architecture questions.

    Raises:
        KeyError: if ``OPENAI_API_TOKEN`` is not set.
    """
    if questions is None:
        questions = [
            "What does the user do in my architecture?",
            "How many types of mail servers do I have in my architecture?",
            "Which mail server retrieves emails for the user?",
        ]

    weaviate_url = os.getenv('WEAVIATE_URL_ARCH')
    weaviate_api_key = os.getenv('WEAVIATE_API_KEY_ARCH')

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
        headers={
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_TOKEN"]
        }
    )

    try:
        print(client.is_ready())

        # Build the collection handle and the chat model once; both are
        # reused for every question.
        collection = client.collections.get("Triple")
        llm = ChatOpenAI(
            temperature=0,
            api_key=os.getenv("OPENAI_API_TOKEN"),
            model_name="gpt-3.5-turbo"
        )

        def similarity_search(question: str):
            """Return the 4 stored objects nearest to ``question``, with distances."""
            response = collection.query.near_text(
                query=question,
                limit=4,
                return_metadata=MetadataQuery(distance=True)
            )
            return response.objects

        def generate_RAG_answer(question: str, context: str):
            """Ask the model to answer ``question`` using ``context``."""
            prompt = f"Answer the question based on the context: \n\nContext: {context}\n\nQuestion: {question}"
            return llm.invoke(prompt).content

        def generate_LLM_answer(question: str):
            """Ask the model to answer ``question`` without any context."""
            return llm.invoke(question).content

        def pretty_print_response(question: str):
            """Print the retrieved triples, the RAG answer and the plain LLM answer."""
            print(f"Question: {question}")
            print("=" * 90)
            print("Similarity Search Results:")
            # Retrieve the most similar objects
            hits = similarity_search(question)
            for o in hits:
                print(o.properties)
                print(o.metadata.distance)
            print("=" * 90)
            # Give the model only the retrieved triple texts, one per line,
            # rather than the repr of the raw result objects.
            context = "\n".join(str(o.properties["triple"]) for o in hits)
            answer = generate_RAG_answer(question, context)
            print("=" * 90)
            print("RAG Answer:")
            print(textwrap.fill(answer, 60))
            print("=" * 90)
            print("LLM Answer:")
            print(textwrap.fill(generate_LLM_answer(question), 60))
            print("=" * 90)

        for position, question in enumerate(questions):
            if position:
                # Separator line between consecutive questions.
                print("--------------------------------------------------------------------------------------")
            pretty_print_response(question)
    finally:
        # Always release the connection, even if a query or LLM call fails.
        client.close()


# Run the architecture RAG comparison when this cell is executed as the main module.
if __name__ == "__main__":
    rag()

True
Question: What does the user do in my architecture?
Similarity Search Results:
{'triple': 'EmailClient--type--User', 'source': 'EmailClient--type--User'}
0.7078037261962891
{'triple': 'IMAPServer--reads--MailDatabase', 'source': 'IMAPServer--reads--MailDatabase'}
0.7525630593299866
{'triple': 'IMAPServer--EmailRetrieval--EmailClient', 'source': 'IMAPServer--EmailRetrieval--EmailClient'}
0.7657593488693237
{'triple': 'SMTPServer--writes--MailDatabase', 'source': 'SMTPServer--writes--MailDatabase'}
0.7871342897415161
RAG Answer:
In your architecture, the user is associated with the
'EmailClient' type.
LLM Answer:
In your architecture, the user interacts with the system by
inputting commands, requests, or data through the user
interface. The user's actions trigger processes within the
system, such as data processing, communication with external
systems, or generating outputs. The user may also receive
feedback, notifications, or results from the system based on
their interactions. Ov