# Pipeline 2
+ knowledge base
+ vector database
+ rag

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [3]:
from dotenv import load_dotenv
load_dotenv("/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/.env", override=True)

True

## Knowledge base
In this component we perform the loading of the knowledge base

In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/0.knowledge_base
!pip install -r requirements.txt

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/0.knowledge_base


In [None]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files


In [None]:
import rdflib
import json
import os

def knowledge_base():
  def parse_rdf(file_path):
      g = rdflib.Graph()
      try:
          g.parse(file_path, format='ttl')
      except rdflib.exceptions.ParserError as e:
          print(f"Errore di parsing: {e}")
      triples = [(str(s), str(p), str(o)) for s, p, o in g]
      return triples

  input_path = os.getenv("input_path")
  rdf_triples = parse_rdf(input_path)
  print(f'{len(rdf_triples)} triples extracted')
  with open(os.getenv("output_path"), 'w', encoding='utf-8') as f:
    json.dump(rdf_triples, f, ensure_ascii=False, indent=4)
if __name__ == '__main__':
  knowledge_base()

31310 triples extracted


In [None]:
with open("/content/triples.json", "r") as file:
      triples = json.load(file)

In [None]:
for triple in triples:
  print(triple)

['http://example.org/entities/MailDatabase', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://example.org/d3f/Database']
['http://example.org/entities/IMAPServer', 'http://example.org/d3f/reads', 'http://example.org/entities/MailDatabase']
['http://example.org/entities/EmailClient', 'http://example.org/d3f/SendsEmail', 'http://example.org/entities/SMTPServer']
['http://example.org/entities/EmailClient', 'http://www.w3.org/2000/01/rdf-schema#label', 'Email Client']
['http://example.org/entities/IMAPServer', 'http://www.w3.org/2000/01/rdf-schema#label', 'IMAP Server']
['http://example.org/entities/MailDatabase', 'http://www.w3.org/2000/01/rdf-schema#label', 'Mail Database']
['http://example.org/entities/SMTPServer', 'http://example.org/d3f/writes', 'http://example.org/entities/MailDatabase']
['http://example.org/entities/SMTPServer', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://example.org/d3f/MailServer']
['http://example.org/entities/IMAPServer', 'http://example

## Vector Database

In [4]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/1.vector_database
!pip install -r requirements.txt

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/1.vector_database
Collecting git+https://github.com/pykeen/pykeen.git (from -r requirements.txt (line 3))
  Cloning https://github.com/pykeen/pykeen.git to /tmp/pip-req-build-if2eq7dg
  Running command git clone --filter=blob:none --quiet https://github.com/pykeen/pykeen.git /tmp/pip-req-build-if2eq7dg
  Resolved https://github.com/pykeen/pykeen.git to commit 8b4d4811bb70a9765f4c5dd45011cdd87749502a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting weaviate-client (from -r requirements.txt (line 4))
  Downloading weaviate_client-4.10.2-py3-none-any.whl.metadata (3.6 kB)
Collecting dataclasses-json (from pykeen==1.11.1.dev0->-r requirements.txt (line 3))
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting click_default_group (from pykeen==1.11.1.dev0->-r requirements.tx

In [14]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files


In [15]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, Property, DataType
import re
import os
import json

# Funzione per estrarre l'ultima parte dell'URL
def extract_name(url):
    return re.split(r'[#/]', url)[-1]

def vector_database():
    weaviate_url = os.getenv("WEAVIATE_URL")
    weaviate_api_key = os.getenv("WEAVIATE_API_KEY")

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
        headers={
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_TOKEN"]
        }
    )

    print(client.is_ready())

    client.collections.create(
        "Triple",
        vectorizer_config=Configure.Vectorizer.text2vec_openai(),
        properties=[
            Property(name="triple", data_type=DataType.TEXT),
            Property(name="source", data_type=DataType.TEXT)
        ]
    )

    with open(os.getenv("output_path"), "r") as file:
        triples = json.load(file)

    for index, triple in enumerate(triples[:500], start=1):
        subject = extract_name(triple[0])
        predicate = extract_name(triple[1])
        object_ = extract_name(triple[2])
        source = subject + '--' + predicate + '--' + object_

        # Payload da caricare
        data_object = {
            "triple": source,
            "source": source
        }

        collection = client.collections.get("Triple")

        with collection.batch.dynamic() as batch:
            batch.add_object(properties=data_object)

        # Stampa lo stato di avanzamento
        print(f"Caricata tripla {index} di {len(triples)}: {data_object}")

if __name__ == '__main__':
    vector_database()


True
Caricata tripla 1 di 31310: {'triple': 'Dyna-Q--label--Dyna-Q', 'source': 'Dyna-Q--label--Dyna-Q'}
Caricata tripla 2 di 31310: {'triple': 'T1555.005--definition--Adversaries may acquire user credentials from third-party password managers.(Citation: ise Password Manager February 2019) Password managers are applications designed to store user credentials, normally in an encrypted database. Credentials are typically accessible after a user provides a master password that unlocks the database. After the database is unlocked, these credentials may be copied to memory. These databases can be stored as files on disk.(Citation: ise Password Manager February 2019)', 'source': 'T1555.005--definition--Adversaries may acquire user credentials from third-party password managers.(Citation: ise Password Manager February 2019) Password managers are applications designed to store user credentials, normally in an encrypted database. Credentials are typically accessible after a user provides a maste

## RAG

In [6]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/2.rag
!pip install -r requirements.txt

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/2.rag
Collecting langchain-community (from -r requirements.txt (line 2))
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai (from -r requirements.txt (line 4))
  Downloading langchain_openai-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-weaviate (from -r requirements.txt (line 5))
  Downloading langchain_weaviate-0.0.3-py3-none-any.whl.metadata (2.7 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community->-r requirements.txt (line 2))
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain (from -r requirements.txt (line 1))
  Downloading langchain-0.3.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain->-r requirements.txt (line 1))
  Downloading langchain_core-0.3.29-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community->-r req

In [7]:
%cd /content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files

/content/drive/Shareddrives/Knowledge Graph/code/pipeline2/files


In [21]:
import os
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
import weaviate
from weaviate.classes.init import Auth
import textwrap

def rag():
    weaviate_url = os.getenv("WEAVIATE_URL")
    weaviate_api_key = os.getenv("WEAVIATE_API_KEY")

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
        headers={
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_TOKEN"]
        }
    )

    print(client.is_ready())


    embedding_model = OpenAIEmbeddings(
        api_key=os.getenv("OPENAI_API_TOKEN")
    )
    vectorstore = WeaviateVectorStore(client=client, index_name="Triple", text_key="triple", embedding=embedding_model, attributes=["source"])

    retriever = vectorstore.as_retriever()
    llm = ChatOpenAI(
        temperature=0,
        api_key=os.getenv("OPENAI_API_TOKEN"),
        model_name="gpt-3.5-turbo")
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )

    def prettychain(question: str) -> str:
        """Pretty print the chain's response to a question"""
        response = chain.invoke({"question": question},
            return_only_outputs=True,)
        print(textwrap.fill(response['answer'], 60))

    def prettychain3(question: str) -> None:
        """Pretty print the chain's response to a question, including sources"""
        response = chain.invoke({"question": question}, return_only_outputs=True)
        answer = response.get("answer", "No answer provided.")
        sources = response.get("sources", "No sources found.")
        print("Answer:")
        print(textwrap.fill(answer, 60))
        print("\nSources:")
        print(textwrap.fill(sources, 60))

    def prettychain2(question) -> str:
        """Pretty print the response to a question directly from the LLM"""
        response = llm.invoke(question)
        print(response.content)

    question = "What does the T1115 technique read?"
    print(question)
    print("=" * 90)
    print(" " * 35 + "RAG" + " " * 35)
    print("=" * 90)
    prettychain3(question)
    print("=" * 90)
    print(" " * 35 + "LLM" + " " * 35)
    print("=" * 90)
    prettychain2(question)
    print("=" * 90)
    print(" " * 35 + "SIMILARITY SEARCH" + " " * 35)
    print("=" * 90)
    print(vectorstore.similarity_search(question))
    print("=" * 90)
    print(" " * 35 + "RELEVANT DOCUMENTS" + " " * 35)
    print("=" * 90)
    print(retriever.invoke(question))

    print("")
    print("")
    question2 = "What is the definition of PacketLog?"
    print(question2)
    print("=" * 90)
    print(" " * 35 + "RAG" + " " * 35)
    print("=" * 90)
    prettychain3(question2)
    print("=" * 90)
    print(" " * 35 + "LLM" + " " * 35)
    print("=" * 90)
    prettychain2(question2)
    print("=" * 90)
    print(" " * 35 + "SIMILARITY SEARCH" + " " * 35)
    print("=" * 90)
    print(vectorstore.similarity_search(question2))
    print("=" * 90)
    print(" " * 35 + "RELEVANT DOCUMENTS" + " " * 35)
    print("=" * 90)
    print(retriever.invoke(question2))




if __name__ == "__main__":
    rag()

True
What does the T1115 technique read?
                                   RAG                                   
Answer:
The T1115 technique reads the clipboard.

Sources:
T1115--reads--Clipboard
                                   LLM                                   
The T1115 technique is a Windows registry modification technique that involves reading the Windows registry to gather information about the system and its configuration. This technique is commonly used by threat actors to gather information for reconnaissance purposes.
                                   SIMILARITY SEARCH                                   
[Document(metadata={'source': 'T1115--reads--Clipboard'}, page_content='T1115--reads--Clipboard'), Document(metadata={'source': 'EncryptedTunnels--definition--Encrypted encapsulation of routable network traffic.'}, page_content='EncryptedTunnels--definition--Encrypted encapsulation of routable network traffic.'), Document(metadata={'source': 'T1499.003--label--Applica

In [None]:
import os
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
import weaviate
from weaviate.classes.init import Auth
import textwrap

def rag():
    weaviate_url = "https://ztzcuq1r6ogar9anhtr2a.c0.europe-west3.gcp.weaviate.cloud"
    weaviate_api_key = "40WeAARm6JgzWOYN9rGUiY2ANZOlG4PtFmiY"

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
        headers={
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_TOKEN"]
        }
    )

    print(client.is_ready())


    embedding_model = OpenAIEmbeddings(
        api_key=os.getenv("OPENAI_API_TOKEN")
    )
    vectorstore = WeaviateVectorStore(client=client, index_name="Triple", text_key="triple", embedding=embedding_model, attributes=["source"])

    retriever = vectorstore.as_retriever()
    llm = ChatOpenAI(
        temperature=0,
        api_key=os.getenv("OPENAI_API_TOKEN"),
        model_name="gpt-3.5-turbo")
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )

    def prettychain(question: str) -> str:
        """Pretty print the chain's response to a question"""
        response = chain({"question": question},
            return_only_outputs=True,)
        print(textwrap.fill(response['answer'], 60))

    #def prettychain(question) -> str:
    #    """Pretty print the chain's response to a question"""
    #    response = chain({"question": question}, return_only_outputs=True)
    #    print(response)

    def prettychain2(question) -> str:
        """Pretty print the response to a question directly from the LLM"""
        response = llm.invoke(question)
        print(response.content)

    question = "What does the user do in my architecture?"
    print(question)
    print("RAG")
    prettychain(question)
    print("LLM")
    prettychain2(question)
    print("--------------------------------------------------------------------------------------")
    question2 = "How many types of mail servers do I have in my architecture?"
    print(question2)
    print("RAG")
    prettychain(question2)
    print("LLM")
    prettychain2(question2)
    print("--------------------------------------------------------------------------------------")
    question3 = "Which mail server retrieves emails for the user?"
    print(question3)
    print("RAG")
    prettychain(question3)
    print("LLM")
    prettychain2(question3)


    #print(vectorstore.similarity_search(question))
    #print(retriever.get_relevant_documents(question))



if __name__ == "__main__":
    rag()

True
What does the user do in my architecture?
RAG
The user uses the EmailClient in the architecture.
LLM
In your architecture, the user interacts with the system by inputting commands, requests, or data through the user interface. The user's actions trigger processes within the system, such as data processing, communication with external systems, or generating outputs. The user may also receive feedback, notifications, or results from the system based on their interactions. Overall, the user plays a crucial role in driving the functionality and behavior of the architecture.
--------------------------------------------------------------------------------------
How many types of mail servers do I have in my architecture?
RAG
You have two types of mail servers in your architecture.
LLM
It is not possible to determine the number of mail servers in your architecture without more information about your specific setup. The number of mail servers can vary depending on the size and complexity 