In [4]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before the
# service clients below are imported and constructed.
# NOTE(review): presumably .env carries the Google Cloud / Vertex AI settings —
# confirm what it is expected to contain.
load_dotenv()

# NOTE(review): FastAPI is not used by any cell visible in this notebook.
from fastapi import FastAPI
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatVertexAI
from langchain.embeddings import VertexAIEmbeddings
from langchain.vectorstores import ElasticVectorSearch

In [6]:
# Embedding model handed to the vector store below.
embeddings = VertexAIEmbeddings()

# Vector store handle for the "elastic-index" index on the Elasticsearch
# instance at http://elasticsearch:9200.
# NOTE(review): the host name looks like a container/service alias — confirm it
# resolves in the environment where this notebook is run.
db = ElasticVectorSearch(
    embedding=embeddings,
    index_name="elastic-index",
    elasticsearch_url="http://elasticsearch:9200",
)




In [10]:
# Build the question-answering chain: a Vertex AI chat model (temperature 0)
# answers using whatever the vector-store retriever returns.
llm = ChatVertexAI(temperature=0)
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # LangChain's "stuff" combine strategy — see the RetrievalQA docs
    return_source_documents=True,  # chain output then carries a 'source_documents' entry
)


In [12]:
# Question passed to the retrieval QA chain in the next cell.
query = "What could a malicious user potentially do"


In [14]:
    response = qa(query)

In [15]:
# Display the raw chain output (query, answer and retrieved source documents).
response

{'query': 'What could a malicious user potentially do',
 'result': ' A malicious user could potentially view sensitive information of other organisations, or take over other superhost user accounts on the site. ',
 'source_documents': [Document(page_content='An example of the results of a brute force attack is included in the screenshot below. In this attack, document ids between 1085000 and 1085999 were brute forced with event id 145 at the endpoint GET /simdojo/events/145/documents/<document id>, each responding with a valid document which belonged to a user from another organisation:\n\nFigure - Document brute force results: A response status code of 200 indicates that a document from another organisation was able to be accessed.\n\nRecommendation\n\nWe recommend adding an authorisation check at the aﬀected endpoint to ensure that the user requesting the document should be able to read the document.\n\nAffected Endpoint\n\nGET /simdojo/events/<event id>/documents/<document id>\n\nMa

In [26]:
# File path recorded in the metadata of the third returned source document.
third_doc = response['source_documents'][2]
third_doc.metadata['source']

'/app/data/Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf'

In [27]:
# One source path per returned document; duplicates are kept.
source_docs = response['source_documents']
results = [d.metadata['source'] for d in source_docs]

In [28]:
# Inspect the collected source paths.
results

['/app/data/Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 '/app/data/Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 '/app/data/Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 '/app/data/Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf']

In [29]:
# Collapse the list to its distinct source paths.
set(results)

{'/app/data/Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf'}

In [30]:
import os.path


In [31]:
# File name (directory part stripped) of every returned document's source;
# this overwrites the earlier full-path `results`.
results = [
    os.path.basename(d.metadata['source'])
    for d in response['source_documents']
]

In [32]:
# Inspect the file names.
results

['Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf']

In [33]:
# Distinct file names among the returned documents.
set(results)

{'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf'}

In [34]:
# Source file name for each returned document, duplicates kept.
source_paths = (d.metadata['source'] for d in response['source_documents'])
sources = [os.path.basename(p) for p in source_paths]


In [35]:
# Inspect the per-document source file names.
sources

['Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf']

In [36]:
# Distinct source file names across all returned documents
# (rebinds `sources` from a list to a set).
sources = {
    os.path.basename(d.metadata['source'])
    for d in response['source_documents']
}
sources

{'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf'}

In [37]:
# File name per returned document as a list (display only, nothing is assigned).
[os.path.basename(d.metadata['source']) for d in response['source_documents']]

['Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf',
 'Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf']

In [38]:
# Distinct source file names as a list (display only).
# sorted() is used instead of list(set(...)) because set iteration order for
# strings is arbitrary and can change between kernel sessions; sorting keeps the
# output reproducible once more than one file is returned.
sorted({os.path.basename(d.metadata['source']) for d in response['source_documents']})

['Market Dojo (M23-1042) Application and Organisation Penetration Test.pdf']

In [39]:
# Display the full chain output again before extracting the answer text.
response


{'query': 'What could a malicious user potentially do',
 'result': ' A malicious user could potentially view sensitive information of other organisations, or take over other superhost user accounts on the site. ',
 'source_documents': [Document(page_content='An example of the results of a brute force attack is included in the screenshot below. In this attack, document ids between 1085000 and 1085999 were brute forced with event id 145 at the endpoint GET /simdojo/events/145/documents/<document id>, each responding with a valid document which belonged to a user from another organisation:\n\nFigure - Document brute force results: A response status code of 200 indicates that a document from another organisation was able to be accessed.\n\nRecommendation\n\nWe recommend adding an authorisation check at the aﬀected endpoint to ensure that the user requesting the document should be able to read the document.\n\nAffected Endpoint\n\nGET /simdojo/events/<event id>/documents/<document id>\n\nMa

In [40]:
# Only the generated answer text from the chain output.
response['result']


' A malicious user could potentially view sensitive information of other organisations, or take over other superhost user accounts on the site. '