In [None]:
pip uninstall -r requirements.txt

In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing Libraries
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone as PineconeClient, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load variables from a local .env file into the process environment so the
# os.environ lookups for OPENAI_API_KEY / PINECONE_API_KEY further down can find them.
# `load_dotenv` is imported in the notebook's import cell.
load_dotenv()

True

In [4]:
## Read the file
def read_doc(directory):
    """Load the PDF files found in ``directory`` with ``PyPDFDirectoryLoader``.

    Args:
        directory: Path of the folder that holds the PDF files.

    Returns:
        The documents returned by ``PyPDFDirectoryLoader(directory).load()``.

    Raises:
        FileNotFoundError: If ``directory`` is not an existing directory.
    """
    # Fail fast on a mistyped path instead of risking an empty load that would
    # later be chunked and indexed as "nothing".
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"PDF directory not found: {directory!r}")

    return PyPDFDirectoryLoader(directory).load()

In [5]:
# Folder holding the policy PDFs. Set POLICY_DATA_DIR to run the notebook on another
# machine; when it is unset the original local path is used.
DATA_DIR = os.environ.get("POLICY_DATA_DIR", "D:/Security Policy Assistant/data")

doc = read_doc(DATA_DIR)
len(doc)

55

In [28]:
# Divide data into chunks

def chunk_data(docs, chunk_size=1000, chunk_overlap=150):
    """Split ``docs`` into smaller documents with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split (passed to ``split_documents``).
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)


In [29]:
documents = chunk_data(docs=doc)
# Show only a short preview; echoing every chunk floods the notebook output.
documents[:3]

[Document(metadata={'producer': 'Skia/PDF m126 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Access Control Policy Template', 'source': 'D:\\Security Policy Assistant\\data\\Access-Control-Policy-Template-1.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content="AccessControl PolicyTemplate\nDocument Control\nOrganizationName: [Name]Version: 1.0EffectiveDate: [Date]ReviewDate: [Date]Approvedby: [Name]PolicyOwner: [Name]PolicyContact: [Email/Phone]\n---\nPurpose\nThepurposeof thisAccessControl Policyistoestablishtheprinciplesandstandardsbywhich[OrganizationName] will provideaccesstoinformationsystems, ensuresecurity, protectsensitivedata, andcomplywithrelevant regulations.\nScope\nThispolicyappliestoall employees, contractors, vendors, andother personnel withaccessto[OrganizationName]'sinformationsystemsanddata.\nPolicyStatement\n[OrganizationName] will implement andmaintainaneffectiveaccesscontrol systemthatensuresthat accesstoinformationisrestri

In [30]:
# Embedding Technique of OpenAI
# The model is pinned explicitly because the Pinecone index below is created with a
# hard-coded dimension of 1536, which is the vector size this model produces.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=os.environ['OPENAI_API_KEY'],
)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000019BEF138A00>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000019BEFA1F430>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [31]:
# Smoke test: embed one short query so the vector size can be inspected in the next cell.
vectors = embeddings.embed_query("How are you?")

In [32]:
# Embedding dimensionality; the Pinecone index is created below with dimension=1536.
len(vectors)

1536

In [33]:
# Storing in VectorDB

# Pinecone client, authenticated with the API key read from the environment.
pc = PineconeClient(api_key=os.environ["PINECONE_API_KEY"])


In [34]:
index_name = "security-policy-index"

# Create the serverless index only when it is missing, so re-running this cell
# does not attempt to create it a second time.
index_already_exists = index_name in pc.list_indexes().names()
if not index_already_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding size
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used for index-level calls such as describe_index_stats().
index = pc.Index(index_name)


In [35]:
# Upload the chunks only while the index is still empty. Stored vectors carry
# auto-generated ids (see the UUID ids in the search results), so every unguarded
# re-run of this cell would insert the same chunks again as duplicates.
# NOTE: if the source PDFs change, clear or recreate the index before re-running.
if index.describe_index_stats().total_vector_count == 0:
    vectorstore = PineconeVectorStore.from_documents(
        documents,
        embeddings,
        index_name=index_name
    )
else:
    vectorstore = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )
print(index.describe_index_stats())


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 403}},
 'total_vector_count': 403,
 'vector_type': 'dense'}


In [36]:
# Reconnect to the populated index. Reuse the `embeddings` object configured earlier
# so queries are embedded with the same model and API key as the stored documents,
# instead of building a second, separately configured OpenAIEmbeddings().
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

def retrieve_query(query, k=5):
    """Search the module-level ``vectorstore`` for the ``k`` best matches to ``query``.

    Args:
        query: Question text to search for.
        k: Number of matches to request (default 5).

    Returns:
        The result of ``similarity_search_with_score``; callers in this notebook
        unpack it as ``(document, score)`` pairs.
    """
    return vectorstore.similarity_search_with_score(query, k=k)


In [37]:
# Manual check of retrieval: fetch the top matches for one sample question and
# display the (document, score) pairs.
results = retrieve_query("What is the policy for vendor access?")
results


[(Document(id='287c9dd1-f716-498c-a6be-4df7a20d2638', metadata={'creationdate': '', 'creator': 'PyPDF', 'page': 2.0, 'page_label': '3', 'producer': 'Skia/PDF m133 Google Docs Renderer', 'source': 'D:\\Security Policy Assistant\\data\\Vendor-Risk-Assessment-Template.pdf', 'title': 'Vendor Risk Assessment Template', 'total_pages': 7.0}, page_content='VolumeofInteraction\nHowfrequentlydoyouinteract withthisvendor?\n3. Vendor SecurityPractices\nAssessthevendor’scybersecuritypracticesandtheir approachtoriskmanagement.\nSecurityPractice Questions Yes/No Comments/Evidence\nSecurityPolicies Doesthevendorhavedocumentedsecuritypolicies?\nThird-PartyCertiﬁcations\nDotheyholdcertiﬁcations(ISO27001, SOC2, etc.)?\nList certiﬁcationsifany.\nDataProtectionMeasures\nIsdataencrypted(intransit andat rest)?\nIncident ResponsePlan\nDotheyhaveanincident responseprocedure?\nAccessControls Arerole-basedaccesscontrols(RBAC)implemented?\nVulnerabilityManagement\nHowfrequentlydotheyperformvulnerabilityscans?\nEm

In [38]:
def confidence_score(results, high_threshold=0.8, medium_threshold=0.65):
    """Grade retrieval quality from the mean score of the retrieved matches.

    Args:
        results: Sequence of ``(document, score)`` pairs, as produced by
            ``retrieve_query``. Only the score of each pair is used.
        high_threshold: Minimum mean score graded as ``"High"`` (default 0.8).
        medium_threshold: Minimum mean score graded as ``"Medium"`` (default 0.65).

    Returns:
        A ``(label, mean_score)`` tuple where ``label`` is ``"High"``, ``"Medium"``
        or ``"Low"``. Empty ``results`` yields ``("Low", 0.0)``.
    """
    if not results:
        return "Low", 0.0

    # Larger means better here: the thresholds reward high values, so the score is
    # treated as a similarity rather than a distance.
    avg_similarity = sum(score for _, score in results) / len(results)

    if avg_similarity >= high_threshold:
        return "High", avg_similarity
    if avg_similarity >= medium_threshold:
        return "Medium", avg_similarity
    return "Low", avg_similarity


In [46]:
# Prompt text: restricts the model to the retrieved context and tells it to reply
# "Insufficient evidence" when the context does not contain the answer.
qa_template = """
You are a security compliance assistant.

Rules:
- Answer ONLY from the context
- If information is missing, say "Insufficient evidence"
- Be concise and factual

Context:
{context}

Question:
{question}

Answer:
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=qa_template,
)

# Chat model used to write the final answer.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)

# Pipeline: fill the prompt -> call the model -> return the reply as a plain string.
chain = prompt | llm | StrOutputParser()


In [48]:
def answer_question(question):
    """Answer ``question`` from the indexed policy documents.

    Retrieves matches with ``retrieve_query``, grades them with
    ``confidence_score`` and, unless the grade is ``"Low"``, asks ``chain`` to
    answer using the retrieved text as context.

    Args:
        question: The user's question as a string.

    Returns:
        A dict with keys ``"answer"`` and ``"confidence"``. The answer is
        ``"Insufficient evidence"`` when the question is blank or the retrieval
        confidence is ``"Low"``.
    """
    # A blank question has nothing to retrieve on; answer conservatively without
    # spending an embedding or LLM call.
    if not question or not question.strip():
        return {"answer": "Insufficient evidence", "confidence": "Low"}

    results = retrieve_query(question)

    # Only the label is part of the response; the numeric mean is not returned.
    confidence, _mean_score = confidence_score(results)

    if confidence == "Low":
        return {"answer": "Insufficient evidence", "confidence": confidence}

    # Join the retrieved chunks into a single context block for the prompt.
    context = "\n\n".join(chunk.page_content for chunk, _ in results)

    answer = chain.invoke({"context": context, "question": question})

    return {"answer": answer, "confidence": confidence}


In [50]:
# Pass the bare question text: typographic quotes around it would become part of the
# query that is embedded and sent to the model.
response = answer_question(
    "How is third-party risk managed?"
)

print(response)



{'answer': "Third-party risk is managed by assessing the vendor's cybersecurity practices, including their security policies, certifications, data protection measures, incident response plan, access controls, vulnerability management, and employee training.", 'confidence': 'High'}


In [51]:
# Sample questions used to spot-check the assistant end to end.
questions = [
    "What is the policy for vendor access?",
    "How are third-party risks managed?",
    "What approvals are required for external users?",
    "How are security incidents reported?",
    "Who is responsible for access reviews?"
]

# Print each question, the answer dict returned for it, and a separator line.
for q in questions:
    print(q)
    print(answer_question(q))
    print("----")


What is the policy for vendor access?
{'answer': 'Regular training programs must be conducted to ensure all users are aware of the access control policies and procedures. Users must acknowledge their understanding and acceptance of the access control policy. Violations may result in disciplinary action, up to and including termination of employment. The policy must be reviewed and updated regularly (at least annually) to ensure its continued effectiveness and relevance.', 'confidence': 'High'}
----
How are third-party risks managed?
{'answer': 'Third-party risks are managed by conducting vendor risk assessments to evaluate potential security, operational, financial, and compliance risks introduced by third-party vendors. Organizations ensure that vendors meet their standards for data protection, reliability, and regulatory compliance to mitigate supply chain vulnerabilities, data breaches, and disruptions to critical operations.', 'confidence': 'High'}
----
What approvals are required 