In [None]:
# Install required packages
!pip install  markdownify gdown
!pip install sentence-transformers numpy
!pip install langchain-community
!pip install "pinecone[grpc]"
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr
!pip install pdfminer.six markdownify gdown
!pip install neo4j

# Standard Libraries
import os
import re
import numpy as np
import gdown
import time
from google.colab import drive
from markdownify import markdownify
import gdown
from pdfminer.high_level import extract_text
from google.colab import drive

# LangChain Libraries
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Embedding Models
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as LangchainPinecone

Collecting markdownify
  Downloading markdownify-0.14.1-py3-none-any.whl.metadata (8.5 kB)
Downloading markdownify-0.14.1-py3-none-any.whl (11 kB)
Installing collected packages: markdownify
Successfully installed markdownify-0.14.1
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12=

Converting CIS AWS Foundations Benchmark (PDF) to Markdown using PDF Embedded Text Extractor

In [None]:
# Mount Google Drive so the converted Markdown can be written to it later
drive.mount('/content/drive')

# Google Drive file ID of the source PDF
file_id = "1X2b7MrtpbmYb1-syXUzdATe1rB88AETK"

# Local path for the downloaded PDF, and Drive path for the Markdown output
pdf_path = f"/content/{file_id}.pdf"
output_md_path = "/content/drive/My Drive/Capstone_G3/CIS_AWS_Foundations_Benchmark.md"

# Download the PDF from Google Drive
downloaded_path = gdown.download(f"https://drive.google.com/uc?id={file_id}", pdf_path, quiet=False)

# Stop here with a clear message if nothing was downloaded, instead of letting
# extract_text() fail on a missing file.
# NOTE(review): depending on the gdown version a failed download either raises
# or returns None; this check covers the None case. TODO confirm.
if not downloaded_path or not os.path.isfile(pdf_path):
    raise RuntimeError(
        f"Could not download Google Drive file {file_id}; "
        "check the file ID and its sharing permissions."
    )

# Extract text from PDF using pdfminer.six (NO OCR)
raw_text = extract_text(pdf_path)

# A line counts as a shell command when it begins with one of these tool names
# followed by whitespace.
cli_pattern = re.compile(r"^\s*(aws|gcloud|kubectl|az|terraform|docker|git|npm|pip)\s+.*", re.MULTILINE)


def format_as_markdown(text):
    """Wrap runs of CLI command lines in fenced ``bash`` code blocks.

    Every line of ``text`` is stripped of surrounding whitespace. Consecutive
    lines matching ``cli_pattern`` are enclosed in one fence; all other lines
    are emitted unchanged (after stripping).

    Args:
        text: Plain text, lines separated by "\\n".

    Returns:
        The reassembled text, lines joined with "\\n".
    """
    output = []
    fence_open = False

    for stripped in (raw.strip() for raw in text.split("\n")):
        is_command = cli_pattern.match(stripped) is not None

        if is_command and not fence_open:
            # First command of a run: open the fence before emitting it
            output.append("\n```bash")
            fence_open = True
        elif not is_command and fence_open:
            # Run of commands just ended: close the fence before the prose line
            output.append("```\n")
            fence_open = False

        output.append(stripped)

    # Text that ends on a command still needs its closing fence
    if fence_open:
        output.append("```\n")

    return "\n".join(output)

# Convert extracted text to Markdown format
markdown_text = format_as_markdown(raw_text)

# open(..., "w") does not create missing folders, so make sure the target
# directory on Drive exists before writing.
os.makedirs(os.path.dirname(output_md_path), exist_ok=True)

# Save Markdown output in Google Drive
with open(output_md_path, "w", encoding="utf-8") as md_file:
    md_file.write(markdown_text)

print(f"Markdown file saved in Google Drive: {output_md_path}")


Mounted at /content/drive


Downloading...
From: https://drive.google.com/uc?id=1X2b7MrtpbmYb1-syXUzdATe1rB88AETK
To: /content/1X2b7MrtpbmYb1-syXUzdATe1rB88AETK.pdf
100%|██████████| 2.03M/2.03M [00:00<00:00, 133MB/s]


Markdown file saved in Google Drive: /content/drive/My Drive/Capstone_G3/CIS_AWS_Foundations_Benchmark.md


Chunking: Splitting Markdown into corresponding subsections for efficient processing

In [None]:
# Location of the Markdown file produced by the conversion step
markdown_file_path = "/content/drive/My Drive/Capstone_G3/CIS_AWS_Foundations_Benchmark.md"

# Load the whole document into memory
with open(markdown_file_path, "r", encoding="utf-8") as f:
    markdown_text = f.read()

# Maps "<number> <title>" -> body lines (joined into a single string below)
sections = {}
current_section = None

# A header is a dotted number followed by a title,
# e.g. "4.10 Ensure security group changes are monitored (Manual)"
header_pattern = re.compile(r"^(\d+(?:\.\d+)*)\s+(.+)")

for line in map(str.strip, markdown_text.split("\n")):
    match = header_pattern.match(line)
    if match is not None:
        # New header: start a fresh section (a repeated header resets its body)
        number, title = match.groups()
        current_section = f"{number} {title}"
        sections[current_section] = []
        continue

    # Lines that appear before the first header are dropped
    if current_section:
        sections[current_section].append(line)

# Collapse each section's line list into one trimmed text block
sections = {title: "\n".join(body).strip() for title, body in sections.items()}

# Save each section as a separate file if it contains more than 50 words
output_dir = "/content/drive/My Drive/Capstone_G3/markdown_sections"
os.makedirs(output_dir, exist_ok=True)

saved_sections = []  # titles that passed the word-count filter and were written

for section, content in sections.items():
    word_count = len(content.split())  # Count words in the section
    if word_count > 50:  # Only keep markdowns with more than 50 words
        # Spaces and slashes are replaced so the title is usable as a file name
        filename = f"{output_dir}/{section.replace(' ', '_').replace('/', '_')}.md"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)
        saved_sections.append(section)
        print(f"Saved section: {filename} (Word count: {word_count})")

# Report only the sections that were written. Printing every key of `sections`
# here would also list the short sections that were filtered out above,
# contradicting the label.
print(f"Sections extracted (more than 50 words): {saved_sections}")


Saved section: /content/drive/My Drive/Capstone_G3/markdown_sections/1.1_Maintain_current_contact_details_(Manual).md (Word count: 473)
Saved section: /content/drive/My Drive/Capstone_G3/markdown_sections/17.2_Establish_and_Maintain_Contact_Information_for.md (Word count: 55)
Saved section: /content/drive/My Drive/Capstone_G3/markdown_sections/1.2_Ensure_security_contact_information_is_registered_(Manual).md (Word count: 248)
Saved section: /content/drive/My Drive/Capstone_G3/markdown_sections/17.6_Define_Mechanisms_for_Communicating_During.md (Word count: 58)
Saved section: /content/drive/My Drive/Capstone_G3/markdown_sections/1.3_Ensure_security_questions_are_registered_in_the_AWS_account.md (Word count: 286)
Saved section: /content/drive/My Drive/Capstone_G3/markdown_sections/5.1_Establish_and_Maintain_an_Inventory_of_Accounts.md (Word count: 57)
Saved section: /content/drive/My Drive/Capstone_G3/markdown_sections/1.4_Ensure_no_'root'_user_account_access_key_exists_(Automated).md (W

Generate Vector Embeddings



In [None]:
# Load the free Hugging Face embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load split sections from Google Drive
sections_dir = "/content/drive/My Drive/Capstone_G3/markdown_sections"
documents = []

# os.listdir() returns entries in arbitrary order; sorting makes the document
# order (and therefore the row order of the saved embeddings) reproducible
# across runs.
for filename in sorted(os.listdir(sections_dir)):
    if filename.endswith(".md"):  # Only process Markdown files
        file_path = os.path.join(sections_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        documents.append(Document(page_content=text, metadata={"source": filename}))

# Split into smaller chunks for embeddings
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)

# Convert text chunks to embeddings; row i of the result is the vector for docs[i]
text_chunks = [doc.page_content for doc in docs]  # Extract raw text
embeddings = embedding_model.encode(text_chunks, convert_to_numpy=True)  # Free embeddings

# encode(..., convert_to_numpy=True) is requested above, so asarray() avoids a
# second copy while still guaranteeing an ndarray for the code below.
embeddings_array = np.asarray(embeddings)

print(f"Successfully generated {len(embeddings_array)} vector embeddings")
print(f"Embedding Shape: {embeddings_array.shape}")  # (#chunks, embedding_dim)

# Save embeddings as a NumPy file (optional)
np.save("/content/drive/My Drive/Capstone_G3/embeddings.npy", embeddings_array)
print("Embeddings saved to Google Drive!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Successfully generated 773 vector embeddings
Embedding Shape: (773, 384)
Embeddings saved to Google Drive!


Store Vector Embeddings in Pinecone Vector Database

In [None]:
# Read the Pinecone API key from the environment, or prompt for it, instead of
# hard-coding it in the notebook.
# NOTE(review): a key was previously committed in this cell; it should be
# rotated in the Pinecone console.
from getpass import getpass

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Initialize Pinecone Client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define Index Name
index_name = "cis-aws-benchmark"

# 🔹 Create the index only when it is missing, so re-running this cell does not
# fail on an index that already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Must match embedding model output size
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"  # Region for free version
        )
    )

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(2)

# Connect to the index
index = pc.Index(index_name)

# Load stored embeddings
embeddings_path = "/content/drive/My Drive/Capstone_G3/embeddings.npy"
embeddings_array = np.load(embeddings_path)

# Section file names are listed only for the summary printed below
sections_dir = "/content/drive/My Drive/Capstone_G3/markdown_sections"
section_files = [f for f in os.listdir(sections_dir) if f.endswith(".md")]

print(f"🔹 Number of Markdown files: {len(section_files)}")
print(f"🔹 Number of Embeddings: {embeddings_array.shape[0]}")

# Row i of embeddings_array was encoded from docs[i] in the embedding cell, so
# the source of each vector must come from docs[i].metadata. Cycling through the
# directory listing would label most chunks with an unrelated file name.
if len(docs) != embeddings_array.shape[0]:
    raise ValueError(
        f"{embeddings_array.shape[0]} embeddings on disk but {len(docs)} chunks in memory; "
        "re-run the embedding cell so both describe the same chunks."
    )

from itertools import cycle
import math  # used by the batch-upsert code that follows

records = [
    {
        "id": f"chunk_{i}",  # Unique ID per chunk
        "values": vector.tolist(),
        "metadata": {"source": doc.metadata["source"]},  # file the chunk was split from
    }
    for i, (vector, doc) in enumerate(zip(embeddings_array, docs))
]

print(f"Prepared {len(records)} vectors for Pinecone.")

# 🔹 Batch Upsert (1000 vectors per request)
batch_size = 1000  # upper bound on vectors sent in a single upsert call
num_batches = math.ceil(len(records) / batch_size)

# Walk the record list in batch_size-sized windows, numbering batches from 1
for batch_number, start in enumerate(range(0, len(records), batch_size), start=1):
    batch = records[start:start + batch_size]
    index.upsert(vectors=batch, namespace="capstone-g3")
    print(f"✅ Upserted batch {batch_number}/{num_batches} ({len(batch)} vectors)")

print(f"Successfully stored {len(records)} document embeddings in Pinecone")


# Pause so the upserted vectors have time to become queryable
time.sleep(10)

In [None]:
# Embedding model used to encode the query (same model name as for the chunks)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Example question to look up
query_text = "How to enable MFA for AWS root user?"

# encode() receives a one-element list, so the query vector is row 0 of the result
query_embedding = embedding_model.encode([query_text])
query_vector = query_embedding[0].tolist()

# Ask Pinecone for the three nearest chunks, returning metadata but not vectors
results = index.query(
    namespace="capstone-g3",
    vector=query_vector,
    top_k=3,
    include_values=False,
    include_metadata=True
)

# Show the source file and score of each hit
print("\n🔹 Top Matching Sections from Pinecone:")
for hit in results["matches"]:
    source_name = hit['metadata']['source']
    print(f"\n📄 Section: {source_name}")
    print(f"🔹 Similarity Score: {hit['score']}")

🔹 Number of Markdown files: 80
🔹 Number of Embeddings: 773
Prepared 773 vectors for Pinecone.
✅ Upserted batch 1/1 (773 vectors)
Successfully stored 773 document embeddings in Pinecone

🔹 Top Matching Sections from Pinecone:


Store Vector Embeddings in Neo4j Graph Database

In [None]:
from getpass import getpass

from neo4j import GraphDatabase

# Neo4j AuraDB credentials are read from the environment (or prompted for)
# instead of being hard-coded in the notebook.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated in the Aura console.
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
URI = os.environ.get("NEO4J_URI", "neo4j+s://1817bb70.databases.neo4j.io")

# Establish Connection
driver = GraphDatabase.driver(URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# verify_connectivity() is a driver-level call; no session is needed for it
driver.verify_connectivity()
print("✅ Successfully connected to Neo4j AuraDB")


✅ Successfully connected to Neo4j AuraDB


In [None]:
from scipy.spatial.distance import cosine
import numpy as np
import os

# Function to Store Vector Embeddings & Create Relationships in Neo4j
def store_embeddings_with_relationships(driver, embeddings_array, section_files, similarity_threshold=0.8):
    """Store one ``Chunk`` node per (embedding, source name) pair and link similar chunks.

    Node ``i`` gets id ``chunk_<i>``; its ``text`` and ``source`` properties are
    both set to the matching entry of ``section_files`` (no chunk text is passed
    to this function), and ``vector`` holds the embedding as a list of floats.
    Every pair of stored chunks whose cosine similarity is at least
    ``similarity_threshold`` is connected with a ``SIMILAR_TO`` relationship
    carrying the score.

    Args:
        driver: Object exposing ``session()`` that returns a context manager
            with a ``run(query, parameters)`` method.
        embeddings_array: Sequence of equal-length numeric vectors.
        section_files: Source name for each vector, in the same order. The two
            inputs are paired with ``zip``, so extra items on the longer side
            are ignored; a warning is printed when the lengths differ.
        similarity_threshold: Minimum cosine similarity for a relationship.

    Returns:
        None. Progress is reported with ``print``.
    """
    expected = len(embeddings_array)
    if expected != len(section_files):
        # Surface the truncation instead of silently storing fewer nodes
        print(
            f"⚠️ {expected} embeddings but {len(section_files)} source names; "
            f"only the first {min(expected, len(section_files))} pairs will be stored."
        )

    with driver.session() as session:
        chunk_ids = []
        vectors = []

        for i, (embedding, file_name) in enumerate(zip(embeddings_array, section_files)):
            vector_data = [float(x) for x in embedding]  # Ensure proper float array

            query = """
            MERGE (c:Chunk {id: $id})
            SET c.text = $text, c.source = $source, c.vector = $vector
            RETURN c.id AS chunk_id
            """

            result = session.run(query, {
                "id": f"chunk_{i}",
                "text": file_name,
                "source": file_name,
                "vector": vector_data
            })

            chunk_ids.append(result.single()["chunk_id"])
            vectors.append(vector_data)

        # 🔹 Create Relationships Between Similar Chunks
        if len(chunk_ids) > 1:
            # All pairwise cosine similarities in one matrix product instead of
            # one distance call per pair. This holds an n x n matrix in memory.
            matrix = np.asarray(vectors, dtype=float)
            norms = np.linalg.norm(matrix, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # all-zero vectors get similarity 0 rather than NaN
            unit = matrix / norms
            similarity = unit @ unit.T

            # Upper triangle (i < j) only, so each pair is considered once
            rows, cols = np.triu_indices(len(chunk_ids), k=1)
            scores = similarity[rows, cols]

            for k in np.flatnonzero(scores >= similarity_threshold):
                session.run("""
                        MATCH (c1:Chunk {id: $id1}), (c2:Chunk {id: $id2})
                        MERGE (c1)-[:SIMILAR_TO {score: $score}]->(c2)
                        """, {
                    "id1": chunk_ids[rows[k]],
                    "id2": chunk_ids[cols[k]],
                    "score": float(scores[k]),
                })

        # Report the number of nodes written, which can be smaller than
        # len(embeddings_array) when the inputs differ in length.
        print(f"✅ Successfully stored {len(chunk_ids)} vector embeddings & created relationships in Neo4j.")

# Load embeddings from file
embeddings_path = "/content/drive/My Drive/Capstone_G3/embeddings.npy"
embeddings_array = np.load(embeddings_path)

# Section file names are listed only for the summary printed below
sections_dir = "/content/drive/My Drive/Capstone_G3/markdown_sections"
section_files = [f for f in os.listdir(sections_dir) if f.endswith(".md")]

print(f"🔹 Number of Markdown files: {len(section_files)}")
print(f"🔹 Number of Embeddings: {embeddings_array.shape[0]}")

# store_embeddings_with_relationships pairs embeddings with names via zip(), so
# it needs exactly one name per embedding. Passing the directory listing (one
# name per file) would store only as many chunks as there are files and label
# them with unrelated names. Row i of embeddings_array was encoded from docs[i],
# so take each chunk's source from there.
if len(docs) != embeddings_array.shape[0]:
    raise ValueError(
        f"{embeddings_array.shape[0]} embeddings on disk but {len(docs)} chunks in memory; "
        "re-run the embedding cell so both describe the same chunks."
    )
chunk_sources = [doc.metadata["source"] for doc in docs]

# Store embeddings in Neo4j **WITH RELATIONSHIPS**
store_embeddings_with_relationships(driver, embeddings_array, chunk_sources)

# Ensure Neo4j Vector Index Exists
def create_vector_index(driver):
    """Create the ``vector_index`` vector index on ``Chunk.vector`` if it is missing.

    The index is configured for 384-dimensional vectors with cosine similarity.
    ``IF NOT EXISTS`` makes the statement a no-op when the index is already
    present, so the cell can be re-run without raising.

    Args:
        driver: Object exposing ``session()`` that returns a context manager
            with a ``run(query)`` method.
    """
    query = """
    CREATE VECTOR INDEX vector_index IF NOT EXISTS FOR (c:Chunk)
    ON (c.vector) OPTIONS {indexConfig: {`vector.dimensions`: 384, `vector.similarity_function`: "cosine"}}
    """
    with driver.session() as session:
        session.run(query)
        print("✅ Vector index created in Neo4j")

# Ensure vector index exists (runs the CREATE VECTOR INDEX statement defined in
# create_vector_index against the connected driver)
create_vector_index(driver)

🔹 Number of Markdown files: 80
🔹 Number of Embeddings: 773
✅ Successfully stored 773 vector embeddings & created relationships in Neo4j.
✅ Vector index created in Neo4j


In [None]:
# Vector similarity search in Neo4j, enriched with SIMILAR_TO neighbours
def search_neo4j_with_relationships(driver, query_vector, top_k=3):
    """Query the ``vector_index`` index and return the closest chunks.

    Args:
        driver: Object whose ``session()`` yields a context manager with ``run``.
        query_vector: Embedding of the search query.
        top_k: Number of nearest nodes requested, also used as the row limit.

    Returns:
        Whatever ``.data()`` of the query result returns; each row carries the
        ``Section``, ``score`` and ``RelatedSections`` columns named in the query.
    """
    cypher = """
    CALL db.index.vector.queryNodes('vector_index', $top_k, $query_vector)
    YIELD node, score
    OPTIONAL MATCH (node)-[r:SIMILAR_TO]->(related)
    RETURN node.source AS Section, score, collect(related.source) AS RelatedSections
    ORDER BY score DESC
    LIMIT $top_k
    """
    params = {"query_vector": query_vector, "top_k": top_k}

    with driver.session() as session:
        # Materialise the rows while the session is still open
        rows = session.run(cypher, **params).data()
    return rows

# Encode an example question and look it up in Neo4j
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
query_text = "How to enable MFA for AWS root user?"

# encode() receives a one-element list; take its first row as a plain list
encoded_queries = embedding_model.encode([query_text])
query_embedding = encoded_queries.tolist()[0]

# Similarity search through the vector index, with related sections attached
results = search_neo4j_with_relationships(driver, query_embedding)

# Print one block per hit: source, score (4 decimals) and related sources
print("\n🔹 Top Matching Sections from Neo4j:")
for hit in results:
    print(f"\n📄 Section: {hit['Section']}")
    print(f"🔹 Similarity Score: {hit['score']:.4f}")
    print(f"🔗 Related Sections: {hit['RelatedSections']}")

# Release the driver's resources now that the Neo4j work is done
driver.close()


🔹 Top Matching Sections from Neo4j:

📄 Section: 8.1_Establish_and_Maintain_an_Audit_Log_Management.md
🔹 Similarity Score: 0.9273
🔗 Related Sections: ['5.7_Ensure_that_the_EC2_Metadata_Service_only_allows_IMDSv2.md']

📄 Section: 3.11_Encrypt_Sensitive_Data_at_Rest.md
🔹 Similarity Score: 0.9249
🔗 Related Sections: ['8.1_Establish_and_Maintain_an_Audit_Log_Management.md', '5.7_Ensure_that_the_EC2_Metadata_Service_only_allows_IMDSv2.md']

📄 Section: 2.2.1_Ensure_that_encryption-at-rest_is_enabled_for_RDS_instances.md
🔹 Similarity Score: 0.9090
🔗 Related Sections: ['1.1_Establish_and_Maintain_Detailed_Enterprise_Asset.md', '5.6_Ensure_routing_tables_for_VPC_peering_are_"least_access".md']


AWS Asset Detection

In [None]:
import boto3

# Ask STS which identity the current credentials resolve to
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
print(caller_identity)  # Should print your AWS account details

In [None]:
from botocore.exceptions import NoCredentialsError, PartialCredentialsError

def _collect_items(client, operation, result_key):
    """Return every item under ``result_key`` for ``operation``, across all pages.

    Uses the client's paginator when the operation supports one; otherwise the
    operation is called once.
    """
    if client.can_paginate(operation):
        items = []
        for page in client.get_paginator(operation).paginate():
            items.extend(page.get(result_key, []))
        return items
    return getattr(client, operation)().get(result_key, [])


def list_aws_assets():
    """Inventory selected AWS resources visible to the default boto3 session.

    Results are paginated where the API supports it, so accounts with more
    resources than fit in one response page are listed completely.

    Returns:
        dict mapping a category label (e.g. "IAM Users", "S3 Buckets") to a
        list of resource names/IDs. An empty dict is returned, after printing a
        message, when credentials are missing or incomplete. Other errors
        propagate to the caller.
    """
    try:
        # Initialize a session
        session = boto3.Session()
        detected_assets = {}

        # AWS IAM
        iam_client = session.client("iam")
        detected_assets["IAM Users"] = [
            user["UserName"] for user in _collect_items(iam_client, "list_users", "Users")
        ]
        detected_assets["IAM Roles"] = [
            role["RoleName"] for role in _collect_items(iam_client, "list_roles", "Roles")
        ]

        # IAM Access Analyzer
        analyzer_client = session.client("accessanalyzer")
        detected_assets["IAM Access Analyzers"] = [
            a["name"] for a in _collect_items(analyzer_client, "list_analyzers", "analyzers")
        ]

        # AWS Config
        config_client = session.client("config")
        detected_assets["AWS Config Rules"] = [
            rule["ConfigRuleName"]
            for rule in _collect_items(config_client, "describe_config_rules", "ConfigRules")
        ]

        # AWS CloudTrail
        cloudtrail_client = session.client("cloudtrail")
        detected_assets["CloudTrail Trails"] = [
            trail["Name"] for trail in _collect_items(cloudtrail_client, "describe_trails", "trailList")
        ]

        # AWS CloudWatch
        cloudwatch_client = session.client("cloudwatch")
        detected_assets["CloudWatch Alarms"] = [
            alarm["AlarmName"]
            for alarm in _collect_items(cloudwatch_client, "describe_alarms", "MetricAlarms")
        ]

        # AWS Simple Notification Service (SNS)
        sns_client = session.client("sns")
        detected_assets["SNS Topics"] = [
            t["TopicArn"] for t in _collect_items(sns_client, "list_topics", "Topics")
        ]

        # AWS Simple Storage Service (S3)
        s3_client = session.client("s3")
        detected_assets["S3 Buckets"] = [
            b["Name"] for b in _collect_items(s3_client, "list_buckets", "Buckets")
        ]

        # Elastic Compute Cloud (EC2) - one client serves both instances and VPCs
        ec2_client = session.client("ec2")
        detected_assets["EC2 Instances"] = [
            i["InstanceId"]
            for r in _collect_items(ec2_client, "describe_instances", "Reservations")
            for i in r.get("Instances", [])
        ]

        # Relational Database Service (RDS)
        rds_client = session.client("rds")
        detected_assets["RDS Instances"] = [
            db["DBInstanceIdentifier"]
            for db in _collect_items(rds_client, "describe_db_instances", "DBInstances")
        ]

        # AWS VPC
        detected_assets["VPCs"] = [
            vpc["VpcId"] for vpc in _collect_items(ec2_client, "describe_vpcs", "Vpcs")
        ]

        return detected_assets

    except (NoCredentialsError, PartialCredentialsError):
        print("AWS credentials not provided or are incorrect.")
        return {}


In [None]:
# Run the inventory; as the cell's last expression, the returned dict is displayed.
list_aws_assets()