In [1]:
import requests
from bs4 import BeautifulSoup
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_ollama import OllamaLLM

In [2]:
BASE_URL = "https://cloud.google.com"
DOCS_URL = "https://cloud.google.com/deployment-manager/docs/apis"

In [8]:
def get_links(url):
    """ 
    Extract relevant API documentation links from the main Google Cloud API page.
    
    - Fetches the page content
    - Extracts all relevant hyperlinks
    - Returns a list of API documentation URLs    
    """

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    links = set()

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.startswith("/deployment-manager/docs/reference/v2beta"):
            links.add(BASE_URL + href)
        elif href.startswith("http"):
            links.add(href)

    return list(links)



In [4]:
def get_page_content(url):
    """
    Extracts the main text content from a given documentation page.
    
    - Fetches page data
    - Extracts relevant text (headings, paragraphs, code blocks)
    - Returns cleaned text
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

    soup = BeautifulSoup(response.text, "html.parser")
    content = []

    for tag in soup.find_all(["h1", "h2", "h3", "p", "pre", "code"]):
        content.append(tag.get_text(strip=True))

    return "\n".join(content)

In [None]:
#  Step 1: Get all API documentation links
doc_links = get_links(DOCS_URL)

#  Step 2: Crawl each page and extract content
api_docs = {}
for link in doc_links:
    print(f"Fetching: {link}")
    api_docs[link] = get_page_content(link)

# Step 3: Save extracted data to a file
doc_file = "deployment_manager_docs.txt"
with open(doc_file, "w", encoding="utf-8") as f:
    for url, content in api_docs.items():
        f.write(f"### URL: {url}\n{content}\n\n")

# Step 4: Load extracted text
with open(doc_file, "r", encoding="utf-8") as f:
    raw_text = f.read()

Fetching: https://cloud.google.com/docs/tech-area-overviews
Fetching: https://cloud.google.com/docs
Fetching: https://cloud.google.com/deployment-manager/docs
Fetching: https://cloud.google.com/deployment-manager/docs/reference/v2beta/compositeTypes/update
Fetching: https://cloud.google.com/docs/iac
Fetching: https://cloud.google.com/deployment-manager/docs/reference/v2beta/manifests/get
Fetching: https://cloud.google.com/deployment-manager/docs/manage-cloud-resources-deployment
Fetching: https://cloud.google.com/deployment-manager/docs/reference/v2beta/compositeTypes/insert
Fetching: https://creativecommons.org/licenses/by/4.0/
Error fetching https://creativecommons.org/licenses/by/4.0/: HTTPSConnectionPool(host='creativecommons.org', port=443): Max retries exceeded with url: /licenses/by/4.0/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1018)')))
Fetching: https://clou

In [10]:
# Step 5: Split text into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(raw_text)

# Step 6: Store embeddings in ChromaDB
embedding_function = HuggingFaceEmbeddings(model_name="local_model")
vectorstore_path = "./gcp_deployment_vectorstore"

vectorstore = Chroma.from_texts(chunks, embedding=embedding_function, persist_directory=vectorstore_path)
vectorstore.persist()

# Step 7: Load vector database
retriever = vectorstore.as_retriever()

  embedding_function = HuggingFaceEmbeddings(model_name="local_model")
  from .autonotebook import tqdm as notebook_tqdm
  vectorstore.persist()


In [13]:
# Step 8: Load Local Model
llm = OllamaLLM(model="llama3.2")

# Step 9: Create the RAG Chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

# Step 10: Query the RAG system
query = "what is the https method for this 'https://www.googleapis.com/deploymentmanager/v2beta/projects/project/global/deployments/deployment/manifests/manifest' and list down all parameter for it"
response = qa_chain.invoke(query)

print("\n AI Response:\n", response["result"])


 AI Response:
 Based on the provided context, I can tell you that the URL you're referring to is a Google Cloud Deployment Manager API endpoint.

The HTTP method for this endpoint is GET, not HTTPS. The full URL is:

https://www.googleapis.com/deploymentmanager/v2beta/projects/project/global/deployments/deployment/manifests/manifest

As for the parameters, according to the documentation, there are no query parameters listed in the provided context. However, I would recommend checking the official API documentation for more information on available parameters.

Here is a sample of what the GET request might look like:

```
GET https://www.googleapis.com/deploymentmanager/v2beta/projects/project/global/deployments/deployment/manifests/manifest
```

Please note that without more information or the full list of methods, I can't provide an exhaustive list of parameters.
