# Testing Vertex AI LLMs and embeddings for RAG systems

In [None]:
# Libraries for document indexing, Google Cloud authentication and the Vertex AI LLM
from google.oauth2 import service_account
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.vertex import Vertex

# Service account key file -- replace with the path to your own key
filename = "key.json"
credentials = service_account.Credentials.from_service_account_file(filename)

# Vertex AI model used as the LLM; the project id is taken from the key file
llm = Vertex(
    model="gemini-pro",  # e.g. "text-bison" or "gemini-pro"
    project=credentials.project_id,
    credentials=credentials,
    temperature=0.0,  # adjust as needed
    additional_kwargs={},
)

In [None]:
# Read the Google API key from the environment (.env is loaded first) and
# register a Gemini embedding model as the llama-index default.
import os

from dotenv import load_dotenv
from llama_index.core import Settings
from llama_index.embeddings.gemini import GeminiEmbedding

load_dotenv()
goog_api_key = os.getenv("GOOGLE_API_KEY")

# Same model id that initialize_llm_and_storage() uses further down, so both
# indexes in this notebook are embedded with the same model.
model_name = "models/text-embedding-004"

# Create the embedding model
embed_model_gemini = GeminiEmbedding(
    model_name=model_name, api_key=goog_api_key, title="this is a document"
)

# Make it the default embedding model for every index built afterwards
Settings.embed_model = embed_model_gemini

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the documents found in ./data
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

documents = SimpleDirectoryReader("./data").load_data()

# Build a vector index over the loaded documents
index = VectorStoreIndex.from_documents(documents, llm=llm)

In [None]:
# Build a query engine on top of the index (using `llm`) and ask a first question
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query(
    "what is the core principle of organic production?"
)
response

Response(response='The core principle of organic production is to avoid the use of artificial substances in the food production process. This means that organic farmers must rely on natural methods for fertilization, pest control, and weed suppression. Organic farming also emphasizes the importance of biodiversity and soil health.', source_nodes=[NodeWithScore(node=TextNode(id_='c346be82-3fb7-4766-9a8e-db4550083f54', embedding=None, metadata={'file_path': '/Users/manishb27/Desktop/KisaanCompanion/data/ogranicFarming.txt', 'file_name': 'ogranicFarming.txt', 'file_type': 'text/plain', 'file_size': 81991, 'creation_date': '2024-12-14', 'last_modified_date': '2024-12-14'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo

In [None]:
# Second question against the same query engine
response = query_engine.query(
    "how do we look at pest problems in organic farming?"
)
response

Retrying llama_index.llms.vertex.utils.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailable: 503 recvmsg:Connection reset by peer.


Response(response='In organic farming, pest problems are viewed differently than in conventional farming. Organic farmers rely on natural methods to control pests, rather than using synthetic pesticides. This means that they need to be more proactive in preventing pests from becoming a problem in the first place.\n\nOne of the most important things that organic farmers can do to prevent pests is to promote biodiversity on their farms. This means planting a variety of crops, as well as flowers and other plants that attract beneficial insects. Beneficial insects, such as ladybugs and lacewings, can help to control pest populations.\n\nOrganic farmers also need to be careful about the types of fertilizers and compost they use. Some fertilizers and compost can contain weed seeds or other pests. It is important to use certified organic fertilizers and compost to avoid introducing pests into your garden.\n\nFinally, organic farmers need to be prepared to take action if pests do become a prob

In [38]:
# Print only the `response` attribute of the last query result
print(response.response)

In organic farming, pest problems are viewed differently than in conventional farming. Organic farmers rely on natural methods to control pests, rather than using synthetic pesticides. This means that they need to be more proactive in preventing pests from becoming a problem in the first place.

One of the most important things that organic farmers can do to prevent pests is to promote biodiversity on their farms. This means planting a variety of crops, as well as flowers and other plants that attract beneficial insects. Beneficial insects, such as ladybugs and lacewings, can help to control pest populations.

Organic farmers also need to be careful about the types of fertilizers and compost they use. Some fertilizers and compost can contain weed seeds or other pests. It is important to use certified organic fertilizers and compost to avoid introducing pests into your garden.

Finally, organic farmers need to be prepared to take action if pests do become a problem. There are a number o

In [None]:
from llama_index.indices.managed.vertexai import VertexAIIndex

# TODO(developer): Replace these values with your project information
project_id = "kisaan-companion-marketplace"
location = "us-central1"

# Display name and description passed along when constructing the index
corpus_display_name = "Oragnic Farming information"
corpus_description = "Vertex AI Corpus for LlamaIndex"

# Create a corpus or provide an existing corpus ID
index = VertexAIIndex(
    project_id,
    location,
    corpus_display_name=corpus_display_name,
    corpus_description=corpus_description,
)

# Report the corpus name exposed by the index
print(f"Newly created corpus name is {index.corpus_name}.")



Newly created corpus name is projects/507105004236/locations/us-central1/ragCorpora/6838716034162098176.


In [None]:
# Upload a local file into the corpus.
# The path is relative to the notebook: the file lives in the same ./data
# folder that SimpleDirectoryReader reads from, not at the filesystem root.
file_name = index.insert_file(
    file_path="./data/ogranicFarming.txt",
    metadata={
        "display_name": "organic_farming_wiki",
        "description": "Wiki Pedia Article on ogranic farming",
    },
)

In [None]:
# Collect project / resource identifiers from the environment and initialise Vertex AI.
# Imported here so this cell also works when run before the big import cell below.
import os

import vertexai

PROJECT_ID = os.getenv("projectID")  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}
DOC_FOLDER = "./data"  # @param {type:"string"}
GCS_BUCKET = os.getenv("bucketName")  # @param {type:"string"}
VS_INDEX_NAME = os.getenv("vs_index_name")  # @param {type:"string"}
# Single assignment: VS_INDEX_ENDPOINT_NAME wins, "endpointName" is the fallback
# when it is unset or empty.
VS_INDEX_ENDPOINT_NAME = os.getenv("VS_INDEX_ENDPOINT_NAME") or os.getenv(
    "endpointName"
)  # @param {type:"string"}

vertexai.init(project=PROJECT_ID, location=REGION)

In [None]:
# Imports
import os
import vertexai
from google.cloud import aiplatform, storage
from langchain import hub
from llama_index.core import (
    Document,
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    SummaryIndex,
    VectorStoreIndex,
)
from llama_index.core.agent import ReActAgent
from llama_index.core.base.base_query_engine import BaseQueryEngine
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.objects import ObjectIndex
from llama_index.core.prompts import LangchainPromptTemplate
from llama_index.core.prompts.base import BasePromptTemplate
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.vertex import VertexTextEmbedding
from llama_index.llms.vertex import Vertex
from llama_index.vector_stores.vertexaivectorsearch import VertexAIVectorStore

In [None]:
# creating the required buckets and indexes

def create_bucket_class_location(bucket_name: str) -> storage.Bucket:
    """
    Return the GCS bucket named ``bucket_name``, creating it when it is missing.

    The buckets listed by the default storage client are scanned for a matching
    name. If none matches, a new bucket is created in ``REGION`` with the
    ``STANDARD`` storage class.
    """
    client = storage.Client()

    # Reuse the bucket when one with this name is already listed
    existing = next(
        (
            candidate
            for candidate in client.list_buckets()
            if candidate.name == bucket_name
        ),
        None,
    )
    if existing is not None:
        print(f"GCS Bucket {bucket_name} exists already in resource.")
        return existing

    # Nothing found: configure a bucket object and create it
    pending = client.bucket(bucket_name)
    pending.storage_class = "STANDARD"
    created = client.create_bucket(pending, location=REGION)

    print(
        f"Created bucket {created.name} in {created.location} with storage class {created.storage_class}"
    )

    return created


def create_vector_search_index(
    index_name: str, index_dimensions: int
) -> aiplatform.MatchingEngineIndex:
    """
    Return the Vector Search index whose display name is ``index_name``.

    When no index with that display name is listed, a tree-AH index with
    ``index_dimensions`` dimensions is created; otherwise the first match is
    loaded and returned.

    NOTE : This operation can take upto 30 minutes
    """
    matches = aiplatform.MatchingEngineIndex.list(
        filter=f"display_name={index_name}"
    )
    existing_names = [match.resource_name for match in matches]

    # An index with this display name already exists: reuse the first one
    if existing_names:
        found = aiplatform.MatchingEngineIndex(index_name=existing_names[0])
        print(
            f"Vector Search index {found.display_name} exists with resource name {found.resource_name}"
        )
        return found

    # No match: create a new index
    print(f"Creating Vector Search index {index_name} ...")
    created = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        dimensions=index_dimensions,
        # distance_measure_type="DOT_PRODUCT_DISTANCE",
        shard_size="SHARD_SIZE_SMALL",
        index_update_method="STREAM_UPDATE",  # allowed values BATCH_UPDATE , STREAM_UPDATE,
        approximate_neighbors_count=5,
    )
    print(
        f"Vector Search index {created.display_name} created with resource name {created.resource_name}"
    )
    return created


# Create (or look up) the Vector Search index endpoint
def create_vector_search_endpoint(
    endpoint_name: str,
) -> aiplatform.MatchingEngineIndexEndpoint:
    """
    Return the Vector Search index endpoint whose display name is ``endpoint_name``.

    When no endpoint with that display name is listed, a new one is created with
    the public endpoint enabled; otherwise the first match is loaded and returned.
    """
    matches = aiplatform.MatchingEngineIndexEndpoint.list(
        filter=f"display_name={endpoint_name}"
    )
    existing_names = [match.resource_name for match in matches]

    # An endpoint with this display name already exists: reuse the first one
    if existing_names:
        found = aiplatform.MatchingEngineIndexEndpoint(
            index_endpoint_name=existing_names[0]
        )
        print(
            f"Vector Search index endpoint {found.display_name} exists with resource name {found.resource_name}"
        )
        return found

    # No match: create a new endpoint
    print(f"Creating Vector Search index endpoint {endpoint_name} ...")
    created = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=endpoint_name, public_endpoint_enabled=True
    )
    print(
        f"Vector Search index endpoint {created.display_name} created with resource name {created.resource_name}"
    )
    return created


# Deploy the Vector Search endpoint

def deploy_vector_search_endpoint(
    vs_index: aiplatform.MatchingEngineIndex,
    vs_endpoint: aiplatform.MatchingEngineIndexEndpoint,
    index_name: str,
) -> aiplatform.MatchingEngineIndexEndpoint:
    """
    Make sure ``vs_index`` is deployed and return the endpoint object.

    If the index reports no deployments it is deployed to ``vs_endpoint``, with
    ``index_name`` used as both the deployed index id and the display name.
    Otherwise the endpoint of the first existing deployment is loaded and
    returned, and nothing new is deployed.
    """
    # (endpoint resource name, deployed index id) for every existing deployment
    deployments = [
        (deployed.index_endpoint, deployed.deployed_index_id)
        for deployed in vs_index.deployed_indexes
    ]

    # Already deployed somewhere: return the endpoint of the first deployment
    if deployments:
        endpoint_resource = deployments[0][0]
        current = aiplatform.MatchingEngineIndexEndpoint(
            index_endpoint_name=endpoint_resource
        )
        print(
            f"Vector Search index {vs_index.display_name} is already deployed at endpoint {current.display_name}"
        )
        return current

    # Not deployed yet: deploy on a single fixed-size replica
    print(
        f"Deploying Vector Search index {vs_index.display_name} at endpoint {vs_endpoint.display_name} ..."
    )
    deployed_endpoint = vs_endpoint.deploy_index(
        index=vs_index,
        deployed_index_id=index_name,
        display_name=index_name,
        machine_type="e2-standard-16",
        min_replica_count=1,
        max_replica_count=1,
    )
    print(
        f"Vector Search index {vs_index.display_name} is deployed at endpoint {deployed_endpoint.display_name}"
    )
    return deployed_endpoint

In [None]:
# Set up the environment
def setup():
    """
    Create or reuse every cloud resource the Vector Search workflow needs.

    Initialises the aiplatform SDK for ``PROJECT_ID`` / ``REGION``, then makes
    sure the GCS bucket, the Vector Search index, the index endpoint and the
    index deployment exist, using the notebook-level ``GCS_BUCKET``,
    ``VS_INDEX_NAME`` and ``VS_INDEX_ENDPOINT_NAME`` settings.

    Returns:
        tuple: ``(bucket, vs_index, vs_endpoint, vs_deployed_index)`` as returned
        by the four helper functions, in that order.
    """
    # Dimensionality of the index; it has to match the size of the vectors the
    # embedding model produces (768 for the gecko text embeddings).
    VS_DIMENSIONS = 768

    aiplatform.init(project=PROJECT_ID, location=REGION)

    new_bucket = create_bucket_class_location(GCS_BUCKET)
    vs_index = create_vector_search_index(VS_INDEX_NAME, VS_DIMENSIONS)
    vs_endpoint = create_vector_search_endpoint(VS_INDEX_ENDPOINT_NAME)
    vs_deployed_index = deploy_vector_search_endpoint(
        vs_index, vs_endpoint, VS_INDEX_NAME
    )

    return new_bucket, vs_index, vs_endpoint, vs_deployed_index

In [None]:
# Point Application Default Credentials at the service-account key.
# Resolved against the working directory so it names the same "key.json" that
# the first cell loads, rather than a file at the filesystem root.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath("key.json")

In [None]:
# Set up the environment and initialize the LLM and storage
def initialize_llm_and_storage(
    vs_index: aiplatform.MatchingEngineIndex,
    vs_endpoint: aiplatform.MatchingEngineIndexEndpoint,
) -> StorageContext:
    """
    Build a storage context on Vertex AI Vector Search and configure global models.

    A ``VertexAIVectorStore`` targeting ``vs_index`` / ``vs_endpoint`` is wrapped
    in a ``StorageContext``. Afterwards ``Settings.llm`` is set to a Vertex
    "gemini-pro" model and ``Settings.embed_model`` to a ``GeminiEmbedding``
    using "models/text-embedding-004".

    Reads the notebook-level names ``PROJECT_ID``, ``REGION``, ``GCS_BUCKET``,
    ``credentials`` and ``goog_api_key``.

    Returns:
        StorageContext: storage context backed by the Vertex AI vector store.
    """
    # Storage context backed by the Vector Search index and its endpoint
    context = StorageContext.from_defaults(
        vector_store=VertexAIVectorStore(
            project_id=PROJECT_ID,
            region=REGION,
            index_id=vs_index.resource_name,
            endpoint_id=vs_endpoint.resource_name,
            gcs_bucket_name=GCS_BUCKET,
        )
    )

    # Default LLM for everything llama-index builds from here on
    Settings.llm = Vertex(
        model="gemini-pro",  # e.g. "text-bison" or "gemini-pro"
        project=credentials.project_id,
        credentials=credentials,
        temperature=0.0,  # adjust as needed
        additional_kwargs={},
    )

    # Default embedding model
    Settings.embed_model = GeminiEmbedding(
        model_name="models/text-embedding-004", api_key=goog_api_key
    )

    return context

In [None]:
# Prepare the cloud resources, configure models and storage, then load the local documents
data_folder = "data"
bucket, vs_index, vs_endpoint, deployed_endpoint = setup()
storage_context = initialize_llm_and_storage(vs_index, vs_endpoint)
docs = SimpleDirectoryReader(data_folder).load_data()

In [None]:
# Index the loaded documents through the prepared storage context
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)

In [None]:
# Query the index with the default query engine and print the answer
query_engine = index.as_query_engine()
response = query_engine.query(
    "What is the core principle of organic production?"
)
print(response)

In [33]:
# Build the index from the loaded documents using the prepared storage context
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)

Upserting datapoints MatchingEngineIndex index: projects/507105004236/locations/us-central1/indexes/8892458619312799744
MatchingEngineIndex index Upserted datapoints. Resource name: projects/507105004236/locations/us-central1/indexes/8892458619312799744


In [34]:
# Query the freshly built index and print the answer
query_engine = index.as_query_engine()
response = query_engine.query(
    "What is the core principle of organic production?"
)
print(response)

The core principle of organic production is to use naturally occurring, non-synthetic inputs such as compost manure, green manure, and bone meal. Organic farming also emphasizes techniques such as crop rotation, companion planting, and mixed cropping. Biological pest control methods such as the fostering of insect predators are also encouraged.
