# Install dependencies

In [1]:
import os
from dotenv import load_dotenv

In [2]:
# google gemini api key
google_api_key= os.getenv("GOOGLE_API_KEY")

In [3]:
if google_api_key== "":
    print("Api key not found")
else:
    print("Api key found")

Api key found


In [4]:
from IPython.display import Markdown, display # for decoration
# from llama
from llama_index.core import SimpleDirectoryReader # to directory reader to read data
from llama_index.core import VectorStoreIndex # to store vectors
from llama_index.core import ServiceContext # for keeping model, embedding together for info retrieval
from llama_index.core import StorageContext, load_index_from_storage # to store, load embeddings
from llama_index.embeddings.gemini import GeminiEmbedding # convert text data into Embedding
from llama_index.llms.gemini import Gemini # model
# from google
import google.generativeai as genai # models



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# first configure/set Google_API_KEY env key as API key
genai.configure(api_key= google_api_key)

In [6]:
# check if your API key is working or not by getting list of genai models
for models in genai.list_models():
    print(models)

Model(name='models/chat-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 Chat (Legacy)',
      description='A legacy text-only model optimized for chat conversations',
      input_token_limit=4096,
      output_token_limit=1024,
      supported_generation_methods=['generateMessage', 'countMessageTokens'],
      temperature=0.25,
      top_p=0.95,
      top_k=40)
Model(name='models/text-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 (Legacy)',
      description='A legacy model that understands text and generates text as an output',
      input_token_limit=8196,
      output_token_limit=1024,
      supported_generation_methods=['generateText', 'countTextTokens', 'createTunedTextModel'],
      temperature=0.7,
      top_p=0.95,
      top_k=40)
Model(name='models/embedding-gecko-001',
      base_model_id='',
      version='001',
      display_name='Embedding Gecko',
      description='Obtain a distributed representatio

In [7]:
for models in genai.list_models():
    if "generateContent" in models.supported_generation_methods:
        print(models.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


# 1. Data Ingestion

-- I can do question, answering directly with llms 
-- Here, we r loading data (user provided data i.e can be database or source); use RAG sys --> Llamaindex to retrieve data 

load_data --> for loading all files from directory

In [8]:
# create data loader obj
reader= SimpleDirectoryReader(input_dir= "../data", required_exts=[".txt"])
# load method -> 
documents= reader.load_data()
print(f"Loaded {len(documents)} docs")
print(documents)

Loaded 3 docs
[Document(id_='c3f0e99c-4b11-46ad-8242-96575620691c', embedding=None, metadata={'file_path': 'c:\\Users\\anjik\\Desktop\\MLOPs_projects\\QA_System\\data\\AI.txt', 'file_name': 'AI.txt', 'file_type': 'text/plain', 'file_size': 17688, 'creation_date': '2024-04-28', 'last_modified_date': '2024-04-28'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="artificial intelligence\r\nArtificial intelligence (AI), the ability of a digital computer or computer-controlled robot to perform tasks commonly associated with intelligent beings. The term is frequently applied to the project of developing systems endowed with the intellectual processes characteristic of humans, such as the ability to reason, discover meaning, generalize, or learn from past exper

# 2. Embedding

In [9]:
# load embedding model --> Gemini model --> i/p text; o/p text
gemini_embed_model= GeminiEmbedding(model_name= "models/embedding-001")

# load llm Gemini model using ----> llama_index.llms.gemini or google.generative as genai also
llm_model= Gemini(api_key= google_api_key, model_name="models/gemini-pro")

# configure ServiceContext (class is a container --> objects for configuring every index, query i.e llm model, embedding model, chunnk size etc) 
service_context= ServiceContext.from_defaults(llm= llm_model,
    embed_model= gemini_embed_model,
    chunk_size=800,
    chunk_overlap=20)

service_context

  service_context= ServiceContext.from_defaults(llm= llm_model,


ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=32768, num_output=2048, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=GeminiEmbedding(model_name='models/embedding-001', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x00000224B0926280>, title=None, task_type='retrieval_document'), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x00000224B0926280>, id_func=<function default_id_func at 0x00000224AC046940>, chunk_size=800, chunk_overlap=20, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')], llama_logger=<llama_index.core.service_context_elements.llama_logger.LlamaLogger object at 0x00000224B0819EE0>, callback_

# 3. Indexing

In [10]:
# create vector index
index= VectorStoreIndex.from_documents(documents= documents, service_context=service_context)
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x224b09b5fa0>

In [11]:
# store the vector embedding index to local database (storage inside Experiments dir---> have all info, metainfo)
index.storage_context.persist()

# 4. Querying

In [17]:
query_engine= index.as_query_engine()
response=query_engine.query("What is machine learning?")
print(response.response)

Machine learning is a branch of artificial intelligence (AI) and computer science which focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy.


In [13]:
query_engine= index.as_query_engine()
response= query_engine.query("Give me history of Artificial intelligence")
print(response.response)

The development of the digital computer in the 1940s demonstrated that computers could be programmed to carry out very complex tasks with great proficiency. However, despite continuing advances in computer processing speed and memory capacity, there are as yet no programs that can match full human flexibility over wider domains or in tasks requiring much everyday knowledge. On the other hand, some programs have attained the performance levels of human experts and professionals in performing certain specific tasks, so that artificial intelligence in this limited sense is found in applications as diverse as medical diagnosis, computer search engines, voice or handwriting recognition, and chatbots.


In [14]:
query_engine= index.as_query_engine()
response= query_engine.query("What is machine learning, artificial intelligence?")
print(response.response)

Machine learning is a branch of artificial intelligence (AI) and computer science which focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy.


In [18]:
query_engine= index.as_query_engine()
response= query_engine.query("Difference between supervised and unsupervised machine learning in tablular farmat")
print(response.response)

| Supervised Machine Learning | Unsupervised Machine Learning |
|---|---|
| Uses labeled datasets | Uses unlabeled datasets |
| Algorithms are trained to classify data or predict outcomes | Algorithms discover hidden patterns or data groupings |
| Requires human intervention to label data | Does not require human intervention to label data |
| Examples: neural networks, naïve bayes, linear regression, logistic regression, random forest, support vector machine (SVM) | Examples: neural networks, k-means clustering, probabilistic clustering methods |


In [15]:
query_engine= index.as_query_engine()
response= query_engine.query("What are mlflow components")
print(response.response)

MLflow Tracking, Model Registry, Evaluate, Prompt Engineering UI, Recipes, Projects


In [16]:
query_engine= index.as_query_engine()
response= query_engine.query("Who is Malleswari Gelli")
print(response.response)

The provided context does not mention Malleswari Gelli, so I cannot answer this question from the provided context.
