# Step 1: Set up the environment

In [1]:
import os
import google.generativeai as genai

if os.getenv("COLAB_RELEASE_TAG"):
   COLAB = True
   print("Running on COLAB environment.")
else:
   COLAB = False
   print("WARNING: Running on LOCAL environment.")


Running on COLAB environment.


In [2]:
# Clone the data repository into colab
!git clone https://github.com/openknowledge/workshop-genai-data.git
PROCESSED_DATA_PATH = "/content/workshop-genai-data/processed/gutenberg/"
EVALUATION_DATA_PATH = "/content/workshop-genai-data/evaluation/"

fatal: destination path 'workshop-genai-data' already exists and is not an empty directory.


In [3]:
# import colab specific lib to read user data (aka colab managed secrets)
from google.colab import userdata

In [4]:
# Initialize Google GenAI Client API with GOOGLE_API_KEY to be able to call the model.
# Note: GEMINI_API_KEY must be set as COLAB userdata before!
GOOGLE_API_KEY=userdata.get('GEMINI_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [5]:
# Install additional libraries
%%capture
!pip install -qU langchain-text-splitters
!pip install chromadb

In [6]:
# Import additional libraries
from langchain_text_splitters import RecursiveCharacterTextSplitter
from chromadb import EphemeralClient
import requests
import re
import uuid
import json
import typing_extensions as typing
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import pandas as pd
from pathlib import Path
import time


In [7]:
# Configure pandas display options
pd.set_option("max_colwidth", None)

In [8]:
# Set default values for model, model parameters and prompt
DEFAULT_MODEL = "gemini-1.5-flash"
DEFAULT_CONFIG_TEMPERATURE = 0.0
DEFAULT_CONFIG_TOP_K = 1
DEFAULT_CONFIG_MAX_OUTPUT_TOKENS = 200
DEFAULT_SYSTEM_PROMPT = "Your are a friendly assistant"
DEFAULT_USER_PROMPT = " "

# Set defaults for retrieval
DEFAULT_K = 3
DEFAULT_CHUNK_OVERLAP = 100
DEFAULT_CHUNK_SIZE = 2000

## Define helper functions

In [9]:
# This will be the chromadb collection we use as a knowledge base. We do not need the in-memory EphemeralClient.
chromadb_client = EphemeralClient()
chromadb_collection = chromadb_client.create_collection(name="default")


In [10]:
def call_genai_model_for_completion(
        model_name: str = DEFAULT_MODEL,
        config_temperature:float = DEFAULT_CONFIG_TEMPERATURE,
        config_top_k: int = DEFAULT_CONFIG_TOP_K,
        config_max_output_tokens: int = DEFAULT_CONFIG_MAX_OUTPUT_TOKENS,
        system_prompt : str = DEFAULT_SYSTEM_PROMPT,
        user_prompt : str = DEFAULT_USER_PROMPT,
        verbose: bool = False
        ):

    if verbose:
        # print out summary of input values / parameters
        print(f'Generating answer for following config:')
        print(f'  - SYSTEM PROMPT used:\n {system_prompt}')
        print(f'  - USER PROMPT used:\n {user_prompt}')
        print(f'  - MODEL used:\n {model_name} (temperature = {config_temperature}, top_k = {config_top_k}, max_output_tokens = {config_max_output_tokens})')

    # create generation config
    model_config = genai.GenerationConfig(
        max_output_tokens=config_max_output_tokens,
        temperature=config_temperature,
        top_k=config_top_k
    )

    # create genai model with generation config
    genai_model = genai.GenerativeModel(
        model_name= model_name,
        generation_config= model_config
    )

    # Attention: We manipulated the safety settings
    response = genai_model.generate_content(
        contents=[system_prompt, user_prompt], safety_settings={
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    })
    return response

In [11]:
# RAG building blocks

# Get content of books. The content will already be cleansed.
def load_file_content(file_name: str) -> str:
  with open(f"{PROCESSED_DATA_PATH}{file_name}", "r") as f:
    return f.read()

# Building Block "Chunking": Split the content into smaller chunks
def do_chunk(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP) -> list[str]:
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
      length_function=len,
  )
  return text_splitter.split_text(text=text)

# Building Block "Embedding": Create multi dimensional embeddings for a given chunk.
def do_embed(chunk: str) -> list[float]:
  return genai.embed_content(model=EMBEDDING_MODEL, content=chunk).get("embedding")

def do_batch_embed(chunks: list[str]) -> list[list[float]]:
  return genai.embed_content(model=EMBEDDING_MODEL, content=chunks).get("embedding")

# Building Block "Knowledgebase": Store embeddings and the corresponding content in a vectorstore
def persist_embeddings(chunks: list[str], embeddings: list[float], clear_knowledgebase: bool)-> None:
  if clear_knowledgebase:
    chromadb_collection = chromadb_client.delete_collection(name="default")

  chromadb_collection = chromadb_client.get_or_create_collection(name="default")
  # Persist the embeddings and the chunks in the knowledgebase
  ids = [str(uuid.uuid4()) for _ in chunks]
  chromadb_collection.add(ids=ids, documents=chunks, embeddings=embeddings)

# Building Block "Augmentation": Create an updated prompt by merging the original user input with the provided context
# Attention: We manipulated the augmented prompt in order to see the guardrails in action
def augment(user_input: str, context: list[str]) -> str:
  prepared_context = "\n".join(context)
  augmented_prompt = f"""
    Answer the question as detailed as possible from the provided context. If the answer is not in
    provided context just say, 'answer is not available in the context', don't provide the wrong answer.
    Respond short and concisely.
    Context:\n{prepared_context}?\n
    Question: \n{user_input}\n

    Answer:
  """
  return augmented_prompt

# Building Block "Top-k Fetching": Get the k semantically closest chunks to the user input from the knowledgebase
def do_top_k_fetching(user_input_embedding: list[float], k: int) -> tuple[list[str],list[float]]:
  # Since we will do the fetching always only for one user_input,
  # instead of querying for multiple embeddings simultanously as allowed by the choma API,
  # we add the embeddings below to a list and return only the first document (chunk)
  chromadb_collection = chromadb_client.get_or_create_collection(name="default")
  results = chromadb_collection.query(query_embeddings=[user_input_embedding], n_results=k)
  return (results["documents"][0], results["distances"][0]) # Return the distances to get better insights

# Building Block "Generation": Use the generation model to create a response
def generate_response(prompt: str) -> str:
  completion_result = call_genai_model_for_completion(
      model_name=GENERATION_MODEL,
      user_prompt=prompt,
  )
  return completion_result.text

In [12]:
def do_ingestion(file_names: list[str], chunk_size: int = DEFAULT_CHUNK_SIZE, clear_knowledgebase: bool = False) -> None:
  # Ingest file by file
  for file_name in file_names:
    # Load prepared book content
    file_content = load_file_content(file_name)

    # Chunk the content into smaller chunks
    chunks = do_chunk(file_content, chunk_size=chunk_size)

    # Embed the chunks
    embeddings = do_batch_embed(chunks)

    # Persist the embeddings and the chunks in the knowledgebase
    persist_embeddings(chunks, embeddings, clear_knowledgebase)

In [13]:
def do_rag(user_input: str, k: int = DEFAULT_K, verbose: bool = False) -> tuple[str, list[str], list[float]]:
  # Embed the user input
  user_input_embedding = do_embed(chunk=user_input)

  # "R" like "Retrieval": Get the k semantically closest chunks to the user input from the knowledgebase
  (context, distances) = do_top_k_fetching(user_input_embedding=user_input_embedding, k=k)
  if verbose:
    print(f'Retrieved context:\n {context}')

  # "A" like "Augmented": Create the augmented prompt
  augmented_prompt = augment(user_input=user_input, context=context)
  if verbose:
    print(f'Augmented prompt:\n {augmented_prompt}')

  # "G" like "Generation": Generate a response
  response = generate_response(prompt=augmented_prompt)

  return (response, context, distances)


In [14]:
# Define a custom exception
class FactCheckingValidationError(Exception):
  pass

# Define a response format
class FactCheckingValidationAnswer(typing.TypedDict):
    is_grounded: bool


def guard_fact_checking(bot_response: str, context: list[str]) -> str:
  # Prepare the context to be used in the guard prompt
  context = "\n".join(context)

  # Define the prompt for the guardrail
  guard_prompt = f"""
    You are given a task to identify if the answer is grounded and entailed to the context.
    You will only use the contents of the context and not rely on external knowledge.
    'context': {context} 'answer': {bot_response}
    """

  # Call the guardrail model with the desired output format
  model = genai.GenerativeModel(GUARDING_MODEL)
  result = model.generate_content(
      guard_prompt,
      generation_config=genai.GenerationConfig(
          response_mime_type="application/json", response_schema=FactCheckingValidationAnswer
      ),
  )

  # Evaluate the validation
  fact_checking_validation = json.loads(result.text)
  if not fact_checking_validation["is_grounded"]:
    error_msg = f"The bot answer '{bot_response}' is not grounded in the context '{context}'"
    raise FactCheckingValidationError(error_msg)
  return bot_response

In [15]:
def do_fact_checked_rag(user_input: str,k: int = DEFAULT_K, verbose: bool = False) -> tuple[str, list[str], list[float]]:
    (answer, context, distances) = do_rag(user_input=user_input, k=k, verbose=False)
    if verbose:
        print(f'Bot answer before guardrail:\n {answer}')
    guarded_output = guard_fact_checking(bot_response=answer, context=context)
    return (guarded_output, context, distances)

In [16]:
def print_insights(dataframe: pd.DataFrame):
  mean_response_time = round(dataframe['response_time'].mean(skipna=True), 2)
  mean_min_distance = round(dataframe['min_context_distance'].mean(skipna=True), 2)
  mean_mean_distance = round(dataframe['mean_context_distance'].mean(skipna=True), 2)
  n_hallucinations = dataframe['min_context_distance'].isna().sum()

  print(f'Number of detected hallucinations: {n_hallucinations}')
  print(f'Mean response time: {mean_response_time} seconds')
  print(f'Mean minimum distance: {mean_min_distance}')
  print(f'Mean mean distance: {mean_mean_distance}')

# Step 2: Configure the genAI models

In [17]:
# GenAI Models
GENERATION_MODEL = "gemini-1.5-flash"
EMBEDDING_MODEL = "models/text-embedding-004"
GUARDING_MODEL = "gemini-1.5-flash-8b"

# Step 3: Define initial values for parameters to be adjusted

In [18]:
# Ingesting
FILE_NAMES = ['study_in_scarlett.txt']
CHUNK_SIZE = 2000

# Retrieval
K = 3

# Step 4: Prepare the knowledgebase

In [19]:
do_ingestion(file_names=FILE_NAMES, chunk_size=CHUNK_SIZE)

# Step 5: Manually evaluate responses

In [20]:
# Read csv from local files
evaluation_dataframe = pd.read_csv(EVALUATION_DATA_PATH + 'simple_evaluation_dataset.csv')
evaluation_dataframe.head()

Unnamed: 0,story_name,question,ground_truth_answer
0,A Study in Scarlet,What year does Dr. Watson complete his medical degree?,1878
1,A Study in Scarlet,Where do Sherlock Holmes and Dr. Watson decide to live together?,221B Baker Street
2,A Study in Scarlet,What word does Sherlock Holmes find written in blood on the wall at the crime scene?,RACHE
3,A Study in Scarlet,What profession does Sherlock Holmes describe himself as having?,A consulting detective
4,A Study in Scarlet,What clue suggests to Holmes that the murderer might have smoked a particular kind of cigar?,Ashes from a Trichinopoly cigar found at the crime scene


In [21]:
# The function gives us the dired outputs to gather some insights
def generate_rag_answers(dataframe: pd.DataFrame, k: int = K):
  def generate_rag_response_with_insights(question: str) -> pd.Series:
        start_time = time.time()  # Start timer

        # Generate the RAG response
        try:
            output = do_fact_checked_rag(question, k=k, verbose=False)
            response = output[0]  # Extract the response from the output
            distances = output[2]  # Extract the distances from the output
            min_distance = round(min(distances), 2)  # Find the minimum distance
            mean_distance = round(sum(distances) / len(distances), 2)  # Calculate the mean distance
        except FactCheckingValidationError:
            response = "Possible hallucination detected."  # Return an empty string in case of validation error
            min_distance = None
            mean_distance = None

        end_time = time.time()  # End timer
        execution_time = round(end_time - start_time, 2)   # Calculate execution time

        return pd.Series([response, execution_time, min_distance, mean_distance])  # Return response and time

  # Apply the timing function to each row and store results in new columns
  dataframe[['rag_response', 'response_time', 'min_context_distance', 'mean_context_distance']] = dataframe['question'].apply(generate_rag_response_with_insights)

  return dataframe


In [22]:
# Generate responses
evaluation_dataframe = generate_rag_answers(evaluation_dataframe)
evaluation_dataframe

Unnamed: 0,story_name,question,ground_truth_answer,rag_response,response_time,min_context_distance,mean_context_distance
0,A Study in Scarlet,What year does Dr. Watson complete his medical degree?,1878,1878 \n,2.6,0.83,0.95
1,A Study in Scarlet,Where do Sherlock Holmes and Dr. Watson decide to live together?,221B Baker Street,They decide to live together at 221B Baker Street. \n,2.13,0.84,0.86
2,A Study in Scarlet,What word does Sherlock Holmes find written in blood on the wall at the crime scene?,RACHE,RACHE \n,2.1,0.63,0.69
3,A Study in Scarlet,What profession does Sherlock Holmes describe himself as having?,A consulting detective,Sherlock Holmes describes himself as a **consulting detective**. \n,2.06,0.64,0.73
4,A Study in Scarlet,What clue suggests to Holmes that the murderer might have smoked a particular kind of cigar?,Ashes from a Trichinopoly cigar found at the crime scene,"Holmes found scattered ash on the floor that was dark in color and flakey, which is characteristic of a Trichinopoly cigar. \n",2.17,0.69,0.75
5,A Scandal in Bohemia,Who is the woman that Sherlock Holmes admires and refers to as 'the woman'?,Irene Adler,Possible hallucination detected.,2.02,,
6,The Red-Headed League,What unique characteristic qualified Mr. Jabez Wilson to join the Red-Headed League?,His bright red hair,Possible hallucination detected.,1.89,,
7,A Case of Identity,Who is Miss Mary Sutherland engaged to marry?,Hosmer Angel,Possible hallucination detected.,2.06,,
8,The Boscombe Valley Mystery,Who is initially suspected of killing Charles McCarthy?,"His son, James McCarthy",Possible hallucination detected.,1.94,,
9,The Five Orange Pips,Who is ultimately responsible for the threats and deaths in the Openshaw family?,The Ku Klux Klan,Possible hallucination detected.,2.04,,


In [23]:
print_insights(evaluation_dataframe)

Number of detected hallucinations: 10
Mean response time: 2.06 seconds
Mean minimum distance: 0.73
Mean mean distance: 0.8


# Step 5: Ingest remaining stories

In [24]:
# Ingest the second book, which contains the other stories
adjusted_file_names = ['adventures_of_sherlock_holmes.txt']
do_ingestion(file_names=adjusted_file_names, chunk_size=CHUNK_SIZE, clear_knowledgebase=False)

In [26]:
# Generate responses
evaluation_dataframe = generate_rag_answers(evaluation_dataframe)
evaluation_dataframe

Unnamed: 0,story_name,question,ground_truth_answer,rag_response,response_time,min_context_distance,mean_context_distance
0,A Study in Scarlet,What year does Dr. Watson complete his medical degree?,1878,1878 \n,2.28,0.83,0.95
1,A Study in Scarlet,Where do Sherlock Holmes and Dr. Watson decide to live together?,221B Baker Street,They decide to live together at 221B Baker Street. \n,1.94,0.83,0.84
2,A Study in Scarlet,What word does Sherlock Holmes find written in blood on the wall at the crime scene?,RACHE,RACHE \n,1.81,0.63,0.69
3,A Study in Scarlet,What profession does Sherlock Holmes describe himself as having?,A consulting detective,"Sherlock Holmes describes himself as a ""consulting detective"". \n",2.08,0.64,0.72
4,A Study in Scarlet,What clue suggests to Holmes that the murderer might have smoked a particular kind of cigar?,Ashes from a Trichinopoly cigar found at the crime scene,"Holmes found the ash of an Indian cigar, which he identified based on his knowledge of tobacco ashes. \n",2.14,0.59,0.66
5,A Scandal in Bohemia,Who is the woman that Sherlock Holmes admires and refers to as 'the woman'?,Irene Adler,Irene Adler \n,2.04,0.78,0.78
6,The Red-Headed League,What unique characteristic qualified Mr. Jabez Wilson to join the Red-Headed League?,His bright red hair,Mr. Jabez Wilson's red hair qualified him to join the Red-Headed League. \n,2.45,0.55,0.62
7,A Case of Identity,Who is Miss Mary Sutherland engaged to marry?,Hosmer Angel,Miss Mary Sutherland is engaged to marry Mr. Hosmer Angel. \n,2.12,0.82,0.85
8,The Boscombe Valley Mystery,Who is initially suspected of killing Charles McCarthy?,"His son, James McCarthy","James McCarthy, the son of the deceased, is initially suspected of killing Charles McCarthy. \n",1.91,0.87,0.9
9,The Five Orange Pips,Who is ultimately responsible for the threats and deaths in the Openshaw family?,The Ku Klux Klan,The Ku Klux Klan is ultimately responsible for the threats and deaths in the Openshaw family. \n,2.24,0.83,0.9


In [27]:
print_insights(evaluation_dataframe)

Number of detected hallucinations: 4
Mean response time: 2.06 seconds
Mean minimum distance: 0.74
Mean mean distance: 0.79


# Step 5: Manually adjust chunk size

In [28]:
# Define new chunk size. Be careful: The smaller the chunk_size, the more time is needed for the ingestion. I.e. adjusted_chunk_size = 400 takes 1-2 minutes
adjusted_chunk_size = 400

In [29]:
# We need to clear the knowledgebase for this
do_ingestion(file_names=adjusted_file_names, chunk_size=adjusted_chunk_size, clear_knowledgebase=True)

In [30]:
# Evaluate
evaluation_dataframe = generate_rag_answers(evaluation_dataframe)
evaluation_dataframe

Unnamed: 0,story_name,question,ground_truth_answer,rag_response,response_time,min_context_distance,mean_context_distance
0,A Study in Scarlet,What year does Dr. Watson complete his medical degree?,1878,Possible hallucination detected.,2.26,,
1,A Study in Scarlet,Where do Sherlock Holmes and Dr. Watson decide to live together?,221B Baker Street,Possible hallucination detected.,1.87,,
2,A Study in Scarlet,What word does Sherlock Holmes find written in blood on the wall at the crime scene?,RACHE,Possible hallucination detected.,1.99,,
3,A Study in Scarlet,What profession does Sherlock Holmes describe himself as having?,A consulting detective,"Sherlock Holmes describes himself as a ""professional"". \n",1.88,0.68,0.7
4,A Study in Scarlet,What clue suggests to Holmes that the murderer might have smoked a particular kind of cigar?,Ashes from a Trichinopoly cigar found at the crime scene,The clue is that the murderer smokes Indian cigars. \n,2.02,0.51,0.6
5,A Scandal in Bohemia,Who is the woman that Sherlock Holmes admires and refers to as 'the woman'?,Irene Adler,"The woman that Sherlock Holmes admires and refers to as ""the woman"" is Irene Adler. \n",1.9,0.54,0.59
6,The Red-Headed League,What unique characteristic qualified Mr. Jabez Wilson to join the Red-Headed League?,His bright red hair,Possible hallucination detected.,2.16,,
7,A Case of Identity,Who is Miss Mary Sutherland engaged to marry?,Hosmer Angel,Possible hallucination detected.,1.9,,
8,The Boscombe Valley Mystery,Who is initially suspected of killing Charles McCarthy?,"His son, James McCarthy","James McCarthy, the son, is initially suspected of killing Charles McCarthy. \n",1.91,0.86,0.88
9,The Five Orange Pips,Who is ultimately responsible for the threats and deaths in the Openshaw family?,The Ku Klux Klan,Possible hallucination detected.,2.46,,


In [31]:
print_insights(evaluation_dataframe)

Number of detected hallucinations: 9
Mean response time: 2.0 seconds
Mean minimum distance: 0.63
Mean mean distance: 0.68


# Step 6: Manually adjust k

In [32]:
adjusted_k = 30

In [33]:
# We do not need to ingest again, since our optimizations just affect the retrieval pipeline
evaluation_dataframe = generate_rag_answers(evaluation_dataframe, k=adjusted_k)
evaluation_dataframe

Unnamed: 0,story_name,question,ground_truth_answer,rag_response,response_time,min_context_distance,mean_context_distance
0,A Study in Scarlet,What year does Dr. Watson complete his medical degree?,1878,Possible hallucination detected.,2.22,,
1,A Study in Scarlet,Where do Sherlock Holmes and Dr. Watson decide to live together?,221B Baker Street,Baker Street \n,1.99,0.76,0.83
2,A Study in Scarlet,What word does Sherlock Holmes find written in blood on the wall at the crime scene?,RACHE,Possible hallucination detected.,2.11,,
3,A Study in Scarlet,What profession does Sherlock Holmes describe himself as having?,A consulting detective,"""It is my business to know what other people don’t know."" \n",2.06,0.68,0.77
4,A Study in Scarlet,What clue suggests to Holmes that the murderer might have smoked a particular kind of cigar?,Ashes from a Trichinopoly cigar found at the crime scene,"Holmes found the ash of an Indian cigar, which his knowledge of tobacco ashes enabled him to pronounce as an Indian cigar. \n",2.18,0.51,0.74
5,A Scandal in Bohemia,Who is the woman that Sherlock Holmes admires and refers to as 'the woman'?,Irene Adler,Irene Adler \n,2.04,0.54,0.75
6,The Red-Headed League,What unique characteristic qualified Mr. Jabez Wilson to join the Red-Headed League?,His bright red hair,"Mr. Jabez Wilson's unique characteristic that qualified him to join the Red-Headed League was his **bright, blazing, fiery red hair**. \n",2.06,0.46,0.8
7,A Case of Identity,Who is Miss Mary Sutherland engaged to marry?,Hosmer Angel,Miss Mary Sutherland is engaged to marry Hosmer Angel. \n,2.16,0.65,0.91
8,The Boscombe Valley Mystery,Who is initially suspected of killing Charles McCarthy?,"His son, James McCarthy","James McCarthy, the son of the deceased, is initially suspected of killing Charles McCarthy. \n",1.99,0.86,1.02
9,The Five Orange Pips,Who is ultimately responsible for the threats and deaths in the Openshaw family?,The Ku Klux Klan,Possible hallucination detected.,2.12,,


In [34]:
print_insights(evaluation_dataframe)

Number of detected hallucinations: 5
Mean response time: 2.09 seconds
Mean minimum distance: 0.63
Mean mean distance: 0.83
