# Gemini API [DO COST $$$]

In [1]:
import google.genai as genai
from google.genai import types

In [2]:
import pandas as pd
from datetime import datetime

### APIs Usage Log table

In [3]:
import pandas as pd
from datetime import datetime, timezone

LOG_COLUMNS = [
    "timestamp","query","uploaded_file","response_text","finish_reason",
    "cached_content_token_count","candidates_token_count",
    "prompt_token_count","thoughts_token_count","total_token_count",
]

def ensure_logs_df(logs_df: pd.DataFrame | None) -> pd.DataFrame:
    if logs_df is None or logs_df.empty:
        # Initialize with correct columns (no rows). Keep dtypes flexible.
        logs_df = pd.DataFrame({c: pd.Series(dtype="object") for c in LOG_COLUMNS})
    # Guarantee column order/superset
    for c in LOG_COLUMNS:
        if c not in logs_df.columns:
            logs_df[c] = pd.Series(dtype="object")
    return logs_df[LOG_COLUMNS]

def append_usage_log(logs_df, query_text, uploaded_file, resp=None):
    """
    Append a new usage log entry to an existing DataFrame (no FutureWarning).
    """
    logs_df = ensure_logs_df(logs_df)

    # Safe extraction
    usage = getattr(resp, "usage_metadata", None) if resp else None
    try:
        finish_reason = resp.candidates[0].finish_reason.name if (resp and getattr(resp, "candidates", None)) else None
    except Exception:
        finish_reason = None
    output_text = getattr(resp, "text", None) if resp else None

    # Build row
    new_row = {
        "timestamp": datetime.now(timezone.utc),  # tz-aware
        "query": query_text,
        "uploaded_file": uploaded_file,
        "response_text": output_text,
        "finish_reason": finish_reason,
        "cached_content_token_count": getattr(usage, "cached_content_token_count", None) if usage else None,
        "candidates_token_count": getattr(usage, "candidates_token_count", None) if usage else None,
        "prompt_token_count": getattr(usage, "prompt_token_count", None) if usage else None,
        "thoughts_token_count": getattr(usage, "thoughts_token_count", None) if usage else None,
        "total_token_count": getattr(usage, "total_token_count", None) if usage else None,
    }

    # Append without concat (avoids the deprecation)
    logs_df.loc[len(logs_df)] = [new_row.get(c, None) for c in LOG_COLUMNS]
    return logs_df

In [4]:
logs_df = pd.DataFrame({
    "timestamp": pd.Series(dtype="datetime64[ns]"),
    "query": pd.Series(dtype="string"),
    "uploaded_file": pd.Series(dtype="string"),
    "response_text": pd.Series(dtype="string"),
    "finish_reason": pd.Series(dtype="string"),
    "cached_content_token_count": pd.Series(dtype="Int64"),
    "candidates_token_count": pd.Series(dtype="Int64"),
    "prompt_token_count": pd.Series(dtype="Int64"),
    "thoughts_token_count": pd.Series(dtype="Int64"),
    "total_token_count": pd.Series(dtype="Int64"),
})
logs_df

Unnamed: 0,timestamp,query,uploaded_file,response_text,finish_reason,cached_content_token_count,candidates_token_count,prompt_token_count,thoughts_token_count,total_token_count


In [5]:
client = genai.Client(api_key="AIzaSyDHOSjzr-AedFPftuIK7iiZ0yTqaTkSDYQ")

### Fail to get response example due to 'MAX_TOKENS'>
GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        role='model'
      ),
      finish_reason=<FinishReason.MAX_TOKENS: 'MAX_TOKENS'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash',
  response_id='tPemaLqfEaOtmtkPiLTW4QI',
  sdk_http_response=HttpResponse(
    headers=<dict len=11>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    prompt_token_count=8,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=8
      ),
    ],
    thoughts_token_count=199,
    total_token_count=207
  )
)

In [6]:
resp = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[{"role": "user", "parts": [{"text": "Give 3 bullets about ADC."}]}],
    config=types.GenerateContentConfig(
        max_output_tokens=1024,
        temperature=0.6,
    )
)

# Robust extractor
def extract_text(r):
    out = []
    for c in getattr(r, "candidates", []) or []:
        # finish_reason can be useful: types.FinishReason.STOP, SAFETY, etc.
        # print("finish_reason:", c.finish_reason)
        content = getattr(c, "content", None)
        if content:
            for p in getattr(content, "parts", []) or []:
                t = getattr(p, "text", None)
                if t:
                    out.append(t)
    # Fallback to r.text if present
    return "\n".join(out) or getattr(r, "text", "") or ""

print(extract_text(resp))

Here are 3 key bullets about Analog-


### PDF upload with text prompt

#### Step1: CV Extraction and Summarization

In [7]:
# Format the datetime object into a string
formatted_today = datetime.utcnow().strftime("%Y-%m-%d")

In [8]:
def master_gemini_upload_cv_prompt_w_log(
    uploaded_filename, prompt_text, logs_df,
    system_text       =  f"You are an expert HR assistant, make a summarization of the uploaded CV. Note that today is {formatted_today}.",
    max_output_tokens = 4096,
    temperature       = 0.1,
    top_p             = 0.9, # At each step, the model sorts possible next tokens by probability, then keeps only the smallest set whose cumulative probability ≥ top_p.
    top_k             = 40,  # At each step, the model only considers the top_k most likely tokens.
):
    # Step 1: Upload the PDF
    uploaded_file = client.files.upload(
        file=uploaded_filename, 
        config={"display_name": "CV"}
    )

    # Step 2: Pass the file reference into the request
    resp = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[
            {
                "role": "user",
                "parts": [
                    {"text": prompt_text},
                    {"file_data": {"file_uri": uploaded_file.uri}}
                ],
            }
        ],
        config=types.GenerateContentConfig(
            system_instruction=[system_text],
            max_output_tokens = max_output_tokens,
            temperature = temperature,
            top_p = top_p,
            top_k = top_k,
        ),
    )
    
    try:
        print(f"resp.text:\n {resp.text}")
        size_info = calc_resp_text_size(resp)
        print(f"[resp.text size] chars={size_info['char_length']}, tokens≈{size_info['token_est']}")
    except Exception as e:
        print(f"resp.text: Not Found ({e})")
        size_info = {"char_length": 0, "token_est": 0}
        
    # Append a new log
    logs_df = append_usage_log(
        logs_df,
        query_text=prompt_text,
        uploaded_file=uploaded_filename,
        resp=resp
    )
    return resp, logs_df

def calc_resp_text_size(resp) -> dict:
    """
    Calculate the size of resp.text in characters and estimated tokens.
    
    Parameters
    ----------
    resp : object
        Gemini response object (with .text)
    
    Returns
    -------
    dict : {"char_length": int, "token_est": int}
    """
    if not hasattr(resp, "text") or resp.text is None:
        return {"char_length": 0, "token_est": 0}
    
    txt = resp.text
    char_length = len(txt)
    token_est   = len(txt.split())  # rough estimate
    
    return {"char_length": char_length, "token_est": token_est}


In [9]:
# uploaded_filename = "ExampleCV/NLP-CV-NachaiLim.pdf"
# uploaded_filename = "ExampleCV/CV-Oranid.pdf"
uploaded_filename = "ExampleCV/Natthaporn_CV2022.pdf"
system_text       =  f"You are an expert HR assistant, make a summarization of the uploaded CV. Note that today is {formatted_today}."
# prompt_text       = "Who is the individual in the CV? Summarize the uploaded CV's education and work experience in 5-10 concise bullet points with experience years. Then list his/her skills in python list."
prompt_text       = """
Analyze the uploaded CV and provide the following:

1. Identify the individual (full name).
2. Summarize education and work experience in **10–15 concise bullet points**, including years of experience for each role.
3. Provide a breakdown of total experience (in years) aggregated by **position title** across the job history.
4. Extract the list of skills and output them as a valid Python list (e.g., ["skill1", "skill2", "skill3"]).
"""

max_output_tokens = 4096*2
temperature       = 0.1
top_p             = 0.9 # At each step, the model sorts possible next tokens by probability, then keeps only the smallest set whose cumulative probability ≥ top_p.
top_k             = 40  # At each step, the model only considers the top_k most likely tokens.

In [10]:
resp_cv, logs_df = master_gemini_upload_cv_prompt_w_log(
    uploaded_filename, prompt_text, logs_df,
    system_text       = system_text,
    max_output_tokens = max_output_tokens,
    temperature       = temperature,
    top_p             = top_p, 
    top_k             = top_k, 
)
logs_df.tail(3)

resp.text:
 Here's a summarization of the provided CV:

### 1. Individual Identification
**Full Name:** Natthaporn Takpho

### 2. Education and Work Experience Summary (11 bullet points)

*   **Assistant Manager, Research Division, Mitsui Chemicals Singapore R&D Centre (May 2021 – Present; 4 years 4 months):** Leads and directs R&D projects in life science and healthcare, focusing on business development, market analysis, and patent management.
*   **Researcher, Mitsui Chemicals Singapore R&D Centre (Nov 2017 – Apr 2021; 3 years 6 months):** Planned and supervised R&D projects, developed business ideas, validated technology, and prepared patent applications.
*   **Molecular Microbiologist, Mahidol-Oxford Tropical Medicine Research Unit (Oct 2013 – Sep 2014; 1 year):** Performed molecular genetic analysis for epidemiological studies and worked in a BSL-3 laboratory.
*   **PhD in Biological Science, Nara Institute of Science and Technology (completed Sep 2017):** Focused on biological sc

Unnamed: 0,timestamp,query,uploaded_file,response_text,finish_reason,cached_content_token_count,candidates_token_count,prompt_token_count,thoughts_token_count,total_token_count
0,2025-09-01 09:33:29.233297+00:00,\nAnalyze the uploaded CV and provide the foll...,ExampleCV/Natthaporn_CV2022.pdf,Here's a summarization of the provided CV:\n\n...,STOP,,670,1172,1888,3730


#### Step2: JD Summarization

In [11]:
def master_gemini_jd_prompt_w_log(
    prompt_text, logs_df,
    system_text       = f"""
    You are an expert HR assistant, 
    make a summarization of the job description in a list of Key responsibilities, Mandaotry experiences & skills, Prefer experiences & skills.
    """,
    max_output_tokens = 4096,
    temperature       = 0.1,
    top_p             = 0.9, # At each step, the model sorts possible next tokens by probability, then keeps only the smallest set whose cumulative probability ≥ top_p.
    top_k             = 40,  # At each step, the model only considers the top_k most likely tokens.
):

    # Step 1: Pass the file reference into the request
    resp = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[
            {
                "role": "user",
                "parts": [
                    {"text": prompt_text},
                ],
            }
        ],
        config=types.GenerateContentConfig(
            system_instruction=[system_text],
            max_output_tokens = max_output_tokens,
            temperature = temperature,
            top_p = top_p,
            top_k = top_k,
        ),
    )
    
    try:
        print(f"resp.text:\n {format_job_description(resp.text)}")
    except:
        print(f"resp.text:\n Not Found")
        
    # Append a new log
    logs_df = append_usage_log(
        logs_df,
        query_text=prompt_text,
        uploaded_file=uploaded_filename,
        resp=resp
    )
    return resp, logs_df

def format_job_description(raw_text: str) -> str:
    """
    Clean and format a job description string into a more readable format.
    Supports Markdown-style output for readability.
    """
    import re
    from textwrap import dedent

    # Step 1: Replace \n escape sequences with real line breaks
    formatted = raw_text.replace("\\n", "\n")

    # Step 2: Remove leading/trailing quotes or boilerplate
    formatted = re.sub(r'^"|"$', '', formatted.strip())

    # Step 3: Normalize multiple blank lines
    formatted = re.sub(r'\n{3,}', '\n\n', formatted)

    # Step 4: Dedent to align properly
    formatted = dedent(formatted)

    return formatted

In [12]:
prompt_text = f"""
Senior Data Scientist

ข้อมูลพื้นฐาน 5-10 ปี ปริญญาตรีขึ้นไป

รายละเอียดงานเบื้องต้น Key responsibilities: • Interpret data, and analyze results using statistical methods • Prepare and deliver business reports that effectively communicate trends, patterns, risks, and insights using data • Research, design, and develop machine-learning/artificial-intelligence/computer-vision systems to address key business challenges • Perform data quality testing, validation, and assurance as a part of designing, and implementing scalable data solutions • Support identification, triage, and remediation of data quality issues across the data and technology organizations • Effectively manage and develop small data analytics teams • Effectively engage and partner with team members across data analytics, technology development, and business strategy to drive outcomes and impact • Ensure end user requirements of data solutions are effectively met • Develop user friendly documentation to communicate the use and value of data solutions • Identify and push forward process improvement opportunities and solutions • Review and keep up to date with developments in the data analytics and ML fields

Education: • B.S., M.S., or Ph.D. degree in Computer Science, Mathematics, Software Engineering, Physics, and/or Data Science

Relevant experience required: • 5+ years hands-on experience with data mining, statistical analysis, distributed computing, data pipelining tools, and data health / monitoring frameworks • Proficiency with one or more programming languages or data engineering frameworks, such as Python, SQL, Spark, Java, C++, TypeScript/JavaScript, or similar • Proficiency with any of the major machine learning and computer vision frameworks preferred • Hands-on experience on BI solutions (e.g. Tableau, Power BI, Qlik, Looker) • Ability to work well with cross-disciplinary teams • Strong numerical and analytical skills with the ability to collect, organize, analyze, and disseminate significant amounts of information with attention to detail and accuracy • Communicate in English fluently

😆 Our Benefits: · Free lunch every Tuesday and Friday · WFH/WFA (Work From Home/Work From Anywhere) · Group insurance · Retirement savings fund · Well-stocked Snack Bar with snacks and beverages · Dress in your own style at work · Annual vacation 15 days
"""

system_text = f"""
You are an expert HR assistant, 
make a summarization of the job description in a list of Key responsibilities, Mandaotry experiences & skills, Prefer experiences & skills.
"""
max_output_tokens = 4096
temperature       = 0.1
top_p             = 0.9 # At each step, the model sorts possible next tokens by probability, then keeps only the smallest set whose cumulative probability ≥ top_p.
top_k             = 40  # At each step, the model only considers the top_k most likely tokens.

In [13]:
resp_jd, logs_df = master_gemini_jd_prompt_w_log(    
    prompt_text, logs_df,
    system_text       = system_text,
    max_output_tokens = max_output_tokens,
    temperature       = temperature,
    top_p             = top_p, 
    top_k             = top_k, 
)
logs_df.tail(3)

resp.text:
 Here's a summarization of the Senior Data Scientist job description:

---

**Senior Data Scientist**

**Key Responsibilities:**

*   Interpret data, analyze results using statistical methods, and prepare business reports to communicate trends, patterns, risks, and insights.
*   Research, design, and develop machine learning, artificial intelligence, and computer vision systems to address key business challenges.
*   Perform data quality testing, validation, and assurance as part of designing and implementing scalable data solutions.
*   Support the identification, triage, and remediation of data quality issues across data and technology organizations.
*   Effectively manage and develop small data analytics teams.
*   Engage and partner with cross-functional teams (data analytics, technology development, business strategy) to drive outcomes.
*   Ensure end-user requirements for data solutions are met and develop user-friendly documentation.
*   Identify and implement process

Unnamed: 0,timestamp,query,uploaded_file,response_text,finish_reason,cached_content_token_count,candidates_token_count,prompt_token_count,thoughts_token_count,total_token_count
0,2025-09-01 09:33:29.233297+00:00,\nAnalyze the uploaded CV and provide the foll...,ExampleCV/Natthaporn_CV2022.pdf,Here's a summarization of the provided CV:\n\n...,STOP,,670,1172,1888,3730
1,2025-09-01 09:33:38.423997+00:00,\nSenior Data Scientist\n\nข้อมูลพื้นฐาน 5-10 ...,ExampleCV/Natthaporn_CV2022.pdf,Here's a summarization of the Senior Data Scie...,STOP,,437,489,1410,2336


#### Step3: JD vs CV scoring

In [14]:
resp_jd.text[:500]

"Here's a summarization of the Senior Data Scientist job description:\n\n---\n\n**Senior Data Scientist**\n\n**Key Responsibilities:**\n\n*   Interpret data, analyze results using statistical methods, and prepare business reports to communicate trends, patterns, risks, and insights.\n*   Research, design, and develop machine learning, artificial intelligence, and computer vision systems to address key business challenges.\n*   Perform data quality testing, validation, and assurance as part of designing and"

In [15]:
resp_cv.text[:500]

"Here's a summarization of the provided CV:\n\n### 1. Individual Identification\n**Full Name:** Natthaporn Takpho\n\n### 2. Education and Work Experience Summary (11 bullet points)\n\n*   **Assistant Manager, Research Division, Mitsui Chemicals Singapore R&D Centre (May 2021 – Present; 4 years 4 months):** Leads and directs R&D projects in life science and healthcare, focusing on business development, market analysis, and patent management.\n*   **Researcher, Mitsui Chemicals Singapore R&D Centre (Nov 20"

In [16]:
output_template = """
### JD Mandatory Requirements [✓: Pass | ✗: Missing]
1. Requirement 1 — [✓/✗]
2. Requirement 2 — [✓/✗]

### JD Preferred Requirements [✓: Pass | ✗: Missing]
1. Requirement 1 — [✓/✗]
2. Requirement 2 — [✓/✗]

### Candidate Strengths (1–5 bullets)
1. Strength 1
2. Strength 2

### Candidate Weaknesses (1–5 bullets)
1. Weakness 1
2. Weakness 2

### JD vs CV Matching Score  
**Score:** X.X / 10.0  

**Reasoning (3–5 bullets):**  
1. Reason 1  
2. Reason 2  
3. Reason 3  
"""

In [17]:
system_text = f"""
You are an expert HR assistant, 
Filling in the output_template below based on the given job description (JD) and summarized resume (CV).
Only answer in the given output_template.

# output_template:
{output_template}
"""

prompt_text = f"""
# job description (JD):
{format_job_description(resp_jd.text)}

# summarized resume (CV):
{resp_cv.text}
"""

In [18]:
max_output_tokens = 4096*2
temperature       = 0.1
top_p             = 0.9 # At each step, the model sorts possible next tokens by probability, then keeps only the smallest set whose cumulative probability ≥ top_p.
top_k             = 40  # At each step, the model only considers the top_k most likely tokens.

In [19]:
resp_summary, logs_df = master_gemini_jd_prompt_w_log(    
    prompt_text, logs_df,
    system_text       = system_text,
    max_output_tokens = max_output_tokens,
    temperature       = temperature,
    top_p             = top_p, 
    top_k             = top_k, 
)
logs_df.tail(3)

resp.text:
 ### JD Mandatory Requirements [✓: Pass | ✗: Missing]
1.  **Education:** B.S., M.S., or Ph.D. degree in Computer Science, Mathematics, Software Engineering, Physics, and/or Data Science. — ✗
2.  **Experience:** 5+ years of hands-on experience with data mining, statistical analysis, distributed computing, data pipelining tools, and data health/monitoring frameworks. — ✗
3.  **Programming:** Proficiency with one or more programming languages or data engineering frameworks (e.g., Python, SQL, Spark, Java, C++, TypeScript/JavaScript). — ✗
4.  **BI Solutions:** Hands-on experience with Business Intelligence tools (e.g., Tableau, Power BI, Qlik, Looker). — ✗
5.  **Analytical Skills:** Strong numerical and analytical abilities with attention to detail and accuracy in collecting, organizing, analyzing, and disseminating information. — ✓
6.  **Communication:** Fluent in English. — ✓
7.  **Collaboration:** Ability to work effectively with cross-disciplinary teams. — ✓

### JD Preferre

Unnamed: 0,timestamp,query,uploaded_file,response_text,finish_reason,cached_content_token_count,candidates_token_count,prompt_token_count,thoughts_token_count,total_token_count
0,2025-09-01 09:33:29.233297+00:00,\nAnalyze the uploaded CV and provide the foll...,ExampleCV/Natthaporn_CV2022.pdf,Here's a summarization of the provided CV:\n\n...,STOP,,670,1172,1888,3730
1,2025-09-01 09:33:38.423997+00:00,\nSenior Data Scientist\n\nข้อมูลพื้นฐาน 5-10 ...,ExampleCV/Natthaporn_CV2022.pdf,Here's a summarization of the Senior Data Scie...,STOP,,437,489,1410,2336
2,2025-09-01 09:33:51.643159+00:00,\n# job description (JD):\nHere's a summarizat...,ExampleCV/Natthaporn_CV2022.pdf,### JD Mandatory Requirements [✓: Pass | ✗: Mi...,STOP,,766,1356,1881,4003


## Terra job description example

source: https://www.jobfinfin.com/job/659cf72b64805d54e65dc789

Senior Data Scientist

ข้อมูลพื้นฐาน
5-10 ปี
ปริญญาตรีขึ้นไป

รายละเอียดงานเบื้องต้น
Key responsibilities:
• Interpret data, and analyze results using statistical methods
• Prepare and deliver business reports that effectively communicate trends, patterns, risks, and insights using data
• Research, design, and develop machine-learning/artificial-intelligence/computer-vision systems to address key business challenges 
• Perform data quality testing, validation, and assurance as a part of designing, and implementing scalable data solutions
• Support identification, triage, and remediation of data quality issues across the data and technology organizations
• Effectively manage and develop small data analytics teams
• Effectively engage and partner with team members across data analytics, technology development, and business strategy to drive outcomes and impact
• Ensure end user requirements of data solutions are effectively met
• Develop user friendly documentation to communicate the use and value of data solutions
• Identify and push forward process improvement opportunities and solutions
• Review and keep up to date with developments in the data analytics and ML fields
 
Education:
• B.S., M.S., or Ph.D. degree in Computer Science, Mathematics, Software Engineering, Physics, and/or Data Science
 
Relevant experience required:
• 5+ years hands-on experience with data mining, statistical analysis, distributed computing, data pipelining tools, and data health / monitoring frameworks
• Proficiency with one or more programming languages or data engineering frameworks, such as Python, SQL, Spark, Java, C++, TypeScript/JavaScript, or similar
• Proficiency with any of the major machine learning and computer vision frameworks preferred
• Hands-on experience on BI solutions (e.g. Tableau, Power BI, Qlik, Looker)
• Ability to work well with cross-disciplinary teams
• Strong numerical and analytical skills with the ability to collect, organize, analyze, and disseminate significant amounts of information with attention to detail and accuracy
• Communicate in English fluently

😆 Our Benefits:
· Free lunch every Tuesday and Friday
· WFH/WFA (Work From Home/Work From Anywhere)
· Group insurance
· Retirement savings fund
· Well-stocked Snack Bar with snacks and beverages
· Dress in your own style at work
· Annual vacation 15 days 

# Text Embedding for RAG

In [20]:
import os
import json
from datetime import datetime
from typing import List, Dict, Tuple, Optional, Any
from google import genai
import faiss
import numpy as np
import pickle
from copy import deepcopy

In [21]:
docs = [
    "Cloud adoption helps startups scale infrastructure with low upfront costs.",
    "Hybrid cloud combines private and public resources for flexibility.",
    "On-premise solutions give companies more control but higher costs."
]

In [22]:
# Create embeddings
embeddings = [
    client.models.embed_content(
        model="text-embedding-004",
        contents=[{"role": "user", "parts": [{"text": doc}]}]
    ).embeddings[0].values
    for doc in docs
]

# Convert to NumPy array for FAISS
emb_matrix = np.array(embeddings).astype("float32")

# Build FAISS index (cosine similarity → use inner product + normalized vectors)
faiss.normalize_L2(emb_matrix)
index = faiss.IndexFlatIP(emb_matrix.shape[1])
index.add(emb_matrix)

In [23]:
# Search for nearest docs
query = "What are the benefits of cloud computing?"
query_emb = client.models.embed_content(
    model="text-embedding-004",
    contents=[{"role": "user", "parts": [{"text": query}]}]
).embeddings[0].values

query_emb = np.array([query_emb]).astype("float32")
faiss.normalize_L2(query_emb)  # only if using cosine/IP index

D, I = index.search(query_emb, k=2)  # distances, indices
retrieved_docs = [docs[i] for i in I[0]]
print("Retrieved docs:", retrieved_docs)

Retrieved docs: ['Cloud adoption helps startups scale infrastructure with low upfront costs.', 'Hybrid cloud combines private and public resources for flexibility.']


In [24]:
def simple_token_count(text: str) -> int:
    """Very rough token estimate (whitespace split).
    Replace with your tokenizer if needed."""
    return len(text.split())

class FaissVectorStore:
    """
    Minimal FAISS + metadata store with cosine similarity search.
    - Stores vectors in FAISS (IndexFlatIP) with L2-normalized vectors (so IP == cosine sim)
    - Stores metadata separately (JSONL or pickle)
    """

    def __init__(self, dim: int):
        self.dim = dim
        self.index = faiss.IndexFlatIP(dim)  # inner product (use with normalized vectors)
        self._vectors = []                  # keep in RAM until saved (optional)
        self._metadata: List[Dict] = []     # [{"text":..., "timestamp":..., "token_length":...}, ...]

    # ---------- Embedding helpers (plug your embedding client here) ----------
    @staticmethod
    def _to_float32(arr) -> np.ndarray:
        return np.array(arr, dtype="float32")

    def add_texts_and_metadata(
        self,
        texts: List[str],
        embeddings: List[List[float]],
        metadata_list: Optional[List[Dict]] = None,
        default_metadata: Optional[Dict] = None,
    ):
        """
        Add texts with precomputed embeddings + flexible per-item metadata.
    
        Parameters
        ----------
        texts : List[str]
            Plaintext chunks to index.
        embeddings : List[List[float]]
            Precomputed embeddings matching each text (dim == self.dim).
            These will be L2-normalized for cosine similarity with IndexFlatIP.
        metadata_list : Optional[List[Dict]]
            A list of metadata dicts (one per text). Each dict can contain any keys
            (JSONL-friendly). Missing keys are allowed and will be auto-filled where applicable.
            If None, minimal metadata will be generated.
        default_metadata : Optional[Dict]
            Default metadata merged into each item *before* item-specific overrides.
            Example: {"source": "my_corpus", "tag": "v1"}.
    
        Behavior
        --------
        - For each item i, final metadata = {**default_metadata, **metadata_list[i]} (if provided)
        - Auto-fill:
            * "timestamp" (UTC ISO 8601) if not present
            * "token_length" via simple_token_count(text) if not present
        - All keys are preserved as-is to support JSONL dumps without schema constraints.
    
        Raises
        ------
        ValueError
            If lengths of texts, embeddings, and metadata_list (when provided) mismatch.
        """
        n = len(texts)
        if len(embeddings) != n:
            raise ValueError("embeddings length must match texts length")
    
        if metadata_list is not None and len(metadata_list) != n:
            raise ValueError("metadata_list length must match texts length when provided")
    
        # Normalize and add vectors
        vecs = np.asarray(embeddings, dtype="float32")
        faiss.normalize_L2(vecs)
        self.index.add(vecs)
        self._vectors.extend(vecs)
    
        # Prepare defaults
        default_metadata = default_metadata or {}
    
        # Build metadata rows JSONL-friendly (arbitrary keys preserved)
        now_iso = datetime.utcnow().isoformat()
        for i, t in enumerate(texts):
            item_meta = metadata_list[i] if metadata_list is not None else {}
            # Merge: defaults first, then item-specific override
            meta = {**default_metadata, **item_meta}
    
            # Auto-fill timestamp if missing
            meta.setdefault("timestamp", now_iso)
            # Auto-fill token_length if missing (won't overwrite if provided)
            meta.setdefault("token_length", simple_token_count(t))
            # Always store the raw text (if you want to make it optional, remove this line)
            meta.setdefault("text", t)
    
            self._metadata.append(meta)
            
    def clear(self):
        """
        Clear all vectors and metadata from the store.
        This reinitializes the FAISS index and empties metadata.
        """
        # Recreate a fresh FAISS index with the same dimension
        self.index = faiss.IndexFlatIP(self.dim)  # cosine sim (normalized IP)
        
        # Reset vectors and metadata
        self._vectors = []
        self._metadata = []         

    def search(self, query_embedding: List[float], k: int = 5) -> List[Tuple[Dict, float]]:
        """
        Search the index by embedding. Returns top-k as [(metadata, similarity), ...].
        Similarity is cosine similarity (since we normalized and use IP index).
        """
        q = self._to_float32([query_embedding])
        faiss.normalize_L2(q)  # normalize query for cosine/IP
        D, I = self.index.search(q, k)
        hits = []
        for score, idx in zip(D[0], I[0]):
            if idx == -1:  # no result
                continue
            hits.append((self._metadata[idx], float(score)))  # score ∈ [-1, 1]
        return hits

    # ---------- Persistence ----------
    def save(self, index_path: str, metadata_path: str):
        """Save FAISS index and metadata (JSONL if .jsonl else pickle)."""
        faiss.write_index(self.index, index_path)

        # Save metadata (choose JSONL by default for readability)
        if metadata_path.endswith(".jsonl"):
            with open(metadata_path, "w", encoding="utf-8") as f:
                for row in self._metadata:
                    f.write(json.dumps(row, ensure_ascii=False) + "\n")
        else:
            with open(metadata_path, "wb") as f:
                pickle.dump(self._metadata, f)

    @classmethod
    def load(cls, index_path: str, metadata_path: str):
        """Load FAISS index and metadata; returns an initialized store."""
        index = faiss.read_index(index_path)

        # Load metadata
        if metadata_path.endswith(".jsonl"):
            metadata = []
            with open(metadata_path, "r", encoding="utf-8") as f:
                for line in f:
                    metadata.append(json.loads(line))
        else:
            with open(metadata_path, "rb") as f:
                metadata = pickle.load(f)

        dim = index.d  # read dimension from index
        store = cls(dim)
        store.index = index
        store._metadata = metadata
        return store

# BM25 for keyword search

In [25]:
from rank_bm25 import BM25Okapi
import regex as re

In [26]:
class BM25Index:
    """
    Persistent BM25 index with UNIQUE docs keyed by doc_id.
    - doc_id := doc['id'] if present, else doc['uploaded_filename']
    - add_or_replace(): replaces existing doc with same doc_id
    - load(): dedupes by doc_id (last one wins)
    """
    def __init__(self, by_id=None):
        self.by_id = by_id or {}  # doc_id -> meta (includes 'text')
        self._bm25 = None
        self._ids = []     # order aligned with _tokens
        self._tokens = []  # tokenized texts in same order
        self._docs = {} 

    # ---------- persistence ----------
    @classmethod
    def load(cls, meta_path: str):
        by_id = {}
        if os.path.exists(meta_path):
            with open(meta_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    d = json.loads(line)
                    doc_id = d.get("id") or d.get("uploaded_filename")
                    if not doc_id:
                        # skip malformed rows
                        continue
                    # normalize stored id & text
                    d["id"] = doc_id
                    d["text"] = (d.get("text") or "").strip()
                    # last one wins for same doc_id
                    by_id[doc_id] = d
        return cls(by_id)

    def save(self, meta_path: str):
        with open(meta_path, "w", encoding="utf-8") as f:
            for doc_id, d in self.by_id.items():
                f.write(json.dumps(d, ensure_ascii=False) + "\n")

    def clear(self):
        self.by_id.clear()
        self._bm25 = None
        self._ids = []
        self._tokens = []

    # ---------- core ops ----------
    def _rebuild(self):
        # build arrays in a stable order
        self._ids = list(self.by_id.keys())
        self._tokens = [_tokenize(self.by_id[i].get("text", "")) for i in self._ids]
        self._bm25 = BM25Okapi(self._tokens)

    def add_or_replace(self, doc_meta: dict):
        doc_id = doc_meta.get("id") or doc_meta.get("uploaded_filename")
        if not doc_id:
            raise ValueError("doc_meta must include 'id' or 'uploaded_filename'")
        # normalize
        doc_meta = {**doc_meta, "id": doc_id, "text": (doc_meta.get("text") or "").strip()}
        self.by_id[doc_id] = doc_meta
        self._bm25 = None  # mark dirty (lazy rebuild)

    def search(self, query: str, k: int = 5):
        if not self.by_id:
            return []
        if self._bm25 is None:
            self._rebuild()
        q_tokens = _tokenize(query or "")
        scores = self._bm25.get_scores(q_tokens)
        order = np.argsort(scores)[::-1]
        hits = []
        for i in order:
            doc_id = self._ids[i]
            meta = self.by_id[doc_id]
            hits.append((meta, float(scores[i])))
            if len(hits) >= k:
                break
        return hits

    def __len__(self):
        return len(self._docs)

    def count(self):
        return len(self._docs)

def _tokenize(txt: str):
    return re.findall(r"[A-Za-z0-9\-]+", (txt or "").lower())

# Creating Hybrid index with RAG and BM25

In [27]:
def embed_texts_with_gemini(client, texts: List[str], embedding_modelname:str ="text-embedding-004") -> List[List[float]]:
    emb_list = []
    for t in texts:
        emb = client.models.embed_content(
            model=embedding_modelname,
            contents=[{"role": "user", "parts": [{"text": t}]}]
        ).embeddings[0].values
        emb_list.append(emb)
    return emb_list

def embed_query_with_gemini(client, query: str, embedding_modelname:str ="text-embedding-004") -> List[float]:
    resp = client.models.embed_content(
        model=embedding_modelname,
        contents=[{"role": "user", "parts": [{"text": query}]}]
    )
    embed_query = resp.embeddings[0].values
    
    return embed_query, resp

In [28]:
# Format the datetime object into a string
formatted_today = datetime.utcnow().strftime("%Y-%m-%d")
system_text       =  f"You are an expert HR assistant, make a summarization of the uploaded CV. Note that today is {formatted_today}."
prompt_text       = """
Analyze the uploaded CV and provide the following:

1. Identify the individual (full name).
2. Summarize education and work experience in **10–15 concise bullet points**, including years of experience for each role.
3. Provide a breakdown of total experience (in years) aggregated by **position title** across the job history.
4. Extract the list of skills and output them as a valid Python list (e.g., ["skill1", "skill2", "skill3"]).
"""

max_output_tokens = 4096*2
temperature       = 0.1
top_p             = 0.9 # At each step, the model sorts possible next tokens by probability, then keeps only the smallest set whose cumulative probability ≥ top_p.
top_k             = 40  # At each step, the model only considers the top_k most likely tokens.

In [29]:
uploaded_filenames_list = [
    "ExampleCV/NLP-CV-NachaiLim.pdf",
    "ExampleCV/CV-Oranid.pdf",
    "ExampleCV/Natthaporn_CV2022.pdf",
]
applied_position = "senior data scientist"
embedding_modelname = "text-embedding-004"
vector_dbname = "vector_and_bm25_dbs/vector_index.faiss"
vector_dbmeta = "vector_and_bm25_dbs/vector_metadata.jsonl"
bm25_dbmeta   = "vector_and_bm25_dbs/bm25_metadata.jsonl"  

In [30]:
# ========= Load / init FAISS (and clear) =========
try:
    loaded_store = FaissVectorStore.load(vector_dbname, vector_dbmeta)
    loaded_store.clear()
    loaded_store.save(vector_dbname, vector_dbmeta)  # persist cleared index
except Exception:
    print(f"Fail to load '{vector_dbname}' & '{vector_dbmeta}', creating new vector DB instead")
    loaded_store = FaissVectorStore(dim=768)
    loaded_store.save(vector_dbname, vector_dbmeta)  # <-- persist empty index here

# ========= Load / init BM25 (and clear like FAISS) =========
try:
    bm25_index = BM25Index.load(bm25_dbmeta)
    bm25_index.clear()
    bm25_index.save(bm25_dbmeta)
except Exception:
    bm25_index = BM25Index()
    bm25_index.save(bm25_dbmeta)
# ==================================
# Ingest loop (load -> add -> save)
# ==================================
for idx, uploaded_filename in enumerate(uploaded_filenames_list):
    print(f"\n==== Start Processing {idx}st CV from {len(uploaded_filenames_list)} CVs: {uploaded_filename} ====")
    try:
        print(f">>>>>  Start Summarizing: {uploaded_filename}")
        # Step 1: Raw CV to summarized CV
        resp_cv, logs_df = master_gemini_upload_cv_prompt_w_log(
            uploaded_filename, prompt_text, logs_df,
            system_text       = system_text,
            max_output_tokens = max_output_tokens,
            temperature       = temperature,
            top_p             = top_p, 
            top_k             = top_k, 
        )

        # Step 2: Embedding the summarized CV
        print(f">>>>>  Start Embedding: {uploaded_filename}")
        doc_text = (resp_cv.text or "").strip()
        docs = [doc_text]
        per_item_meta = [{"id": idx, "uploaded_filename": uploaded_filename, "applied_position": applied_position},]
        defaults = {"embedding_model": embedding_modelname, "project": "cv-summarization"}
        doc_embs = embed_texts_with_gemini(client, docs, embedding_modelname = embedding_modelname)

        # Step 3: Saving the embedding
        print(f">>>>>  Start Saving to FAISS db + Meta data: {uploaded_filename}")
        loaded_store = FaissVectorStore.load(vector_dbname, vector_dbmeta)
        loaded_store.add_texts_and_metadata(
            texts=docs,
            embeddings=doc_embs,
            metadata_list=per_item_meta,
            default_metadata=defaults,
        )
        loaded_store.save(vector_dbname, vector_dbmeta)

        # Step 4: Save to BM25 (load -> add -> save, same style)
        print(f">>>>>  Start Adding BM25 index: {uploaded_filename}")
        # reload current BM25 from disk to mirror FAISS pattern (optional but "same way")
        bm25_index = BM25Index.load(bm25_dbmeta)
        bm25_meta = {**defaults, **per_item_meta[0], "text": doc_text}
        bm25_index.add_or_replace(bm25_meta)
        bm25_index.save(bm25_dbmeta)
        print(f"Added {uploaded_filename} to BM25 (total {len(bm25_index)} docs).")

    except Exception as e:
        print(f"Failed to upload and/or extract CV: {e}")
print(f"\n==== Completed Processing {len(uploaded_filenames_list)} CVs ====")
logs_df.head(3)


==== Start Processing 0st CV from 3 CVs: ExampleCV/NLP-CV-NachaiLim.pdf ====
>>>>>  Start Summarizing: ExampleCV/NLP-CV-NachaiLim.pdf
resp.text:
 Here's a summarization of the uploaded CV:

---

**1. Individual's Full Name:**
Nachai Limsettho

---

**2. Education and Work Experience Summary:**

*   **Senior Data Scientist at TipTip Network PTE. LTD. (Singapore) (2022-Present, 3 years):** Led the design and implementation of Digital Content and Creator recommendation systems on AWS.
*   **Senior Data Scientist at TipTip Network PTE. LTD. (Singapore) (2022-Present, 3 years):** Engineered and implemented automated eKYC, video, and text moderation systems, significantly reducing API costs.
*   **Data Scientist at OVO (PT Visionet Internasional) (Singapore) (2019-2022, 3 years):** Led the development of multiple customer score models and a sentiment analysis model using NLP and topic modeling.
*   **Data Scientist at OVO (PT Visionet Internasional) (Singapore) (2019-2022, 3 years):** Devel

Unnamed: 0,timestamp,query,uploaded_file,response_text,finish_reason,cached_content_token_count,candidates_token_count,prompt_token_count,thoughts_token_count,total_token_count
0,2025-09-01 09:33:29.233297+00:00,\nAnalyze the uploaded CV and provide the foll...,ExampleCV/Natthaporn_CV2022.pdf,Here's a summarization of the provided CV:\n\n...,STOP,,670,1172,1888,3730
1,2025-09-01 09:33:38.423997+00:00,\nSenior Data Scientist\n\nข้อมูลพื้นฐาน 5-10 ...,ExampleCV/Natthaporn_CV2022.pdf,Here's a summarization of the Senior Data Scie...,STOP,,437,489,1410,2336
2,2025-09-01 09:33:51.643159+00:00,\n# job description (JD):\nHere's a summarizat...,ExampleCV/Natthaporn_CV2022.pdf,### JD Mandatory Requirements [✓: Pass | ✗: Mi...,STOP,,766,1356,1881,4003


In [31]:
# ========= Example query after loop =========
query = "Study in Australia"
q_emb, _ = embed_query_with_gemini(client, query, embedding_modelname=embedding_modelname)

vec_hits = loaded_store.search(q_emb, k=3)
bm25_hits = bm25_index.search(query, k=3)

def show(hits, title):
    print(f"\n=== {title} ===")
    for meta, score in hits:
        print(f"[score={score:.4f}] {meta['uploaded_filename']}")

show(vec_hits, "Vector Search")
show(bm25_hits, "BM25 Search")


=== Vector Search ===
[score=0.4102] ExampleCV/CV-Oranid.pdf
[score=0.3402] ExampleCV/Natthaporn_CV2022.pdf
[score=0.3237] ExampleCV/NLP-CV-NachaiLim.pdf

=== BM25 Search ===
[score=0.0925] ExampleCV/Natthaporn_CV2022.pdf
[score=0.0823] ExampleCV/CV-Oranid.pdf
[score=0.0783] ExampleCV/NLP-CV-NachaiLim.pdf


# Creating context prompt with RAG & BM25 from query 

In [32]:
def _doc_id(m: dict) -> str:
    return m.get("id") or m.get("uploaded_filename") or str(id(m))

def _safe(txt: str) -> str:
    return (txt or "").strip()

def _truncate_chars(txt: str, max_chars: int) -> str:
    t = _safe(txt)
    return t if len(t) <= max_chars else (t[:max_chars].rstrip() + "…")
    
def _rrf_fuse(
    vec_hits: List[Tuple[Dict[str, Any], float]],
    bm25_hits: List[Tuple[Dict[str, Any], float]],
    k_final: int = 5,
    rrf_k: int = 60,
    w_vec: float = 1.0,
    w_bm: float = 1.0,
) -> List[Dict[str, Any]]:
    """
    Fuse two ranked result lists (vector search and BM25) with Weighted Reciprocal Rank Fusion (RRF).

    Args:
        vec_hits: A list of (metadata, score) pairs from the vector/embedding search.
                  `metadata` must contain a stable identifier consumable by `_doc_id(meta)`.
        bm25_hits: A list of (metadata, score) pairs from the BM25 keyword search.
        k_final: Number of fused items to return.
        rrf_k: RRF damping constant. Larger values reduce the influence of rank differences.
               Classic literature often uses 60. Must be > 0.
        w_vec: Weight for the vector-search contribution to the fused score.
        w_bm: Weight for the BM25 contribution to the fused score.

    Returns:
        A list of metadata dicts (length ≤ `k_final`) ranked by fused score (descending).
        Each returned dict is a shallow copy of the original metadata and includes:
            - "_score_vec": the original vector score for that item (0 if absent)
            - "_score_bm25": the original BM25 score for that item (0 if absent)
            - "_score_fused": the final fused score (higher = better)

    Notes:
        - The fusion is rank-based, robust to incomparable raw score scales.
        - Items present in only one list are still included (the other rank treated as ∞).
        - This function depends on an external `_doc_id(meta: dict) -> str` helper that
          returns a unique/stable doc ID (e.g., meta["id"] or meta["uploaded_filename"]).

    Example:
        fused = _rrf_fuse(vec_hits, bm25_hits, k_final=5, rrf_k=60, w_vec=1.0, w_bm=0.7)
    """
    # Build rank maps: lower rank number = better (1 is best).
    # We sort each hits list by score descending to assign ranks.
    vec_ranks = {
        _doc_id(m): r
        for r, (m, _) in enumerate(sorted(vec_hits, key=lambda x: x[1], reverse=True), 1)
    }
    bm_ranks = {
        _doc_id(m): r
        for r, (m, _) in enumerate(sorted(bm25_hits, key=lambda x: x[1], reverse=True), 1)
    }

    # Remember original scores so we can attach them to outputs later.
    vec_scores = { _doc_id(m): s for m, s in vec_hits }
    bm_scores  = { _doc_id(m): s for m, s in bm25_hits }

    # Union of all doc IDs across both runs.
    all_ids = set(vec_ranks.keys()) | set(bm_ranks.keys())

    fused: List[Tuple[str, float]] = []
    for did in all_ids:
        # If a doc is missing in one list, give it an effectively infinite rank.
        r_vec = vec_ranks.get(did, 10**9)
        r_bm  = bm_ranks.get(did, 10**9)

        # Weighted Reciprocal Rank Fusion:
        #   fused = w_vec * 1/(rrf_k + rank_vec) + w_bm * 1/(rrf_k + rank_bm)
        f = (w_vec * (1.0 / (rrf_k + r_vec))) + (w_bm * (1.0 / (rrf_k + r_bm)))
        fused.append((did, f))

    # Sort by fused score (descending).
    fused.sort(key=lambda x: x[1], reverse=True)

    # Build a lookup from ID -> original metadata (first one encountered wins;
    # both lists should contain identical metas for the same ID in sane pipelines).
    by_id_meta: Dict[str, Dict[str, Any]] = {}
    for m, _ in vec_hits + bm25_hits:
        did = _doc_id(m)
        if did not in by_id_meta:
            by_id_meta[did] = m

    # Materialize the top-k results with attached score diagnostics.
    out: List[Dict[str, Any]] = []
    for did, f in fused[:k_final]:
        m = dict(by_id_meta[did])  # shallow copy to avoid mutating upstream metadata
        m["_score_vec"] = float(vec_scores.get(did, 0.0))
        m["_score_bm25"] = float(bm_scores.get(did, 0.0))
        m["_score_fused"] = float(f)
        out.append(m)

    return out


def _build_context_block(ranked_ctx: List[dict], max_chars_per_ctx=900) -> str:
    """
    Build a compact context with numeric citations [1], [2], ...
    """
    lines = []
    for i, m in enumerate(ranked_ctx, 1):
        uf = m.get("uploaded_filename", "doc")
        preview = _truncate_chars(m.get("text", ""), max_chars_per_ctx)
        lines.append(
            f"[{i}] file: {uf} | vec={m.get('_score_vec',0):.3f} | bm25={m.get('_score_bm25',0):.3f} | fused={m.get('_score_fused',0):.3f}\n{preview}"
        )
    return "\n\n".join(lines)

# --------- Main: RAG + BM25 + Gemini ---------
def master_gemini_rag_bm25_answer_w_log(
    client,
    query: str,
    vector_store,         # your FaissVectorStore handle
    bm25_index,           # your BM25Index handle (with .search)
    logs_df,
    # retrieval knobs
    k_vec=4, k_bm25=6, k_final=5,
    rrf_k=60, w_vec=1.0, w_bm=1.0,
    max_chars_per_ctx=4096,
    # model knobs (same style as your JD helper)
    system_text = """
    You are an expert HR assistant.
    Use only the provided CONTEXT to answer. If not found, say so.
    Cite sources with [n] where n matches the context block number.
    Respond concisely for recruiters.
    """,
    model_name="gemini-2.5-flash",
    max_output_tokens=4096*3,
    temperature=0.1,
    top_p=0.9,
    top_k=40,
    verbose = 0
):
    # 1) Vector + BM25
    q_emb, _ = embed_query_with_gemini(client, query, embedding_modelname="text-embedding-004")
    vec_hits  = vector_store.search(q_emb, k=k_vec)       # [(meta, sim)]
    bm_hits   = bm25_index.search(query, k=k_bm25)        # [(meta, score)]

    # 2) Fuse
    fused_ctx = _rrf_fuse(vec_hits, bm_hits, k_final=k_final, rrf_k=rrf_k, w_vec=w_vec, w_bm=w_bm)
    context_block = _build_context_block(fused_ctx, max_chars_per_ctx=max_chars_per_ctx)

    # 3) Build final prompt
    user_prompt = f"""QUESTION:
    {query}
    
    CONTEXT (numbered; cite as [n]):
    {context_block}
    
    INSTRUCTIONS:
    - Answer the QUESTION based only on the CONTEXT.
    - If multiple candidates match, list top matches with 1-2 line justifications.
    - Use [n] citations after each claim referencing a specific context block.
    - If the answer is not present, say "Not found in provided documents.
    """

    # 4) Call Gemini
    resp = client.models.generate_content(
        model=model_name,
        contents=[{"role": "user", "parts": [{"text": user_prompt}]}],
        config=types.GenerateContentConfig(
            system_instruction=[system_text],
            max_output_tokens=max_output_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        ),
    )

    # 5) Optional: pretty print + logging
    if verbose > 0:
        try:
            print("LLM answer:\n", resp.text)
        except Exception:
            print("LLM answer:\n <no text>")

    # Append a new log
    logs_df = append_usage_log(
        logs_df,
        query_text=query,
        uploaded_file=None,
        resp=resp
    )
    return resp, logs_df, fused_ctx


In [33]:
query = "Which candidates have work experience in Singapore?"
resp, logs_df, used_ctx = master_gemini_rag_bm25_answer_w_log(
    client=client,
    query=query,
    vector_store=loaded_store,
    bm25_index=bm25_index,
    logs_df=logs_df,
    k_vec=3, k_bm25=3, k_final=2, 
    w_vec=1.0, w_bm=1.0,
    verbose = 1
)
print(used_ctx[0]['uploaded_filename'])

LLM answer:
 The following candidates have work experience in Singapore:

*   **Natthaporn Takpho** has worked as an Assistant Manager and Researcher at Mitsui Chemicals Singapore R&D Centre [1].
*   **Nachai Limsettho** has worked as a Senior Data Scientist at TipTip Network PTE. LTD. (Singapore), a Data Scientist at OVO (PT Visionet Internasional) (Singapore), and a Senior Executive, Data Analytics at Allianz SE (Singapore) [2].
ExampleCV/Natthaporn_CV2022.pdf


In [34]:
query = "Which candidates study in Australia?"
resp, logs_df, used_ctx = master_gemini_rag_bm25_answer_w_log(
    client=client,
    query=query,
    vector_store=loaded_store,
    bm25_index=bm25_index,
    logs_df=logs_df,
    k_vec=3, k_bm25=3, k_final=2, 
    w_vec=1.0, w_bm=1.0,
    verbose = 1
)
print(used_ctx[0]['uploaded_filename'])

LLM answer:
 Oranid Yenradee studies in Australia, pursuing a Master of Commerce (Extension) in Data Analytics at the University of Sydney [1].
ExampleCV/CV-Oranid.pdf


# Simple chat GUI with Gradio

In [35]:
import gradio as gr

In [36]:
def chat_handler(user_msg, history, k_vec, k_bm25, k_final, rrf_k, w_vec, w_bm, temperature, max_tokens):
    global logs_df
    history = history or []  # history is a list of {"role":..., "content":...}

    # Call your RAG+BM25 function
    resp, logs_df, used_ctx = master_gemini_rag_bm25_answer_w_log(
        client=client,
        query=user_msg,
        vector_store=loaded_store,
        bm25_index=bm25_index,
        logs_df=logs_df,
        k_vec=int(k_vec),
        k_bm25=int(k_bm25),
        k_final=int(k_final),
        rrf_k=int(rrf_k),
        w_vec=float(w_vec),
        w_bm=float(w_bm),
        max_output_tokens=int(max_tokens),
        temperature=float(temperature),
    )

    answer = getattr(resp, "text", "") or "(no text)"

    # Build sources block
    src_lines = []
    for i, m in enumerate(used_ctx or [], 1):
        src_lines.append(
            f"[{i}] {m.get('uploaded_filename','doc')} | "
            f"vec={m.get('_score_vec',0):.3f} bm25={m.get('_score_bm25',0):.3f} fused={m.get('_score_fused',0):.3f}"
        )
    sources_block = "**Sources**\n" + ("\n".join(src_lines) if src_lines else "No sources.")
    assistant_msg = answer.strip() + "\n\n" + sources_block

    # 🚨 IMPORTANT: append as dicts with role/content
    history.append({"role": "user",      "content": user_msg})
    history.append({"role": "assistant", "content": assistant_msg})

    return history, "\n".join(src_lines)

# ---- Build the simple UI ----
with gr.Blocks() as demo:
    gr.Markdown("### 🔎 RAG + BM25 Chat (Notebook)")

    with gr.Row():
        with gr.Column(scale=4):
            chat = gr.Chatbot(label="Conversation", type='messages', height=480, show_copy_button=True)
            user_box = gr.Textbox(label="Your question", placeholder="e.g., Which candidates worked in Singapore?")
            send_btn = gr.Button("Send", variant="primary")
        with gr.Column(scale=3):
            with gr.Accordion("Settings", open=False):
                k_vec   = gr.Slider(1, 10, value=4, step=1, label="Top-k (Vector)")
                k_bm25  = gr.Slider(1, 10, value=6, step=1, label="Top-k (BM25)")
                k_final = gr.Slider(1, 10, value=5, step=1, label="Top-k (Fused)")
                rrf_k   = gr.Slider(1, 200, value=60, step=1, label="RRF k (damping)")
                w_vec   = gr.Slider(0.0, 2.0, value=1.0, step=0.1, label="Weight: Vector")
                w_bm    = gr.Slider(0.0, 2.0, value=1.0, step=0.1, label="Weight: BM25")
                temperature = gr.Slider(0.0, 1.5, value=0.1, step=0.05, label="Temperature")
                max_tokens  = gr.Slider(1024, 4096*3, value=1024, step=64, label="Max output tokens")
            sources = gr.Textbox(label="Sources (debug)", lines=8, show_copy_button=True)

    # Wire up Enter key and button
    user_box.submit(
        fn=chat_handler,
        inputs=[user_box, chat, k_vec, k_bm25, k_final, rrf_k, w_vec, w_bm, temperature, max_tokens],
        outputs=[chat, sources],
    ).then(fn=lambda: "", inputs=None, outputs=user_box)

    send_btn.click(
        fn=chat_handler,
        inputs=[user_box, chat, k_vec, k_bm25, k_final, rrf_k, w_vec, w_bm, temperature, max_tokens],
        outputs=[chat, sources],
    ).then(fn=lambda: "", inputs=None, outputs=user_box)

In [37]:
demo.launch(inline=False, share=False, show_error=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




## Example question 
* Who are a good candidates for developing dashboard, rank and score (0-1) too?

In [39]:
demo.close()

Closing server running on port: 7860
