# RAG-based ANA QNA

In [1]:
!pip install --upgrade --quiet  gpt4all



In [2]:
!pip install --upgrade langchain pydantic





In [3]:
!pip install -q -U google-generativeai



In [63]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.40.1-py2.py3-none-any.whl (8.6 MB)
     ---------------------------------------- 8.6/8.6 MB 6.8 MB/s eta 0:00:00
Collecting pydeck<1,>=0.8.0b4
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
     ---------------------------------------- 6.9/6.9 MB 6.5 MB/s eta 0:00:00
Collecting altair<6,>=4.0
  Downloading altair-5.4.1-py3-none-any.whl (658 kB)
     ------------------------------------- 658.1/658.1 kB 10.3 MB/s eta 0:00:00
Collecting blinker<2,>=1.0.0
  Downloading blinker-1.9.0-py3-none-any.whl (8.5 kB)
Collecting rich<14,>=10.14.0
  Downloading rich-13.9.4-py3-none-any.whl (242 kB)
     -------------------------------------- 242.4/242.4 kB 7.5 MB/s eta 0:00:00
Collecting pyarrow>=7.0
  Downloading pyarrow-18.0.0-cp310-cp310-win_amd64.whl (25.1 MB)
     ---------------------------------------- 25.1/25.1 MB 5.8 MB/s eta 0:00:00
Collecting narwhals>=1.5.2
  Downloading narwhals-1.13.4-py3-none-any.whl (206 kB)
     ---------------



In [62]:
from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
import os
import shutil
from langchain.evaluation import load_evaluator
from pydantic import BaseModel, GetJsonSchemaHandler
from langchain.prompts import ChatPromptTemplate
import google.generativeai as genai
from langchain.chains import ConversationChain
import streamlit as st

ModuleNotFoundError: No module named 'streamlit'

In [24]:
# from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chains.conversation.memory import ConversationSummaryMemory
# from langchain.chains.conversation.memory import ConversationBufferWindowMemory
# from langchain.chains.conversation.memory import ConversationSummaryBufferMemory

In [5]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="langchain")

### create chunks from documents

In [6]:
DATA_PATH = "data_sources"

In [7]:
def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf")
    documents = loader.load()
    return documents

In [8]:
documents = load_documents()

In [9]:
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    
    chunks = text_splitter.split_documents(documents)
    
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    
    document = chunks[3]
    print(document.page_content)
    print(document.metadata)
    
    return chunks

In [10]:
doc_chunks = split_text(documents)

Split 1 documents into 19 chunks.
WORK EXPERIENCE

DATA SCIENCE CONSULTANT for Silver Linings (Jewelry E-commerce)

2024, Jan-Mar

Worked closely with stakeholders to assess business use cases and provide data-driven recommendations that identified new revenue opportunities.
{'source': 'data_sources\\Data Scientist - Ana Vasquez.pdf', 'start_index': 556}


In [11]:
doc_chunks

[Document(metadata={'source': 'data_sources\\Data Scientist - Ana Vasquez.pdf', 'start_index': 0}, page_content='ANA VASQUEZ avasquez.msds2024@aim.edu | linkedin.com/in/ana-p-vasquez\n\ngithub.com/helloanavee | +63 919 001 5061'),
 Document(metadata={'source': 'data_sources\\Data Scientist - Ana Vasquez.pdf', 'start_index': 114}, page_content='With over 5 years of experience as a data scientist, analyst, and software engineer, I specialize in partnering with stakeholders to define and deliver data-driven solutions that drive cost savings and revenue growth. Skilled in both client management and technical tools such as Python and SQL, I'),
 Document(metadata={'source': 'data_sources\\Data Scientist - Ana Vasquez.pdf', 'start_index': 312}, page_content='and revenue growth. Skilled in both client management and technical tools such as Python and SQL, I excel at translating business needs into actionable insights. I am eager to contribute my expertise to support company success and innovat

### save chunks to vector database

In [12]:
CHROMA_PATH = "chroma"

model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
gpt4all_embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs
)
    
def save_to_chroma(chunks: list[Document]):
    # clear previous db
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # create db
    db = Chroma.from_documents(
        doc_chunks, gpt4all_embeddings, persist_directory=CHROMA_PATH
    )
    
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [13]:
save_to_chroma(doc_chunks)

Saved 19 chunks to chroma.


### query the closest texts from the db

In [14]:
# use the same embedding function
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
gpt4all_kwargs = {'allow_download': 'True'}
gpt4all_embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    gpt4all_kwargs=gpt4all_kwargs
)

# prepare the db
db = Chroma(persist_directory=CHROMA_PATH,
            embedding_function=gpt4all_embeddings)

  db = Chroma(persist_directory=CHROMA_PATH,


In [15]:
query_text = "should i hire her as a data scientist?"

In [16]:
results = db.similarity_search_with_relevance_scores(query_text, k=3)

In [17]:
results

[(Document(metadata={'source': 'data_sources\\Data Scientist - Ana Vasquez.pdf', 'start_index': 2441}, page_content='Collaborated with clients to create end-to-end data projects with SQL and Looker Studio to analyze their data, bringing data-driven insights into the industries of marketing, HR, real estate, health, and finance.\n\nDATA ANALYST for The Freelance Movement Tribe (E-Learning Platform)\n\n2021 – 2022'),
  0.18785726628411736),
 (Document(metadata={'source': 'data_sources\\Data Scientist - Ana Vasquez.pdf', 'start_index': 556}, page_content='WORK EXPERIENCE\n\nDATA SCIENCE CONSULTANT for Silver Linings (Jewelry E-commerce)\n\n2024, Jan-Mar\n\nWorked closely with stakeholders to assess business use cases and provide data-driven recommendations that identified new revenue opportunities.'),
  0.16815025131821992),
 (Document(metadata={'source': 'data_sources\\Data Scientist - Ana Vasquez.pdf', 'start_index': 1096}, page_content='DATA SCIENCE CONSULTANT for Confidential Construc

### create the prompt with my custom data

In [64]:
PROMPT_TEMPLATE = """
Given the following context and a question, generate an answer based on this context only.
If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.

CONTEXT: {context}

QUESTION: {question}
"""

In [65]:
# PROMPT_TEMPLATE = """
# Answer the question based only on the following context:

# {context}

# ---

# Answer the question based on the above context: {question}
# """

In [19]:
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)

In [20]:
print(prompt)

Human: 
Answer the question based only on the following context:

Collaborated with clients to create end-to-end data projects with SQL and Looker Studio to analyze their data, bringing data-driven insights into the industries of marketing, HR, real estate, health, and finance.

DATA ANALYST for The Freelance Movement Tribe (E-Learning Platform)

2021 – 2022

---

WORK EXPERIENCE

DATA SCIENCE CONSULTANT for Silver Linings (Jewelry E-commerce)

2024, Jan-Mar

Worked closely with stakeholders to assess business use cases and provide data-driven recommendations that identified new revenue opportunities.

---

DATA SCIENCE CONSULTANT for Confidential Construction Company

2024, Apr-Sept

Partnered with cross-functional stakeholders to define business problems and develop data-driven solutions for production, operations, and logistics use cases.

---

Answer the question based on the above context: should i hire her as a data scientist?



### prompt the LLM

In [21]:
with open('GEMINI_KEY.txt', 'r') as file:
    exec(file.read().strip())

In [22]:
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content(prompt)
print(response.text)

Based on the provided context, it's difficult to definitively say whether you should hire her as a data scientist. Here's why:

**Pros:**

* **Experience in various industries:** She has worked with data in a variety of fields, showing adaptability and potentially a broader understanding of business needs.
* **Data analysis skills:** She demonstrates proficiency in SQL and Looker Studio, indicating solid data analysis skills.
* **Collaborative approach:** She highlights working closely with clients and stakeholders, implying good communication and teamwork abilities.
* **Data-driven problem solving:**  Her experience with identifying business use cases and providing data-driven recommendations suggests a focus on practical application of data.

**Cons:**

* **Limited experience in data science:** While she has worked as a "data science consultant", the provided context doesn't clearly highlight specific data science skills like machine learning, statistical modeling, or advanced data v

In [28]:
!pip install --upgrade --quiet  langchain-google-genai



In [29]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [30]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=API_KEY)

In [43]:
result = llm.invoke(prompt)

In [44]:
print(result.content)

Based on the provided context, it's difficult to definitively say whether you should hire this person as a data scientist. Here's why:

**Pros:**

* **Experience with SQL and Looker Studio:**  These are valuable tools for data analysis.
* **Diverse Industry Experience:** Working in marketing, HR, real estate, health, and finance demonstrates adaptability and a broad understanding of different data landscapes.
* **Collaboration and Communication Skills:**  The context highlights working with clients and stakeholders, suggesting strong communication and collaboration abilities.
* **Data-Driven Recommendations and Solutions:** The descriptions emphasize the ability to provide insights and solutions based on data analysis.

**Cons:**

* **Limited Time in Data Science Roles:** The provided information suggests the candidate has primarily worked in data analysis roles, not specifically data science. The data science consultant positions were short-term (3 months and 6 months).
* **Lack of Sp

In [39]:
conversation_with_summary = ConversationChain(
    llm=llm,
    memory=ConversationSummaryMemory(llm=llm),
    verbose=True
)

  conversation_with_summary = ConversationChain(


In [45]:
response_with_summary = conversation_with_summary.invoke(response)



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
The human asks if they should hire a person as a data scientist based on their provided work experience. The AI states that it's difficult to make a decision based on the provided information because it lacks details about their specific data science skills, qualifications, and accomplishments. The AI suggests gathering more information about their skills, projects, education, and references before making a hiring decision. 

Human: response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
            

ValidationError: 3 validation errors for HumanMessage
content.str
  Input should be a valid string [type=string_type, input_value=response:
GenerateContent...": 593
      }
    }),
), input_type=GenerateContentResponse]
    For further information visit https://errors.pydantic.dev/2.9/v/string_type
content.list[union[str,dict[any,any]]].0.str
  Input should be a valid string [type=string_type, input_value=response:
GenerateContent...": 593
      }
    }),
), input_type=GenerateContentResponse]
    For further information visit https://errors.pydantic.dev/2.9/v/string_type
content.list[union[str,dict[any,any]]].0.dict[any,any]
  Input should be a valid dictionary [type=dict_type, input_value=response:
GenerateContent...": 593
      }
    }),
), input_type=GenerateContentResponse]
    For further information visit https://errors.pydantic.dev/2.9/v/dict_type

### prompt with history

In [48]:
# from langchain_openai import ChatOpenAI
# llm = ChatOpenAI(openai_api_key=openai_api_key)

# convert the output of the chatmodel into pure text
from langchain_core.output_parsers import StrOutputParser
output_parser = StrOutputParser

In [51]:
# retriever takes the question then compares with numeric vectors in the db and return the similar text
retriever = db.as_retriever()

In [55]:
!pip install langchain-core

ERROR: Could not find a version that satisfies the requirement langchain-core-prompts (from versions: none)
ERROR: No matching distribution found for langchain-core-prompts


In [57]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

instruction_to_system = """
Given a chat history and the latest user question
which might reference context in the chat history, formulate a standalone question
which can be understood without the chat history. Do NOT answer the question,
just reformulate it if needed and otherwise return it as is.
"""

question_maker_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", instruction_to_system),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

question_chain = question_maker_prompt | llm | StrOutputParser