Things to explore & try:
- VectorStore
- Chaining components together
- Memory
- Combining with Streamlit
- Agent graphs
- Calling tools

# Import & Config

In [14]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS 
from langchain.memory import ConversationBufferMemory, ChatMessageHistory
from langchain.chains import ConversationalRetrievalChain, create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.chat_history import BaseChatMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain import hub
from htmlTemplate import css, bot_template, user_template

In [2]:
# Load variables from a local .env file into the process environment.
# Presumably this supplies the API key used by the OpenAI clients below — confirm.
load_dotenv()

True

# Main

In [3]:
# Sample document for the pipeline below: a Junior Data Scientist job posting,
# kept inline as plain text. It is split, embedded and queried in later cells.
raw_text = """
About Us: We are a Global Group AI team that empowers organization with citizen data scientists and enforces governance to ensure responsible and ethical use of AI. Our team is responsible for developing advanced analytics and machine learning models, and delivering data-driven insights to business leaders across the organization.
Job Description: As a Junior Data Scientist, you will be responsible for creating and implementing complex machine learning models, performing data analysis, and driving insights from large datasets. You will work with stakeholders across the organization to understand their business needs and translate them into data-driven solutions. You will also work closely with citizen data scientists to help them leverage data and build models for their business problems.
Key Responsibilities:
• Develop and implement machine learning models to solve complex business problems, using techniques such as linear regression, logistic regression, decision trees, and other supervised and unsupervised learning models.
• Perform data analysis and create insights from large datasets.
• Work with stakeholders across the organization to understand business needs and translate
them into data-driven solutions.
• Collaborate with citizen data scientists to empower them to leverage data and build models
for their business problems.
• Enforce governance to ensure responsible and ethical use of data.
• Keep up-to-date with the latest developments in machine learning and data science.
Requirements:
• Bachelor's or Master's degree in Computer Science, Statistics, Mathematics, Actuarial Science or related fields.
• At least 1-4 years of experience in data science or related field.
• Strong knowledge of machine learning algorithms and statistical modelling techniques,
especially using Scikit-learn.
• Proficiency in Python, PySpark and SQL.
• Experience with data visualization and reporting tools (e.g. Power BI, QlikSense).
• Excellent communication and problem-solving skills.
• Experience with GenAI solutions and prompt engineering
Nice-to-haves:
• Familiarity with actuarial methods and models.
• Experience in MLOps and data pipelines eg, Bitbucket, Artifactory, Jenkins
• Familiarity with deep learning frameworks (e.g. TensorFlow, PyTorch).
• Experience with cloud-based services (e.g. AWS, Azure, GCP).
• Knowledge of big data technologies (e.g. Hadoop, Spark).
If you're interested in this position, please send your resume and cover letter to [Global Group AI team email address]. We look forward to hearing from you!

"""

## a. TextSplitter

In [4]:
def get_text_chunks(text, chunk_size=800, chunk_overlap=200, separator="\n"):
    """Split ``text`` into overlapping chunks ready for embedding.

    Args:
        text: Raw document text to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between neighbouring chunks.
        separator: String the text is split on before pieces are merged
            back into chunks.

    Returns:
        list[str]: The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)

# Chunk the sample document; the result is what gets embedded into the vector store.
text_chunks = get_text_chunks(raw_text)

## b. Embedding to VectorStore

In [5]:
def get_vectorestore(text_chunks, base_url="https://api.bianxie.ai/v1"):
    """Embed ``text_chunks`` and index them in a FAISS vector store.

    Args:
        text_chunks: Non-empty sequence of strings to embed.
        base_url: Endpoint of the OpenAI-compatible API used for embeddings.

    Returns:
        FAISS: Vector store built from the embedded chunks.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    if not text_chunks:
        # Fail early with a clear message rather than an opaque error from the
        # embedding request / index construction on an empty batch.
        raise ValueError("text_chunks must contain at least one chunk")
    embeddings = OpenAIEmbeddings(base_url=base_url)
    # Alternative embedding model:
    # HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

# Build the vector store from the chunks; the retriever used by the chains below is derived from it.
vectorstore = get_vectorestore(text_chunks)

## c. Define LLM & Prompt

In [6]:
# Chat model client; requests go to the OpenAI-compatible endpoint given by base_url.
# No model name is passed, so the library default is used.
llm = ChatOpenAI(base_url = "https://api.bianxie.ai/v1")
# Pull the "rlm/rag-prompt" template from the LangChain Hub. The printed output
# below shows it expects the variables `context` and `question`.
prompt = hub.pull("rlm/rag-prompt")
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


## d. Chain

In [17]:
def format_docs(docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Retriever over the vector store built above.
retriever = vectorstore.as_retriever()

# The retriever returns a list of Document objects. Piping it through format_docs
# makes the prompt's {context} slot receive only the documents' text, not the
# repr of the list (which would also include metadata).
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is this pdf about? Answer me with one sentence")

'The PDF is about a Global Group AI team that empowers organizations with citizen data scientists and enforces governance to ensure responsible and ethical use of AI.'

## e. Chain with memory

In [20]:
# System instructions for the query-rewriting step: turn the latest user message
# into a standalone question, using the chat history only for disambiguation.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do not answer the question, \
just reformulate it if needed and otherwise return it as is.
"""

# Message layout: system instructions, then prior turns, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever that first rewrites the question with the LLM and then searches.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [21]:
# System prompt for the answering step. Continuation lines start at column 0 so
# that no source indentation leaks into the prompt text sent to the model.
# {context} is the slot the stuff-documents chain fills with the retrieved text.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say you don't know. \
Use three sentences maximum and keep the answer concise.

{context}
"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Answering chain: formats the retrieved documents into qa_prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
# NOTE: this rebinds `rag_chain`. From here on it is the history-aware chain,
# invoked with a dict holding "input" and read via its "answer" key.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [22]:
# In-memory registry of chat histories, keyed by session id.
# Re-running this cell clears every recorded session.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use."""
    try:
        return store[session_id]
    except KeyError:
        history = store[session_id] = ChatMessageHistory()
        return history

# Wrap the retrieval chain so each call reads and writes the per-session history
# returned by get_session_history.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    # Key of the user message in the input dict (matches "{input}" in the prompts).
    input_messages_key="input",
    # Key under which prior turns are passed (matches MessagesPlaceholder("chat_history")).
    history_messages_key="chat_history",
    # Key of the chain's output dict that holds the reply text.
    output_messages_key="answer"
)

# First turn of session "abc123". The chain returns a dict; only its "answer"
# entry is displayed.
conversational_rag_chain.invoke(
    {"input": "What is this text about? Answer me with one sentence"},
    config = {
        "configurable": {"session_id": "abc123"}
    },
)["answer"]

'The text is a job description for a Junior Data Scientist position within a Global Group AI team that focuses on developing and implementing advanced analytics and machine learning models to deliver data-driven insights to business leaders.'

In [23]:
# Second turn with the same session_id, so the history recorded for "abc123"
# is supplied to the chain along with this question.
conversational_rag_chain.invoke(
    {"input": "What's the most important quality to get this job according to the JD?"},
    config = {
        "configurable": {"session_id": "abc123"}
    },
)["answer"]

'According to the job description, the most important quality to have for this Junior Data Scientist position is a strong knowledge of machine learning algorithms and statistical modeling techniques, especially using Scikit-learn.'

In [24]:
# Follow-up that only makes sense with the prior turns ("What else?"). It exercises
# the history-aware question rewriting for session "abc123".
conversational_rag_chain.invoke(
    {"input": "What else?"},
    config = {
        "configurable": {"session_id": "abc123"}
    },
)["answer"]

'Another important quality for this Junior Data Scientist position is proficiency in programming languages like Python, PySpark, and SQL, as well as experience with data visualization and reporting tools such as Power BI and QlikSense.'

In [27]:
# Display the in-memory session registry to inspect the messages recorded per session.
store

{'abc123': InMemoryChatMessageHistory(messages=[HumanMessage(content='What is this text about? Answer me with one sentence', additional_kwargs={}, response_metadata={}), AIMessage(content='The text is a job description for a Junior Data Scientist position within a Global Group AI team that focuses on developing and implementing advanced analytics and machine learning models to deliver data-driven insights to business leaders.', additional_kwargs={}, response_metadata={}), HumanMessage(content="What's the most important quality to get this job according to the JD?", additional_kwargs={}, response_metadata={}), AIMessage(content='According to the job description, the most important quality to have for this Junior Data Scientist position is a strong knowledge of machine learning algorithms and statistical modeling techniques, especially using Scikit-learn.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What else?', additional_kwargs={}, response_metadata={}), AIMessa