In [1]:
import pandas as pd
import numpy as np
import os
import gradio as gr
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import hub
from langchain.schema import AIMessage, HumanMessage, SystemMessage

from langchain_core.runnables import (
    ConfigurableField,
    RunnableBinding,
    RunnableLambda,
    RunnablePassthrough,
    RunnableParallel
)

# Notebook display options for pandas output
pd.set_option('display.max_colwidth', 300)  # Show up to 300 characters per cell before truncating
pd.set_option('display.max_columns', 300)  # Show up to 300 columns (a cap, not "all columns")
pd.set_option('display.max_rows', 20)  # Cap rendered rows at 20 (a cap, not "all rows")

# Load variables from a .env file (if present) into the process environment
load_dotenv()
# Indexing os.environ raises KeyError right here when OPENAI_APIKEY is not set
OPENAI_APIKEY = os.environ['OPENAI_APIKEY']

# Set up Chain

In [3]:
# Instantiate models
# Embedding model handed to the vector store below as its embedding function
# NOTE(review): presumably the same model that was used to build ../chroma_db — confirm
embeddings_model = OpenAIEmbeddings(api_key=OPENAI_APIKEY, model='text-embedding-3-large', max_retries=100, chunk_size=16, show_progress_bar=False)
# Chat model shared by the contextualizing chain and the answering chain defined later
chat_model_4 = ChatOpenAI(api_key=OPENAI_APIKEY, temperature=0.5, model='gpt-4-turbo-2024-04-09')
# Load chroma from disk
vectorstore = Chroma(persist_directory="../chroma_db", embedding_function=embeddings_model)
# Set up the vectorstore to be the retriever
num_results = 5  # number of documents requested per query (passed to the retriever as `k`)
retriever = vectorstore.as_retriever(search_kwargs={'k': num_results})

In [4]:
def format_chat_history(chat_history):
    """Flatten (human, ai) text pairs into an alternating LangChain message list.

    Args:
        chat_history: Iterable of two-item entries ``(human_text, ai_text)``.

    Returns:
        A list ``[HumanMessage, AIMessage, HumanMessage, AIMessage, ...]``
        that follows the order of the input entries.
    """
    return [
        message
        for user_text, bot_text in chat_history
        for message in (HumanMessage(content=user_text), AIMessage(content=bot_text))
    ]

In [5]:
# System prompt for the question-rewriting step. The trailing backslashes join
# the physical lines, so the model receives a single continuous paragraph.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Message layout: system instructions, then the (optional) prior turns, then
# the new user question.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history", optional=True),
        ("human", "{question}"),
    ]
)

# Inputs: {'chat_history': [...messages...], 'question': str} -> standalone question text
contextualize_chain = contextualize_q_prompt | chat_model_4 | StrOutputParser()

# Answering prompt; {context} and {contextualized_question} are filled in at invoke time.
# (Fixed a doubled word: "to to answer" -> "to answer".)
template = """You are an assistant for question-answering tasks regarding
plant-based nutrition to prevent and reverse disease and nutrition science. 
The following are snippets of transcripts from Dr Michael Greger's nutrition facts
videos that you are going to use to answer the question.

If you don't know the answer, just say that you don't know.
Ask follow-up questions if appropriate. 

Context: {context}

Question: {contextualized_question}

Answer: 

"""

answer_prompt = ChatPromptTemplate.from_template(template)

# Inputs: {'context': ..., 'contextualized_question': str} -> answer text
qa_chain = answer_prompt | chat_model_4 | StrOutputParser()




In [6]:
# Sample conversation as [human, ai] pairs — the shape format_chat_history unpacks
chat_history = [["Is juice bad for me?", "It's better to avoid juice and drink water"],
                ["Should I drink filtered water?", "Filtered water is usually saafer"],
                ["How much water should I drink a day?", "5-7 cups"]]
# Follow-up that is only understandable together with the history above
question = "But how much for children?."

In [7]:
# Convert the sample pairs into LangChain message objects and display them
langchain_chat_history = format_chat_history(chat_history)
langchain_chat_history

[HumanMessage(content='Is juice bad for me?'),
 AIMessage(content="It's better to avoid juice and drink water"),
 HumanMessage(content='Should I drink filtered water?'),
 AIMessage(content='Filtered water is usually saafer'),
 HumanMessage(content='How much water should I drink a day?'),
 AIMessage(content='5-7 cups')]

In [8]:

# Rewrite the follow-up into a standalone question using the chat history
contextualized_question = contextualize_chain.invoke({'chat_history': langchain_chat_history, 'question': question})
contextualized_question


'How much water should children drink per day?'

In [9]:
# Retrieve documents for the standalone question and list the title of each hit
context = retriever.invoke(contextualized_question)
[c.metadata['title'] for c in context]

['What Is the Safest Metabolism Booster?',
 'What Is the Safest Metabolism Booster?',
 'How Many Glasses of Water Should We Drink a Day?',
 'How Many Glasses of Water Should We Drink a Day?',
 'Podcast: Boosting Our Metabolism']

In [10]:
# Answer the standalone question from the retrieved documents
answer = qa_chain.invoke({ 'context': context, 'contextualized_question': contextualized_question})
answer

"The recommended daily adequate intake of water for children varies by age. For children ages 4 through 8, it is recommended that they drink about 7 cups of water a day. For ages 9 through 13, the recommendation is 8 cups a day for girls and 10 cups a day for boys. These recommendations may offer more than just hydration benefits, as some research suggests possible increases in metabolic rate associated with water consumption. However, it's important to note that not all research teams were able to replicate these findings consistently."

In [11]:
def format_result(answer, context):
    """Append a de-duplicated list of source-video links to an answer.

    Args:
        answer: Answer text shown first.
        context: Iterable of retrieved documents. Each must expose a
            ``metadata`` mapping with ``'title'`` and ``'videoId'`` keys.

    Returns:
        ``answer``, a blank line, then a "Relevant Videos:" list with one
        ``title: url`` line per distinct (title, videoId) pair, in the order
        the videos first appear in ``context``. If ``context`` yields no
        documents, only the answer text is returned (no empty header).

    Raises:
        KeyError: If a document's metadata lacks ``'title'`` or ``'videoId'``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # emit the links in an arbitrary order that can change from run to run and
    # would discard the order in which the documents were supplied.
    unique_videos = dict.fromkeys(
        (doc.metadata['title'], doc.metadata['videoId']) for doc in context
    )

    # Nothing retrieved: avoid a dangling "Relevant Videos:" header
    if not unique_videos:
        return f"{answer}"

    # One "title: url" line per video
    titles_string = '\n'.join(
        f"{title}: https://www.youtube.com/watch?v={video_id}"
        for title, video_id in unique_videos
    )

    return f"{answer}\n\nRelevant Videos:\n{titles_string}"

In [12]:
# Show the answer followed by the de-duplicated video links
print(format_result(answer, context))

The recommended daily adequate intake of water for children varies by age. For children ages 4 through 8, it is recommended that they drink about 7 cups of water a day. For ages 9 through 13, the recommendation is 8 cups a day for girls and 10 cups a day for boys. These recommendations may offer more than just hydration benefits, as some research suggests possible increases in metabolic rate associated with water consumption. However, it's important to note that not all research teams were able to replicate these findings consistently.

Relevant Videos:
What Is the Safest Metabolism Booster?: https://www.youtube.com/watch?v=WGlENiyN4Dw
Podcast: Boosting Our Metabolism: https://www.youtube.com/watch?v=HyuXNzLzKYM
How Many Glasses of Water Should We Drink a Day?: https://www.youtube.com/watch?v=qrzSLIBOauM


# Put the Code Together

In [61]:

class ContextRetriever:
    """Wraps a Chroma store loaded from disk and exposes it as a retriever."""

    def __init__(self, embeddings_model, persist_directory, num_results):
        """Open the persisted store and derive a retriever from it.

        Args:
            embeddings_model: Embedding function handed to Chroma.
            persist_directory: Directory the Chroma store is loaded from.
            num_results: Number of documents requested per query (``k``).
        """
        self.persist_directory = persist_directory
        self.num_results = num_results
        store = Chroma(persist_directory=persist_directory, embedding_function=embeddings_model)
        self.vectorstore = store
        self.retriever = store.as_retriever(search_kwargs={'k': num_results})

    def get_context(self, question):
        """Run the retriever on ``question``; the result is also kept on ``self.context``."""
        documents = self.retriever.invoke(question)
        self.context = documents
        return documents

class ChatHistoryFormatter:
    """Converts UI-style chat history into LangChain message objects."""

    @staticmethod
    def format_chat_history(chat_history, len_history=10):
        """Turn the most recent (human, ai) pairs into alternating messages.

        Args:
            chat_history: Sequence of two-item entries ``(human_text, ai_text)``,
                oldest first. ``None`` or an empty sequence means no history.
            len_history: Maximum number of most recent pairs to keep. Values
                of zero or less keep no history.

        Returns:
            A list ``[HumanMessage, AIMessage, ...]`` built from the last
            ``len_history`` pairs, or an empty list when there is nothing to keep.
        """
        # Guard len_history <= 0 explicitly: chat_history[-0:] is the WHOLE
        # list, so a limit of 0 would otherwise keep everything.
        if not chat_history or len_history <= 0:
            return []

        recent_turns = chat_history[-len_history:]
        return [
            message
            for human, ai in recent_turns
            for message in (HumanMessage(content=human), AIMessage(content=ai))
        ]

class QuestionContextualizer:
    """Rewrites a follow-up question so it can be understood without the chat history."""

    def __init__(self, chat_model):
        """Build the rewriting chain: prompt -> chat model -> plain string.

        Args:
            chat_model: Chat model runnable placed in the middle of the chain.
        """
        self.chat_model = chat_model
        # Adjacent string literals produce one clean paragraph. Backslash
        # continuations inside an indented triple-quoted string would keep
        # every continuation line's leading indentation in the prompt text.
        self.contextualize_q_system_prompt = (
            "Given a chat history and the latest user question "
            "which might reference context in the chat history, formulate a standalone question "
            "which can be understood without the chat history. Do NOT answer the question, "
            "just reformulate it if needed and otherwise return it as is."
        )
        self.prompt_template = ChatPromptTemplate.from_messages([
            ("system", self.contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history", optional=True),
            ("human", "{question}")
        ])
        # Use this instance's own template. Relying on a notebook-level prompt
        # variable makes the class fail with NameError on a fresh kernel and
        # leaves self.prompt_template unused.
        self.contextualize_chain = self.prompt_template | chat_model | StrOutputParser()

    def contextualize_question(self, formatted_chat_history, question):
        """Return a standalone version of ``question``.

        Args:
            formatted_chat_history: Messages passed to the prompt's
                ``chat_history`` placeholder (may be empty).
            question: Latest user question.

        Returns:
            The output of the rewriting chain. The inputs and the result are
            also kept on the instance (``question``, ``formatted_chat_history``,
            ``contextualized_question``).
        """
        self.question = question
        self.formatted_chat_history = formatted_chat_history
        self.contextualized_question = self.contextualize_chain.invoke(
            {'chat_history': formatted_chat_history, 'question': question}
        )
        return self.contextualized_question

class AnswerGenerator:
    """Answers a standalone question from retrieved transcript snippets."""

    def __init__(self, chat_model):
        """Build the answering chain: prompt -> chat model -> plain string.

        Args:
            chat_model: Chat model runnable placed in the middle of the chain.
        """
        self.chat_model = chat_model
        # Written as adjacent literals so the prompt carries no source-code
        # indentation. Also fixes the doubled word "to to answer".
        prompt_text = (
            "You are an assistant for question-answering tasks regarding\n"
            "plant-based nutrition to prevent and reverse disease and nutrition science in general.\n"
            "The following are snippets of transcripts from Dr Michael Greger's nutrition facts\n"
            "videos that you are going to use to answer the question. "
            "If you don't know the answer, just say that you don't know.\n"
            "Ask follow-up questions if appropriate.\n"
            "\n"
            "Context: {context}\n"
            "\n"
            "Question: {contextualized_question}\n"
            "\n"
            "Answer:\n"
        )
        # self.prompt_template is always the template object, never the raw text
        self.prompt_template = ChatPromptTemplate.from_template(prompt_text)
        # Question and answer chain that takes in the context and contextualized question and returns the answer
        self.qa_chain = self.prompt_template | self.chat_model | StrOutputParser()

    @staticmethod
    def _format_context(context):
        """Render retrieved context as plain text for the prompt.

        Strings pass through unchanged. For an iterable, each item contributes
        its ``page_content`` attribute when it has one (otherwise ``str(item)``),
        joined by blank lines. ``None`` or an empty iterable gives ``""``.
        """
        if isinstance(context, str):
            return context
        if not context:
            return ""
        return "\n\n".join(str(getattr(item, 'page_content', item)) for item in context)

    def generate_answer(self, context, question):
        """Answer ``question`` using ``context``.

        Args:
            context: Retrieved documents (or a ready-made string). Documents
                are rendered through ``_format_context`` so the prompt holds
                transcript text rather than the Python repr of a document
                list; metadata is therefore not shown to the model.
            question: Standalone (contextualized) question.

        Returns:
            The output of the answering chain. The raw ``context``, the
            ``question`` and the result are also kept on the instance.
        """
        self.context = context
        self.question = question
        # Invoke the question and answer chain
        self.answer = self.qa_chain.invoke({
            'context': self._format_context(context),
            'contextualized_question': question,
        })

        return self.answer


class ResultFormatter:
    """Formats the final chat reply: answer text plus source-video links."""

    @staticmethod
    def format_result(answer, context):
        """Append a de-duplicated list of source-video links to an answer.

        Args:
            answer: Answer text shown first.
            context: Iterable of retrieved documents. Each must expose a
                ``metadata`` mapping with ``'title'`` and ``'videoId'`` keys.

        Returns:
            ``answer``, a blank line, then a "Relevant Videos:" list with one
            ``title: url`` line per distinct (title, videoId) pair, in the
            order the videos first appear in ``context``. If ``context``
            yields no documents, only the answer text is returned.

        Raises:
            KeyError: If a document's metadata lacks ``'title'`` or ``'videoId'``.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order. A set
        # would emit the links in an arbitrary order that can change from run
        # to run and would discard the order the documents were supplied in.
        unique_videos = dict.fromkeys(
            (doc.metadata['title'], doc.metadata['videoId']) for doc in context
        )

        # Nothing retrieved: avoid a dangling "Relevant Videos:" header
        if not unique_videos:
            return f"{answer}"

        # One "title: url" line per video
        titles_string = '\n'.join(
            f"{title}: https://www.youtube.com/watch?v={video_id}"
            for title, video_id in unique_videos
        )

        return f"{answer}\n\nRelevant Videos:\n{titles_string}"


class NutritionBot:
    """End-to-end chatbot: history formatting, question rewriting, retrieval, answering, link formatting."""

    def __init__(self, temp=0.5,
                 chat_model_name='gpt-4-turbo-2024-04-09',
                 embeddings_model_name='text-embedding-3-large',
                 num_results=10,
                 persist_directory='../chroma_db',
                 ):
        """Create the models and wire up the pipeline components.

        Args:
            temp: Sampling temperature for the chat model.
            chat_model_name: OpenAI chat model name.
            embeddings_model_name: OpenAI embedding model name.
            num_results: Number of documents retrieved per question.
            persist_directory: Directory the Chroma store is loaded from.

        Raises:
            ValueError: If no OpenAI API key is available in the environment.
        """
        # Load environment and API keys
        load_dotenv()
        api_key = self._resolve_api_key()

        self.embeddings_model_name = embeddings_model_name
        self.temp = temp
        self.chat_model_name = chat_model_name
        self.num_results = num_results
        self.persist_directory = persist_directory

        # Embeddings Model
        self.embeddings_model = OpenAIEmbeddings(api_key=api_key, model=self.embeddings_model_name, max_retries=100, chunk_size=16, show_progress_bar=False)

        # Initialize Chat Model
        self.chat_model = ChatOpenAI(api_key=api_key, temperature=self.temp, model=self.chat_model_name)

        # Set up vector store and other components
        self.context_retriever = ContextRetriever(embeddings_model=self.embeddings_model, persist_directory=self.persist_directory, num_results=self.num_results)
        self.chat_history_formatter = ChatHistoryFormatter()
        self.question_contextualizer = QuestionContextualizer(self.chat_model)
        self.answer_generator = AnswerGenerator(self.chat_model)
        self.result_formatter = ResultFormatter()

    @staticmethod
    def _resolve_api_key():
        """Return the OpenAI API key from the environment or raise a clear error.

        ``OPENAI_APIKEY`` (this project's variable name) is preferred.
        ``OPENAI_API_KEY`` is accepted as a fallback so setups that relied on
        the OpenAI clients' own environment lookup keep working.
        """
        api_key = os.getenv('OPENAI_APIKEY') or os.getenv('OPENAI_API_KEY')
        if not api_key:
            # Fail here with the variable name this project uses, instead of
            # passing None on and failing later inside the OpenAI clients.
            raise ValueError(
                "OpenAI API key not found: set OPENAI_APIKEY (or OPENAI_API_KEY) "
                "in the environment or in a .env file."
            )
        return api_key

    def process_chat(self, new_question, chat_history):
        """Answer ``new_question`` in the context of ``chat_history``.

        The parameter order (message first, history second) is what the
        Gradio chat interface in this notebook is handed as its callback.

        Args:
            new_question: Latest user message.
            chat_history: Prior turns as (human, ai) pairs, oldest first.

        Returns:
            The answer text followed by the formatted list of source videos.
        """
        formatted_chat_history = self.chat_history_formatter.format_chat_history(chat_history)
        contextualized_question = self.question_contextualizer.contextualize_question(formatted_chat_history, new_question)
        context = self.context_retriever.get_context(contextualized_question)
        answer = self.answer_generator.generate_answer(context=context, question=contextualized_question)
        final_result = self.result_formatter.format_result(answer, context)
        return final_result


In [62]:
# Build the bot with its default models, retrieval depth and store path
nutrition_bot = NutritionBot()

In [63]:
# Smoke test: a first question with no prior conversation
history = []
question = 'eggs and diabetes'
print(nutrition_bot.process_chat(chat_history=history, new_question=question))

Yes, eating eggs can affect diabetes management and increase the risk of developing diabetes. Research has shown a stepwise increase in the risk of type 2 diabetes with higher egg consumption. Eating just a single egg a week may increase the odds of developing diabetes by 76%, and consuming two eggs a week appeared to double the odds. Furthermore, eating one egg a day was found to triple the odds of diabetes. This association between egg consumption and an increased risk of type 2 diabetes has been confirmed across various populations, including studies conducted in Asia and Europe.

Additionally, for individuals who already have diabetes, consuming eggs may worsen their condition. Eating one egg a day or more has been associated with a shortened lifespan and may double the all-cause mortality rate for those with diabetes. These findings align with dietary guidelines that recommend restricting egg consumption to prevent cardiometabolic diseases like diabetes and cardiovascular disease.

In [64]:
# Smoke test: a follow-up question that only makes sense given the prior turns
history = [["Is juice bad for me?", "It's better to avoid juice and drink water"],
                ["Should I drink filtered water?", "Filtered water is usually saafer"],
                ["How much water should I drink a day?", "5-7 cups"]]
question = "But how much for children?."
print(nutrition_bot.process_chat(chat_history=history, new_question=question))

The recommended daily adequate intake of water for children varies by age. For children ages 4 through 8, it's about 7 cups a day. For ages 9 through 13, it's 8 cups a day for girls and 10 cups for boys. These recommendations may offer more than just hydration benefits, as they are associated with increased metabolic rates in some studies. However, not all research teams were able to replicate these findings.

Relevant Videos:
What Is the Safest Metabolism Booster?: https://www.youtube.com/watch?v=WGlENiyN4Dw
Podcast: Boosting Our Metabolism: https://www.youtube.com/watch?v=HyuXNzLzKYM
How Many Glasses of Water Should We Drink a Day?: https://www.youtube.com/watch?v=qrzSLIBOauM


In [65]:
# NOTE(review): an identical NutritionBot() was already created in cell In [62];
# this second instantiation looks redundant — confirm it is intentional.
nutrition_bot = NutritionBot()


In [66]:
# Set up chat bot interface
answer_bot = gr.ChatInterface(
                            nutrition_bot.process_chat,
                            chatbot=gr.Chatbot(height=300),
                            textbox=gr.Textbox(placeholder="Ask me a question about nutrition and health", container=False, scale=7),
                            title="Nutrition Facts ChatBot",
                            description="Ask Dr Michael McGregor's Nutrition Facts videos any questions!",
                            theme="soft",
                            examples=["diverticulosis", "heart disease", "low carb diets", "diabetes", "green tea"],
                            cache_examples=False,
                            retry_btn=None,
                            undo_btn=None,
                            clear_btn=None,
                            stop_btn="Interrupt",
                            submit_btn="Ask"
                        )

In [67]:
# Start the local Gradio server for the chat interface
answer_bot.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


