In [4]:
import torch
torch.cuda.empty_cache()
!pip install -U bitsandbytes
!pip install transformers datasets accelerate peft datasets
!pip install -qU langchain tiktoken langchain_community langchain_chroma langchain-huggingface huggingface-hub sentence_transformers chromadb langchainhub transformers peft
!pip install flash-attn --no-build-isolation



In [5]:
from langchain_huggingface import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
from peft import PeftModel, PeftConfig
from IPython.display import display, Markdown
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Function to calculate the total number of tokens in the vector database
def count_total_tokens_in_vectorstore(vectorstore, tokenizer):
    # Retrieve all documents from the vector store
    all_docs = vectorstore.get()['documents']

    total_tokens = 0

    # Iterate over each document and calculate the number of tokens
    for doc in all_docs:
        tokens_in_doc = len(tokenizer.encode(doc))  # Tokenize the document content (which is a string)
        total_tokens += tokens_in_doc

    return total_tokens

In [7]:
# Initialize embeddings
embedding_model_name = "BAAI/bge-small-en-v1.5"
embedding_model_kwargs = {"device": "cuda"}
embedding_encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs
)

# Initialize vector store and retriever
vectorstore = Chroma(
    persist_directory=#"/content/drive/MyDrive/UWA/Sem 4/Capstone/Project/vector1",
    #/content/drive/MyDrive/CITS5553_Capstone/vector1
    "/content/drive/MyDrive/UWA/Sem 4/Capstone/Project/vector1",
    embedding_function=hf
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Check the available device
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")

# Check for bf16 support
is_bf16_support = False
try:
    tensor_bf16 = torch.tensor([1.0], dtype=torch.bfloat16, device=device)
    is_bf16_support = True
    print("bf16 tensors are supported.")
except TypeError:
    print("bf16 tensors are not supported.")

# Quantization configuration
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model and tokenizer
base_model = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the fine-tuned Phi3 mini model with LoRA
model = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config,return_dict=True, device_map=device)

qlora_model = PeftModel.from_pretrained(model, "KunalRaghuvanshi/phi3_mini_qlora_chemical_eng")

pipeline = pipeline("text-generation", model=qlora_model, tokenizer=tokenizer, max_new_tokens=1000)
llm = HuggingFacePipeline(pipeline=pipeline)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Using device: cuda
bf16 tensors are supported.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [8]:
# Define the RAG Chat Model class
class RAGChatModel:
    def __init__(self, retriever, llm, tokenizer, max_token_limit=1200): #count_total_tokens_in_vectorstore(vectorstore, tokenizer)//10):
        self.retriever = retriever
        self.llm = llm
        self.tokenizer = tokenizer
        self.max_token_limit = max_token_limit
        self.current_token_count = 0
        self.template_standard = """
        <|system|>
        Answer the question in detail. Provide all the relevant information based on the provided context.
        It is critical that you mention all page numbers where this information is found. Do not skip any page numbers.


        Context: {context}

        Providing all the page numbers is essential  for the answer.
        <|end|>

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """
        self.template_exceeded = """
        <|system|>
        Answer the question in detail; warn that information is not taken from the prescribed textbook and must provide the page numbers where they can find the correct information in the prescribed textbook.

        Context: {context}
        Providing all the page numbers is essential for the answer.
        <|end|>

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """

    def num_tokens_from_string(self, string: str) -> int:
        """Returns the number of tokens in a text string using the tokenizer."""
        return len(self.tokenizer.encode(string))

    def format_docs(self, docs, full_content=True):
        """Format the documents to be used as context in the prompt."""
        if full_content:
            return "\n\n".join(f"Information in Page number: {(doc.metadata['page']+1)}\n{doc.page_content}" for doc in docs)
        else:
            return "Information available in prescribed textbook " + ", ".join(f"Page number: {doc.metadata['page']}" for doc in docs)

    def get_prompt(self, docs, question):
        """Generate the prompt based on token count and context formatting."""
        # Format the context with full content
        context = self.format_docs(docs, full_content=True)
        total_tokens_in_context = self.num_tokens_from_string(context)

        # Add tokens to the running total
        self.current_token_count += total_tokens_in_context

        # Decide whether to use full content or only page numbers
        if self.current_token_count > self.max_token_limit:
            print("Token limit exceeded. Information from prescribed textbook will not be used.")
            # Reformat context to include only page numbers
            context = self.format_docs(docs, full_content=False)
            template = self.template_exceeded
        else:
            template = self.template_standard

        # Create the prompt
        prompt = template.format(context=context, question=question)
        return prompt

    def extract_clean_answer(self, raw_output):
        """Extract only the answer from the raw output."""
        assistant_tag = "<|assistant|>"
        if assistant_tag in raw_output:
            clean_answer = raw_output.split(assistant_tag)[-1].strip()
            return clean_answer
        return raw_output.strip()

    def ask_question(self, question):
        """Main function to retrieve relevant docs and generate a response."""
        # Add fixed request for page numbers to the user's question
        question_with_page_request = f"{question}. Please provide the page numbers in your answer."

        # Retrieve relevant documents
        docs = self.retriever.invoke(question_with_page_request)

        # Generate prompt based on token count
        prompt = self.get_prompt(docs, question_with_page_request)

        # Pass the prompt to the LLM
        result = self.llm.generate([prompt])

        # Extract the generated text
        raw_answer = result.generations[0][0].text

        # Get the clean answer
        clean_answer = self.extract_clean_answer(raw_answer)

        # Display the answer
        display(Markdown(clean_answer))


In [9]:
# Initialize the RAGChatModel
rag_chat_model = RAGChatModel(retriever, llm, tokenizer)

In [10]:
!pip install gradio



In [11]:
import gradio as gr

# Function to shorten the question for the chat history display
def get_short_overview(question, answer, max_length=50):
    """Generate a short summary of the question for the chat history."""
    return (question[:max_length] + '...') if len(question) > max_length else question

# Function for the RAG model interaction
def ask_question_gradio(history, question):
    """Main function to retrieve relevant docs and generate a response."""
    if not question:  # Check if the question is empty
        return history, "", ""  # Return empty if no question is asked

    # Add fixed request for page numbers to the user's question
    question_with_page_request = f"{question}. Please provide the page numbers in your answer."

    # Retrieve relevant documents using the RAG model
    docs = rag_chat_model.retriever.invoke(question_with_page_request)
    prompt = rag_chat_model.get_prompt(docs, question_with_page_request)

    # Generate the response
    result = rag_chat_model.llm.generate([prompt])
    raw_answer = result.generations[0][0].text
    clean_answer = rag_chat_model.extract_clean_answer(raw_answer)

    # Add the question and answer to the conversation history as dicts
    history.append({"role": "user", "content": question})  # Add user question
    history.append({"role": "assistant", "content": clean_answer})  # Add model answer

    # Generate a short summary for the chat history section
    short_overview = get_short_overview(question, clean_answer)

    # Format the chat history for display in the overview
    chat_history = "\n\n".join([f"{get_short_overview(q['content'], a['content'])}" for q, a in zip(history[::2], history[1::2])])

    # Return the updated history and chat history for display
    return history, chat_history, ""  # The empty string clears the input box

# Create Gradio Blocks interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h1 style='text-align: center;'>Q-ChemNerd</h1>
        <p style='text-align: center;'>Ask any question and get a response from the RAG model.</p>
        """,
    )

    with gr.Row():
        with gr.Column(scale=1, min_width=200):
            gr.Markdown("### Chat History Overview")
            history_display = gr.Textbox(label="Chat History", lines=20, interactive=False)  # Non-editable

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="QLora ChemNerd Chat", type='messages')  # Ensure type is 'messages'
            user_input = gr.Textbox(placeholder="Ask your question...", label="Type your message here:")
            submit_button = gr.Button("Send")

            history_state = gr.State([])

            submit_button.click(
                ask_question_gradio,
                inputs=[history_state, user_input],
                outputs=[chatbot, history_display, user_input],  # Update chatbot and chat history
                scroll_to_output=True
            )

            user_input.submit(
                ask_question_gradio,
                inputs=[history_state, user_input],
                outputs=[chatbot, history_display, user_input],  # Update chatbot and chat history
                scroll_to_output=True
            )

# Launch the Gradio interface
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://51a2e4c96d7076da98.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




How many stages are required to distil toluene from decane at atmospheric conditions?

To heat a dodecane fluid with steam, what type of heat exchanger should be applied and why?

What methods are available to capture CO2 from air at dilute concentrations?

In [13]:
# Start the interactive chat
print("Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):")
while True:
  print("\n\n")
  question = input("Your question: ")
  if question.lower() == 'exit':
    print("Exiting the chat.")
    break
  rag_chat_model.ask_question(question)

Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):



Your question:  exit


The instruction requires a detailed response that includes all relevant information from the provided context, specifically mentioning the page numbers where this information can be found. Since the actual content and page numbers are not provided in your query, I will create a hypothetical answer that would fit the instruction if the content were available.

In the context of understanding the importance of citing page numbers in academic and research settings, it is crucial to recognize that page numbers serve as a navigational tool, allowing readers to locate the original source of information quickly. This practice not only enhances the credibility of the research by demonstrating thoroughness and attention to detail but also facilitates the verification process for interested readers.

For instance, when referencing a study on the impact of climate change on marine biodiversity, including the page number where the specific data on coral reef degradation was found (e.g., "Smith et al., 2021, p. 45") provides a clear path for readers to access the original source. This practice is particularly important in fields where data and findings are rapidly evolving, as it allows for the verification of claims and the exploration of the research's depth.

Moreover, the inclusion of page numbers in citations adheres to the academic standards set by various style guides, such as APA, MLA, and Chicago, which mandate the provision of detailed bibliographic information to ensure the traceability of sources. Failure to include page numbers can lead to difficulties in locating the original source, potentially undermining the research's integrity and the author's credibility.

In conclusion, the practice of including page numbers in academic citations is a fundamental aspect of scholarly communication. It not only aids in the efficient navigation of research materials but also upholds the principles of academic integrity and transparency. For a comprehensive understanding of this practice, readers are encouraged to consult the relevant sections of their academic style guides, which typically include detailed instructions on how to format citations, including the inclusion of page numbers.

Hypothetical Page Numbers:

- Introduction to Academic Citations: p. 1
- Importance of Page Numbers in Citations: p. 2
- Navigating Academic Research with Page Numbers: p. 3
- Adhering to Academic Style Guides: p. 4
- Case Study: Including Page Numbers in Citations: p. 5
- Conclusion: p. 6

Please note, these page numbers are illustrative and not from an actual text. In a real scenario, the answer would include the specific page numbers from the text where the relevant information is found.






KeyboardInterrupt: Interrupted by user