In [6]:
import torch
torch.cuda.empty_cache()

!pip install -qU langchain tiktoken langchain_community langchain_chroma langchain-huggingface huggingface-hub sentence_transformers chromadb langchainhub transformers peft
!pip install flash-attn --no-build-isolation



In [7]:
from langchain_huggingface import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel, PeftConfig
from IPython.display import display, Markdown
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Function to calculate the total number of tokens in the vector database
def count_total_tokens_in_vectorstore(vectorstore, tokenizer):
    # Retrieve all documents from the vector store
    all_docs = vectorstore.get()['documents']

    total_tokens = 0

    # Iterate over each document and calculate the number of tokens
    for doc in all_docs:
        tokens_in_doc = len(tokenizer.encode(doc))  # Tokenize the document content (which is a string)
        total_tokens += tokens_in_doc


    return total_tokens

In [9]:
# Initialize embeddings
embedding_model_name = "BAAI/bge-small-en-v1.5"
embedding_model_kwargs = {"device": "cuda"}
embedding_encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs
)

# Initialize vector store and retriever
vectorstore = Chroma(
    persist_directory="/content/drive/MyDrive/CITS5553_Capstone/vector1",
    embedding_function=hf
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


# Check the available device
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")

# Check for bf16 support
is_bf16_support = False
try:
    tensor_bf16 = torch.tensor([1.0], dtype=torch.bfloat16, device=device)
    is_bf16_support = True
    print("bf16 tensors are supported.")
except TypeError:
    print("bf16 tensors are not supported.")

# Load the base model and tokenizer
base_model = "microsoft/Phi-3.5-mini-instruct"


# Load the fine-tuned Phi3 mini model with LoRA
model = AutoModelForCausalLM.from_pretrained(base_model, return_dict=True, device_map=device, torch_dtype=torch.bfloat16 if is_bf16_support else torch.float16)
lora_model = PeftModel.from_pretrained(model, "KunalRaghuvanshi/phi3_mini_qlora_chemical_eng")
tokenizer = AutoTokenizer.from_pretrained(lora_model)


MAX_TOKENS = 1052988
MIN_TOKENS = 105298

# Modify the pipeline initialization
def get_safe_max_tokens(vectorstore, tokenizer):
    total_tokens = count_total_tokens_in_vectorstore(vectorstore, tokenizer)
    print("Toatal Tokens")
    print(total_tokens)
    max_new_tokens = min(max(total_tokens // 10, MIN_TOKENS), MAX_TOKENS)
    return max_new_tokens

# Update the pipeline initialization
max_new_tokens = get_safe_max_tokens(vectorstore, tokenizer)
pipeline = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokenizer,
    max_new_tokens=max_new_tokens
)
llm = HuggingFacePipeline(pipeline=pipeline)

Using device: cuda
bf16 tensors are supported.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OSError: Incorrect path_or_model_id: 'PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-9): 10 x Phi3DecoderLayer(
            (self_attn): Phi3SdpaAttention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (qkv_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=9216, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=9216, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
            )
            (mlp): Phi3MLP(
              (gate_up_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=16384, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=16384, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (down_proj): lora.Linear(
                (base_layer): Linear(in_features=8192, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=8192, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (activation_fn): SiLU()
            )
            (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
            (resid_attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
            (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
          )
          (10-31): 22 x Phi3DecoderLayer(
            (self_attn): Phi3SdpaAttention(
              (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
              (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
              (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
            )
            (mlp): Phi3MLP(
              (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
              (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
              (activation_fn): SiLU()
            )
            (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
            (resid_attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
            (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
          )
        )
        (norm): Phi3RMSNorm((3072,), eps=1e-05)
      )
      (lm_head): Linear(in_features=3072, out_features=32064, bias=False)
    )
  )
)'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [5]:
all_docs = vectorstore.get()['documents']
print(f"Number of documents in vector store: {len(all_docs)}")

Number of documents in vector store: 0


In [None]:
# Update the RAGChatModel class
class RAGChatModel:
    def __init__(self, retriever, llm, tokenizer):
        self.retriever = retriever
        self.llm = llm
        self.tokenizer = tokenizer
        total_tokens = count_total_tokens_in_vectorstore(vectorstore, tokenizer)
        self.max_token_limit = min(max(total_tokens // 10, MIN_TOKENS), MAX_TOKENS)
        self.current_token_count = 0
        self.template_standard = """
        <|system|>
        Answer the question in detail. Provide all the relevant information based on the provided context.
        It is critical that you mention all page numbers where this information is found. Do not skip any page numbers.

        Context: {context}

        Providing all the page numbers is essential for the answer.
        <|end|>

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """
        self.template_exceeded = """
        <|system|>
        Answer the question in detail; warn that information is not taken from the prescribed textbook and must provide the page numbers where they can find the correct information in the prescribed textbook.

        Context: {context}
        Providing all the page numbers is essential for the answer.
        <|end|>

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """

    def num_tokens_from_string(self, string: str) -> int:
        """Returns the number of tokens in a text string using the tokenizer."""
        return len(self.tokenizer.encode(string))

    def format_docs(self, docs, full_content=True):
        """Format the documents to be used as context in the prompt."""
        if full_content:
            return "\n\n".join(f"Information in Page number: {(doc.metadata['page']+1)}\n{doc.page_content}" for doc in docs)
        else:
            return "Information available in prescribed textbook " + ", ".join(f"Page number: {doc.metadata['page']}" for doc in docs)

    def get_prompt(self, docs, question):
        """Generate the prompt based on token count and context formatting."""
        # Format the context with full content
        context = self.format_docs(docs, full_content=True)
        total_tokens_in_context = self.num_tokens_from_string(context)

        # Add tokens to the running total
        self.current_token_count += total_tokens_in_context

        # Decide whether to use full content or only page numbers
        if self.current_token_count > self.max_token_limit:
            print("Token limit exceeded. Information from prescribed textbook will not be used.")
            # Reformat context to include only page numbers
            context = self.format_docs(docs, full_content=False)
            template = self.template_exceeded
        else:
            template = self.template_standard

        # Create the prompt
        prompt = template.format(context=context, question=question)
        return prompt

    def extract_clean_answer(self, raw_output):
        """Extract only the answer from the raw output."""
        assistant_tag = "<|assistant|>"
        if assistant_tag in raw_output:
            clean_answer = raw_output.split(assistant_tag)[-1].strip()
            return clean_answer
        return raw_output.strip()

    def ask_question(self, question):
        """Main function to retrieve relevant docs and generate a response."""
        # Add fixed request for page numbers to the user's question
        question_with_page_request = f"{question}. Please provide the page numbers in your answer."

        # Retrieve relevant documents
        docs = self.retriever.invoke(question_with_page_request)

        # Generate prompt based on token count
        prompt = self.get_prompt(docs, question_with_page_request)

        # Pass the prompt to the LLM
        result = self.llm.generate([prompt])

        # Extract the generated text
        raw_answer = result.generations[0][0].text

        # Get the clean answer
        clean_answer = self.extract_clean_answer(raw_answer)

        # Display the answer
        display(Markdown(clean_answer))


In [None]:
# Initialize the RAGChatModel
rag_chat_model = RAGChatModel(retriever, llm, tokenizer)

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.0.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.6.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit==0.12.0 (from gradio)
  Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)
Collecting websocket

In [None]:
import gradio as gr

# Function to shorten the question for the chat history display
def get_short_overview(question, answer, max_length=50):
    """Generate a short summary of the question for the chat history."""
    # For simplicity, return the first few words of the question
    short_question = (question[:max_length] + '...') if len(question) > max_length else question
    return short_question

# Function for the RAG model interaction
def ask_question_gradio(history, question):
    """Main function to retrieve relevant docs and generate a response."""
    # Retrieve relevant documents
    docs = rag_chat_model.retriever.invoke(question)

    # Generate prompt based on token count
    prompt = rag_chat_model.get_prompt(docs, question)

    # Pass the prompt to the LLM
    result = rag_chat_model.llm.generate([prompt])

    # Extract the generated text
    raw_answer = result.generations[0][0].text

    # Get the clean answer
    clean_answer = rag_chat_model.extract_clean_answer(raw_answer)

    # Add the question and answer to the conversation history
    history.append((question, clean_answer))

    # Generate a short summary for the chat history section (only from the question)
    short_overview = get_short_overview(question, clean_answer)

    # Append the short overview to the chat history display (only questions' summaries)
    chat_history = "\n\n".join([get_short_overview(q, a) for q, a in history])

    # Return the updated history, overview history for the left panel, and clear the user input
    return history, chat_history, ""  # The empty string clears the input box

# Create a Gradio Blocks interface for chat-like interaction
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h1 style='text-align: center;'>Lora ChemNerd</h1>
        <p style='text-align: center;'>Ask any question and get a response from the RAG model.</p>
        """,
    )

    with gr.Row():
        # Sidebar for chat history
        with gr.Column(scale=1, min_width=200):
            gr.Markdown("### Chat History Overview")
            history_display = gr.Textbox(label="Chat History", lines=20, interactive=False)  # Non-editable

        # Main Chatbot Area
        with gr.Column(scale=2):
            # Chatbot layout with conversation history
            chatbot = gr.Chatbot(label="Q-LORA ChemNerd Chat")

            # Text input for user questions
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="Ask your question...",
                    label="Type your message here:"
                )

                # Button to submit the question
                submit_button = gr.Button("Send")

            # Maintain the conversation history
            history_state = gr.State([])

            # Link the components: send input, update chatbot, clear textbox, and update history display
            submit_button.click(
                ask_question_gradio,
                inputs=[history_state, user_input],
                outputs=[chatbot, history_display, user_input],
                scroll_to_output=True
            )
            user_input.submit(
                ask_question_gradio,
                inputs=[history_state, user_input],
                outputs=[chatbot, history_display, user_input],
                scroll_to_output=True
            )

# Launch the Gradio interface
demo.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e7da1df7499d8afb70.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Start the interactive chat
print("Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):")
while True:
  print("\n\n")
  question = input("Your question: ")
  if question.lower() == 'exit':
    print("Exiting the chat.")
    break
  rag_chat_model.ask_question(question)

Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):



Your question: To heat a dodecane fluid with steam, what type of heat exchanger should be applied and why?


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


To heat a dodecane fluid with steam, a shell and tube heat exchanger should be applied. This type of heat exchanger is suitable for this application because it allows for efficient heat transfer between the steam and the dodecane fluid. The shell and tube design facilitates the flow of steam through the tubes while the dodecane fluid flows around the tubes within the shell, allowing for effective heat exchange.

The specific reasons for choosing a shell and tube heat exchanger for heating dodecane with steam include its high heat transfer efficiency, ability to handle high pressure and temperature differences, and its versatility in handling various fluids. Additionally, the design of the shell and tube heat exchanger allows for easy maintenance and cleaning, which is important for maintaining the efficiency of the heat exchange process.

For detailed information on the application of shell and tube heat exchangers in heating dodecane with steam, refer to the following pages:

- Page 10: Introduction to Heat Exchangers
- Page 12: Types of Heat Exchangers
- Page 15: Shell and Tube Heat Exchangers
- Page 18: Heat Transfer in Shell and Tube Heat Exchangers
- Page 20: Design Considerations for Shell and Tube Heat Exchangers
- Page 22: Application of Shell and Tube Heat Exchangers in Industrial Processes

These pages provide a comprehensive overview of the principles, design, and application of shell and tube heat exchangers, including their suitability for heating dodecane with steam.




Your question: exit
Exiting the chat.


How do we determine the breakthru point for an absorption bed?

what is entropy from the perspective of a molecule?

