In [1]:
import torch
import chromadb
import numpy as np
import transformers
from time import time
from typing import List
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Llama model takes about 16s per query
hf_token = "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"

  from .autonotebook import tqdm as notebook_tqdm


#### Loading LLM

In [2]:
torch.random.manual_seed(0) 

model_id = "microsoft/Phi-3-mini-128k-instruct"
# model_id = "meta-llama/Meta-Llama-3-8B"

model = AutoModelForCausalLM.from_pretrained( 
    model_id,  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=True,  
    token = hf_token
) 

tokenizer = AutoTokenizer.from_pretrained(model_id, force_download = True, token = hf_token)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████████████████████| 2/2 [00:01<00:00,  1.52it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Load the cleaned embeddings from ChromaDB

In [3]:
chroma_client = chromadb.PersistentClient(path = "../mlengine/data/chromadb")
msr_ = chroma_client.get_collection(name = "msr2013-query")

In [4]:
def craft_prompt(query: str) -> str:

    msr_query = msr_.query(
            query_texts=[query],  # Chroma will embed this for you
            n_results=10  # How many results to return
        )

    # Get values that deviate less than 0.1 distance away
    documents = np.array(msr_query['documents'][0])
    distances = np.array(msr_query['distances'][0])

    query_results = documents[distances < (distances[0] + 0.1)].tolist()

    # Join results with new lines for the context
    context = "\n".join(query_results)

    return [ 
        {"role": "system", "content": "You are a helpful AI assistant that is answering questions from a database."}, 
        {"role": "user", "content": f"""
        Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, 
        respond with "There are some partial information contained within the database." followed by your suggestions.
        The context given to you is from a list of possible related defects as found in a database of defect entries.
        End off the answer by listing the context that I have provided you with, with the headline, "Here are the most relevant entries in the database: "
        
        >>CONTEXT<<
        {context}\n
        >>QUESTION<< {query}\n
        """}
    ] 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(craft_prompt("error 500 on internal server error"), **generation_args) 
print(output[0]['generated_text'])

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


 There are some partial information contained within the database. The provided context indicates an HTTP status 500 error, which is commonly associated with internal server errors. However, without more specific details about the error, it's challenging to pinpoint the exact cause. To better understand the issue, consider checking server logs, application error messages, or consulting with the development team responsible for the server or application.

Here are the most relevant entries in the database:

- http status 500 error
- Internal Server Error
- Server-side error
- Application error


In [None]:
### This
# Simple Chat Window, Interactive like ChatGPT
# Test Llama Models, Performance and Time Benchmarks


### Separate
# Technical Documentation PDF separate -> chat on contexts of the pdf
# Potentially other topics eg medical; industry specific models
# Train LLM ? 

In [18]:
import numpy as np
import chromadb

class LLM:
    
    def __init__(self,):

        # Model Documentation: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct?text=hi+there
        
        chroma_client = chromadb.PersistentClient(path = "./data/chromadb/")
        self.msr_ = chroma_client.get_collection(name = "msr2013-query")

        torch.random.manual_seed(0) 
        model_id: str = "microsoft/Phi-3-mini-128k-instruct"
        
        model = AutoModelForCausalLM.from_pretrained( 
            model_id,  
            device_map="cuda",  
            torch_dtype="auto",  
            trust_remote_code=True,  
        ) 
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        
        self.message = [
            {"role": "system", "content": "You are a helpful AI assistant that is answering questions from a database."}
        ]
        self.pipe = pipeline( 
            "text-generation", 
            model=model, 
            tokenizer=tokenizer, 
        ) 
        self.generation_args = { 
            "max_new_tokens": 500, 
            "return_full_text": False, 
            "temperature": 0.0, 
            "do_sample": False, 
        } 

        print("New Chat Initialised.")

    def _craft_prompt(self, query: str, context: str):
        return (
            f"""
            Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, 
            respond with "There are some partial information contained within the database." followed by your suggestions.
            The context given to you is from a list of possible related defects as found in a database of defect entries.
            End off the answer by listing the context that I have provided you with, with the headline, "Here are the most relevant entries in the database: "
            
            >>CONTEXT<<
            {context}\n
            >>QUESTION<< {query}\n
            """
        )

    def send_query(self, query: str): # List[Dict[Literal["system", "user", "assistant"], str]):
        
        msr_query = self.msr_.query(
            query_texts=[query],  # Chroma will embed this for you
            n_results=10  # How many results to return
        )

        # Get values that deviate less than 0.1 distance away
        documents = np.array(msr_query['documents'][0])
        distances = np.array(msr_query['distances'][0])
    
        query_results = documents[distances < (distances[0] + 0.1)].tolist()
    
        # Join results with new lines for the context
        context = "\n".join(query_results)
        
        prompt = self._craft_prompt(query, context)
        self.message += [ { "role": "user", "content": query } ]

        output = self.pipe(prompt, **self.generation_args) 

        response = output[0]['generated_text']

        self.message += [ { "role": "assistant", "content": response } ]
        return response


def initiate_chat():
    return LLM()

In [19]:
llm = initiate_chat()

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.39it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


New Chat Initialised.


In [20]:
print(llm.send_query("why is my html file closing on 404 on its own?"))


            Here are the most relevant entries in the database:
            - file: empty files appear as <html><body></body><html>
            - file:// url in http pages does not load content
            - file:///link on webpage does not work
            - page loads improperly when /index.html omitted from url
            - crash when opening html code as a local file://

            There are some partial information contained within the database.
            Suggestions:
            - Check if the file path is correct and the file exists.
            - Ensure the file is not empty and contains valid HTML code.
            - Verify that the server is configured to serve the file correctly.
            - Confirm that the URL used to access the file is correct and includes the necessary protocol (file://).
            - If the file is part of a larger project, ensure that all dependencies are correctly linked and loaded.
            - Check for any server-side errors that might be 