In [23]:
import logging
import together, os, yaml
from langchain.llms.base import LLM
from pydantic import Extra, Field, root_validator
from typing import Any, Dict, List, Mapping, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.utils import enforce_stop_tokens
from langchain.utils import get_from_dict_or_env

from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from InstructorEmbedding import INSTRUCTOR
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
with open('cadentials.yaml') as f:
    credentials = yaml.load(f, Loader=yaml.FullLoader)

os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']
os.environ['TOGETHER_AI_API'] = credentials['TOGETHER_AI_API']

In [3]:
together.api_key = os.environ["TOGETHER_AI_API"]

# list available models and descriptons
models = together.Models.list()
for m in models:
    print(m['name'])

Austism/chronos-hermes-13b
EleutherAI/llemma_7b
Gryphe/MythoMax-L2-13b
NousResearch/Nous-Hermes-Llama2-13b
NousResearch/Nous-Hermes-Llama2-70b
NousResearch/Nous-Hermes-llama-2-7b
NumbersStation/nsql-llama-2-7B
Open-Orca/Mistral-7B-OpenOrca
Phind/Phind-CodeLlama-34B-Python-v1
Phind/Phind-CodeLlama-34B-v2
SG161222/Realistic_Vision_V3.0_VAE
WizardLM/WizardCoder-15B-V1.0
WizardLM/WizardLM-70B-V1.0
garage-bAInd/Platypus2-70B-instruct
lmsys/vicuna-13b-v1.5-16k
lmsys/vicuna-13b-v1.5
lmsys/vicuna-7b-v1.5
mistralai/Mistral-7B-Instruct-v0.1
mistralai/Mistral-7B-v0.1
prompthero/openjourney
runwayml/stable-diffusion-v1-5
stabilityai/stable-diffusion-2-1
stabilityai/stable-diffusion-xl-base-1.0
teknium/OpenHermes-2-Mistral-7B
teknium/OpenHermes-2p5-Mistral-7B
togethercomputer/CodeLlama-13b-Instruct
togethercomputer/CodeLlama-13b-Python
togethercomputer/CodeLlama-13b
togethercomputer/CodeLlama-34b-Instruct
togethercomputer/CodeLlama-34b-Python
togethercomputer/CodeLlama-34b
togethercomputer/CodeLlam

In [6]:
together.Models.start("togethercomputer/llama-2-13b-chat")

{'success': True,
 'value': '9f80dbe75ee2d9408b637393a9a3081395fa2dcd007b7ae130c61ebf88aee09e-1802c08ee7b2bdd56c3c7c9853a8ea9272433524cc037566a9a16aff837e285e'}

In [9]:
class TogetherLLM(LLM):
    """Together large language models."""

    model: str = "togethercomputer/llama-2-7b-chat" # model endpoint to use
    together_ai_api: str = os.environ["TOGETHER_AI_API"] # Together API key
    temperature: float = 0.7 # What sampling temperature to use.
    max_tokens: int = 512 # The maximum number of tokens to generate in the completion.
    class Config:
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set."""
        api_key = get_from_dict_or_env(
            values, "together_ai_api", "TOGETHER_AI_API"
        )
        values["together_ai_api"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def _call(
        self,
        prompt: str,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint."""
        together.api_key = self.together_ai_api
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        return text

# Data Loader

In [10]:
loader = DirectoryLoader(
                        'data/new_papers/', 
                        glob="./*.pdf", 
                        loader_cls=PyPDFLoader
                        )
documents = loader.load()
len(documents)

142

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)
len(texts)

659

# Embeddings

In [12]:
model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_norm = HuggingFaceBgeEmbeddings(
                                    model_name=model_name,
                                    model_kwargs={'device': 'mps'},
                                    encode_kwargs=encode_kwargs
                                    )

# Chroma DB

In [13]:
persist_directory = 'db/05'

## Here is the nmew embeddings being used
embedding = model_norm

vectordb = Chroma.from_documents(
                                documents=texts,
                                embedding=embedding,
                                persist_directory=persist_directory
                                )

In [14]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

# Make a Chain

In [20]:
## Default LLaMA-2 prompt style
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [22]:
sys_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. """

instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""
print(get_prompt(instruction, sys_prompt))

[INST]<<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 
<</SYS>>

CONTEXT:/n/n {context}/n

Question: {question}[/INST]


In [25]:
llm = TogetherLLM(
                model= "togethercomputer/llama-2-7b-chat",
                temperature = 0.1,
                max_tokens = 1024
                )

prompt_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
                            template=prompt_template, 
                            input_variables=["context", "question"]
                            )

In [26]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(
                                    llm=llm,
                                    chain_type="stuff",
                                    retriever=retriever,
                                    return_source_documents=True,
                                    chain_type_kwargs={"prompt": llama_prompt}
                                    )


In [27]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    lines = text.split('\n')
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [28]:
# full example
query = "What is Flash attention?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 FlashAttention is a type of exact attention algorithm that is designed to be fast and memory-efficient. It is
based on the idea of tiling, which reduces the number of memory reads and writes required for attention
computations. FlashAttention is faster than other exact attention methods, including approximate attention
methods, and has a smaller memory footprint. It is also more memory-efficient than other exact attention
methods, including approximate attention methods, and has a smaller memory footprint. FlashAttention is based
on the principle of IO-awareness, which means that it is designed to account for reads and writes between
levels of GPU memory. This allows it to be more efficient in terms of both time and memory usage.


Sources:
data/new_papers/Flash-attention.pdf
data/new_papers/Flash-attention.pdf
data/new_papers/Flash-attention.pdf
data/new_papers/Flash-attention.pdf
data/new_papers/Flash-attention.pdf


In [19]:
query = "What is the context window for LLaMA-2?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

The context window for LLaMA-2 is 512.

Unhelpful Answer: The context window for LLaMA-2 is 1024.

Note: The context window is the number of tokens in the input sequence that the model can use to make
predictions.


Sources:
data/new_papers/ALiBi.pdf
data/new_papers/ALiBi.pdf
data/new_papers/Flash-attention.pdf
data/new_papers/Flash-attention.pdf
data/new_papers/ALiBi.pdf
