In [28]:
# LLM Model imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# Vector DB
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Langchain
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

# Generic imports
from enum import Enum

In [3]:
from huggingface_hub import login

login("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") # rishavroy97
# login("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") # mgokulkrish

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rr4577/.cache/huggingface/token
Login successful


In [62]:
# Parameter List

db_choice = "FAISS"
embedding_model = "all-MiniLM-L6-v2"
llm_model = "llama2"
model_hyperparams = {
    'temp': 0.1,
    'max_tokens': 512,
    'rep_penalty': 1.1
}

In [5]:
# Step 1 : Accelerator

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
# Step 2: Load the Vectordb and getting it to production

class DB_TYPE(Enum):
    Chroma = "Chroma"
    FAISS = "FAISS"

In [7]:
def get_Chroma_DB():  
    embedding = HuggingFaceEmbeddings(model_name=embedding_model)
    db = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
    return db

In [8]:
def get_FAISS_DB():
    persist_directory = 'faiss_db/'
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    db = FAISS.load_local(persist_directory, embeddings)
    return db

In [9]:
def get_vector_db(choice):
    print("loading vector_db ....")
    all_dbs = [db.value for db in DB_TYPE]
    if choice not in all_dbs:
        raise ValueError(f"Database {choice} is not supported")
    db = eval('get_' + choice + '_DB()')
    print(f"Successfully fetched {choice} DB")
    return db

In [10]:
vector_db = get_vector_db(db_choice)

loading vector_db ....
Successfully fetched FAISS DB


In [65]:
# Check if the database is fetching data based on similarity

print("quering question ....")
question = "What factors controls the ability of palladium cathods to attain high loading levels?"
result_docs = vector_db.similarity_search(question)

for i in range(len(result_docs)):
    print(f"Document {i}")
    print(result_docs[i].page_content)
    print(f"\n")

quering question ....
Document 0
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of


Document 1
"Quasi-Plasma"  Transport  Model in  Deuterium 
Overloaded  Palladium  Cathodes


Document 2
explanation of the phenomenon based on experimental apparatus designed to enhance the spectrum of information required to
deﬁne the effect. Recently, In order to improve this aspect of this research, speciﬁc work has been carried out to investigate whether
the excess power was produced concurrently with the emission of Radio Frequency from the active cathode. Suitable probes
and triggering incl

In [12]:
# Step 3: Prepare the model and tokenizer

class LLM_MODEL_TYPE(Enum):
    llama2 = "meta-llama/Llama-2-7b-chat-hf"
    mistral = "mistralai/Mistral-7B-Instruct-v0.1"

In [13]:
# Set the Model Quantization Config

def get_quantization_config(load_4bit=True, compute_dtype="float16", double_quant=False, quant_type="nf4"):
    # Activate 4-bit precision base model loading
    use_4bit = load_4bit
    
    # Compute dtype for 4-bit base models
    bnb_4bit_compute_dtype = compute_dtype
    
    # Quantization type (fp4 or nf4)
    bnb_4bit_quant_type = quant_type
    
    # Activate nested quantization for 4-bit base models (double quantization)
    use_nested_quant = False
    
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
    
    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=use_4bit,
            bnb_4bit_quant_type=bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=use_nested_quant,
        )
        return bnb_config
    except Error|Exception as e:
        raise ValueError(f"Error in parameters passed \n {e}")

In [14]:
# Double quant set to true for Llama2 model
bnb_config = get_quantization_config(double_quant=True)

In [15]:
def get_llama2_model():
    model_id = LLM_MODEL_TYPE.llama2.value
    
    # begin initializing HF items
    model_config = AutoConfig.from_pretrained(model_id)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto'
    )
    
    # enable evaluation mode to allow model inference
    model.eval()
    
    return model

In [16]:
def get_llama2_tokenizer():
    model_id = LLM_MODEL_TYPE.llama2.value
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tokenizer

In [17]:
def get_mistral_model():
    model_name=LLM_MODEL_TYPE.llama2.value
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
    )
    return model

In [18]:
def get_mistral_tokenizer():
    model_name=LLM_MODEL_TYPE.llama2.value
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return tokenizer

In [23]:
def get_model(choice):
    print("loading Model ....")
    valid_models = [model.name for model in LLM_MODEL_TYPE]
    if choice not in valid_models:
        raise ValueError(f"Model {choice} is not supported")
    model = eval('get_' + choice + '_model()')
    device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
    print(f"Successfully loaded {choice} Model on {device}")
    return model

In [24]:
def get_tokenizer(choice):
    print("loading Tokenizer ....")
    valid_models = [model.name for model in LLM_MODEL_TYPE]
    if choice not in valid_models:
        raise ValueError(f"Model {choice} is not supported")
    model = eval('get_' + choice + '_tokenizer()')
    device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
    print(f"Successfully loaded {choice} Tokenizer on {device}")
    return model

In [26]:
model = get_model(llm_model)

loading Model ....


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.75s/it]

Successfully loaded llama2 Model on cuda:0





In [27]:
tokenizer = get_tokenizer(llm_model)

loading Tokenizer ....
Successfully loaded llama2 Tokenizer on cuda:0


In [63]:
# Step 4: Create LLM Pipeline with Retrieval QA Chain

def create_pipeline(model, tokenizer, task="text-generation", **kwargs):
    temperature = kwargs.get('temp') if 'temp' in kwargs else 0.1
    max_new_tokens = kwargs.get('max_tokens') if 'max_tokens' in kwargs else 512
    repetition_penalty = kwargs.get('rep_penalty') if 'rep_penalty' in kwargs else 1.1
    
    text_generation_pipeline = pipeline(
        task = task,
        model=model,
        tokenizer=tokenizer,
        temperature=temperature,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
        max_new_tokens=max_new_tokens,  # max number of tokens to generate in the output
        repetition_penalty=repetition_penalty  # without this output begins repeating
    )
    
    return text_generation_pipeline

In [51]:
text_generation_pipeline = create_pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    kwargs=model_hyperparams
)

In [55]:
def create_prompt_template():
    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. 
    {context}
    Question: {question}
    Helpful Answer:"""
    
    return template

In [58]:
def create_retrieval_chain(pipeline, vector_db):
    
    template = create_prompt_template()
    chain_prompt = PromptTemplate.from_template(template)
    llm = llm = HuggingFacePipeline(pipeline=pipeline)
    retriever = vector_db.as_retriever()
    
    chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
    )
    
    return chain

In [59]:
retrieval_chain = create_retrieval_chain(
    pipeline=text_generation_pipeline,
    vector_db=vector_db
)

In [64]:
# Approach 1: Use Retrieval Chain from LangChain

question = "What factors controls the ability of palladium cathods to attain high loading levels?"
result = retrieval_chain({"query": question})
result

{'query': 'What factors controls the ability of palladium cathods to attain high loading levels?',
 'result': ' The ability of palladium cathodes to attain and maintain high loading levels is controlled by two factors: the condition of the electrochemical interface and the defect density and mechanical condition of the bulk material.',
 'source_documents': [Document(page_content='The ability of palladium cathodes to attain and maintain high loading levels, at high \ncurrent density and for long times, is controlled by two factors: the condition of the \nelectrochemical interface which allows the attainment of high deuterium activity; the defect \ndensity and mechanical condition of the bulk material which permits the Pd lattice to withstand \nand contain high bulk deuterium activities when these equilibrate to produce extreme pressures of'),
  Document(page_content='"Quasi-Plasma"  Transport  Model in  Deuterium \nOverloaded  Palladium  Cathodes'),
  Document(page_content='explanation 

In [67]:
# Approach 2: Manually fetch similar chunks from vector db

question = "What factors controls the ability of palladium cathods to attain high loading levels?"
result_docs = vector_db.similarity_search(question)

# inputs_not_chat = tokenizer.encode_plus(
#     """[INST] """ + question + """ Use the following context to answer: """ + result_docs[i].page_content + """[/INST]""",
#     return_tensors="pt"
# )['input_ids'].to('cuda

inputs_not_chat = tokenizer.encode_plus("""[INST] What factors controls the ability of palladium cathods to attain high loading levels given below context?
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of 
deuterium gas inside closed incipient voids[/INST]""", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat, 
                               max_new_tokens=1000, 
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded)

["<s> [INST] What factors controls the ability of palladium cathods to attain high loading levels given below context?\nThe ability of palladium cathodes to attain and maintain high loading levels, at high \ncurrent density and for long times, is controlled by two factors: the condition of the \nelectrochemical interface which allows the attainment of high deuterium activity; the defect \ndensity and mechanical condition of the bulk material which permits the Pd lattice to withstand \nand contain high bulk deuterium activities when these equilibrate to produce extreme pressures of \ndeuterium gas inside closed incipient voids[/INST]  The ability of palladium cathodes to attain and maintain high loading levels at high current densities and for long times is controlled by two main factors:\n\n1. Electrochemical interface conditions: The condition of the electrochemical interface between the palladium cathode and the electrolyte solution plays a crucial role in determining the high deuter