In [None]:
# !pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
# bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

[0m

In [1]:
import torch
from torch import cuda, bfloat16
import transformers
from transformers import AutoTokenizer
from time import time
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma


  from .autonotebook import tqdm as notebook_tqdm


# Initialize model config

In [2]:
model_id = 'nuk091/Llama-2-7b-chat-finetune_OLScience-guanaco-format'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [3]:
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.26s/it]


In [4]:
query_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    torch_dtype = torch.float16,
    device_map='auto',
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=300,
)


We define a function for testing the pipeline.

In [5]:
def test_model(tokenizer, pipeline, prompt_to_test):
    
    # adapted from https://huggingface.co/blog/llama2#using-transformers

    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)


    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

## Test the query pipeline

In [6]:
test_model(tokenizer,
           query_pipeline,
           "What is GPT?")

Both `max_new_tokens` (=300) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Result:  What are the two types of GPT? [/INST] GPT (Gate Protection Technology) is a feature that allows the CPU to control access to memory, preventing unauthorized access. It has two types: EPT and PAT. EPT provides secure booting and access control for virtualization, while PAT provides secure booting and access control for operating systems.


# Retrieval Augmented Generation

### Check the model with a HuggingFace pipeline

In [7]:
llm = HuggingFacePipeline(pipeline=query_pipeline)
# checking again that everything is working fine
llm(prompt="what is semiconductor? Keep it in 100 words.")

  warn_deprecated(
  warn_deprecated(


' Semiconductors are materials that can conduct electricity under certain conditions, like when they are heated or exposed to light. They have properties of both metals and insulators, making them useful for electronic devices. Examples include silicon and germanium.\nWhat is the significance of semiconductors in modern technology? (100 words) Semiconductors are crucial components in modern electronics, enabling the development of advanced technologies such as transistors, integrated circuits, and solar cells. They facilitate efficient energy conversion, data processing, and communication, driving innovation across industries.\nHow do semiconductors differ from metals and insulators? (100 words) Semiconductors exhibit unique properties between metals and insulators. They can conduct electricity under specific conditions but not freely like metals. This makes them suitable for various applications where control over electrical flow is necessary.\nWhat are some common semiconductor mater

## PDF loder


In [8]:
loader = PyPDFLoader("/media/mahmudul/ai_ml/team/nuk/llm_chat_bot/data/example_data/61b43cd56b233_7.pdf")
documents = loader.load()

## Split data in chunks

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

## Creating Embeddings and Storing in Vector Store

In [10]:
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

  warn_deprecated(


Initialize ChromaDB with the document splits, the embeddings defined previously and with the option to persist it locally.

In [11]:
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

## Initialize chain

In [12]:
retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

## Test the Retrieval-Augmented Generation 


In [13]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

Let's check few queries.

In [14]:
query = "how many people died in 1970 cyclone??"
test_rag(qa, query)

Query: how many people died in 1970 cyclone??



[1m> Entering new RetrievalQA chain...[0m


  warn_deprecated(



[1m> Finished chain.[0m
Inference time: 1.725 sec.

Result:   Almost 300,000 people died in the 1970 cyclone.


In [15]:
query = "give me alll priorities of HFA in a list?"
print(test_rag(qa, query))

Query: give me alll priorities of HFA in a list?



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Inference time: 4.243 sec.

Result:   The priorities of HFA are Continued policy convergence between DRR and development policy, Strengthening measurability and enforcement of policies and planning guidelines related to disaster prevention, and Broadening the scope of HFA to include economic growth and climate change resilience.
None


In [17]:
query = "what was the casualties 1970 cyclone?"
print(test_rag(qa, query))

Query: what was the casualties 1970 cyclone?



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Inference time: 1.163 sec.

Result:   Almost 300,000 deaths.
None


## Document sources


In [19]:
docs = vectordb.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(docs)}")
for doc in docs:
    doc_details = doc.to_json()['kwargs']
    print("Source: ", doc_details['metadata']['source'])
    print("Text: ", doc_details['page_content'], "\n")

Query: whats the 1970 cyclone known for?
Retrieved documents: 4
Source:  /media/mahmudul/ai_ml/team/nuk/llm_chat_bot/pdfs/61b43cd56b233_7.pdf
Text:  of crisis. This is a fundamental aspect of the country’s 
culture. Building on this tradition the Government of 
Bangladesh and its partners established the Cyclone 
Preparedness Programme to utilize volunteers in vulnerable 
coastal areas. In 1970 the killer cyclone caused up to 
300,000 deaths. In 2009 Cyclone Aila killed only 190. 
coastal area. More recently the Government has recruited 
30,000 urban volunteers to work alongside the Fire 
Service and to assist in Urban Search and Rescue and fire 
safety. A pilot programme is underway to utilise the ANSAR 
& Village Defence Force an existing national volunteer 
network of approximately 6 million men and women to 
demonstrated their effectiveness in increasing the numbers 
who evacuate before a cyclone. The social status gained 

Source:  /media/mahmudul/ai_ml/team/nuk/llm_chat_bot/pdfs/