In [1]:
# Show which GPUs this session can see (-L prints one line per device).
! nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-7eb4fb73-e5e9-5517-efd0-ebd28e34b45e)
GPU 1: Tesla T4 (UUID: GPU-2351fa78-bf0d-d8d0-9dc6-8515df21181c)


# Installs

In [2]:
%%time

# NOTE(review): -U pulls the latest release of every package, so a re-run may not
# reproduce the recorded outputs (the run shown in this notebook used langchain 0.0.292).

# Retrieval stack: LangChain, PDF parsing and the vector stores imported later (Chroma, FAISS).
! pip install -qq -U langchain tiktoken pypdf chromadb faiss-gpu
# LLM and embedding libraries (transformers pipeline, Instructor embeddings).
! pip install -qq -U transformers InstructorEmbedding sentence_transformers
# Presumably required for device_map='auto' / load_in_8bit model loading and by the
# remote model code -- TODO confirm which of these are actually needed.
! pip install -qq -U accelerate bitsandbytes xformers einops

CPU times: user 407 ms, sys: 89.4 ms, total: 496 ms
Wall time: 38 s


# Imports

In [3]:
import warnings
warnings.filterwarnings("ignore")

import os
import textwrap

import langchain
from langchain.llms import HuggingFacePipeline

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline

print(langchain.__version__)

0.0.292


In [4]:
### Multi-document retriever
from langchain.vectorstores import Chroma, FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA, VectorDBQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

# CFG

In [5]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    # Backbone to load; supported values: wizardlm, llama, bloom, falcon
    model_name = "falcon"

# Define model

In [6]:
def get_model(model = CFG.model_name):
    """
    Load the tokenizer and the 8-bit causal language model for one backbone.

    Args:
        model: backbone name, one of 'wizardlm', 'llama', 'bloom', 'falcon'.
            The default is the value CFG.model_name had when this function was
            defined; pass the name explicitly after changing CFG.

    Returns:
        Tuple (tokenizer, model, max_len, task, T):
            tokenizer -- AutoTokenizer loaded from the backbone's checkpoint
            model     -- AutoModelForCausalLM loaded with load_in_8bit=True and device_map='auto'
            max_len   -- max_length to use for generation (1024 for every backbone)
            task      -- pipeline task name ("text-generation")
            T         -- temperature to use for generation

    Raises:
        NotImplementedError: if `model` is not one of the supported names.
    """
    # backbone name -> (checkpoint id, temperature, extra from_pretrained kwargs)
    model_specs = {
        'wizardlm': ('TheBloke/wizardLM-7B-HF', 0, {}),
        'llama': ("aleksickx/llama-7b-hf", 0.1, {}),
        'bloom': ("bigscience/bloom-7b1", 0, {}),
        # this checkpoint is the only one loaded with trust_remote_code=True
        'falcon': ("h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2", 0, {'trust_remote_code': True}),
    }

    # Dispatch on the argument (not on CFG) and fail loudly on unknown names
    # instead of falling through to a return of undefined variables.
    if model not in model_specs:
        raise NotImplementedError(
            "Not implemented model (tokenizer and backbone): {!r}. Supported: {}".format(
                model, ', '.join(model_specs)
            )
        )

    checkpoint, T, extra_kwargs = model_specs[model]

    print('\nDownloading model: ', model, '\n\n')

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Kept in a separate name so the `model` argument is not overwritten.
    backbone = AutoModelForCausalLM.from_pretrained(checkpoint,
                                                    load_in_8bit=True,
                                                    device_map='auto',
                                                    torch_dtype=torch.float16,
                                                    low_cpu_mem_usage=True,
                                                    **extra_kwargs
                                                   )
    max_len = 1024
    task = "text-generation"

    return tokenizer, backbone, max_len, task, T

# Hugging Face pipeline

In [7]:
%%time

# Load the tokenizer and backbone selected in CFG, together with the generation
# defaults (max length, pipeline task, temperature) that get_model returns.
tokenizer, model, max_len, task, T = get_model(CFG.model_name)


Downloading model:  falcon 




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 8.57 s, sys: 20.9 s, total: 29.4 s
Wall time: 1min 17s


In [8]:
# Build the transformers pipeline around the already-loaded model and tokenizer.
pipe = pipeline(
    task,
    model=model,
    tokenizer=tokenizer,
    # generation settings: the first two come from get_model, the rest are fixed here
    max_length=max_len,
    temperature=T,
    repetition_penalty=1.15,
    top_p=0.95,
)

# Expose the pipeline to LangChain as an LLM object.
llm = HuggingFacePipeline(pipeline=pipe)

In [9]:
llm

HuggingFacePipeline(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7ed9c22a7190>, model_id='gpt2', model_kwargs=None, pipeline_kwargs=None)

# 🦜🔗 Langchain

- Multiple document retriever with LangChain

In [10]:
CFG.model_name

'falcon'

## Loader

- [Directory loader](https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/file_directory.html?highlight=DirectoryLoader#file-directory) for multiple files

In [11]:
%%time

# Collect the PDFs that sit directly in the books folder and parse each with PyPDFLoader.
loader = DirectoryLoader(
    '/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/',
    loader_cls=PyPDFLoader,
    glob="./*.pdf",
    use_multithreading=True,
    show_progress=True,
)

# The recorded run yielded 4114 documents from 7 files -- presumably one per PDF page.
documents = loader.load()

100%|██████████| 7/7 [02:17<00:00, 19.70s/it]

CPU times: user 2min 17s, sys: 1.71 s, total: 2min 18s
Wall time: 2min 17s





In [12]:
%%time

### normalise whitespace in place: tabs and newlines become spaces,
### then runs of 7 down to 2 spaces are squeezed to one (longest first)
whitespace_patterns = ('\t', '\n') + tuple(' ' * width for width in range(7, 1, -1))

for doc in documents:
    cleaned = doc.page_content
    for pattern in whitespace_patterns:
        cleaned = cleaned.replace(pattern, ' ')
    doc.page_content = cleaned

CPU times: user 102 ms, sys: 1.05 ms, total: 103 ms
Wall time: 102 ms


In [13]:
len(documents)

4114

In [14]:
documents[186].page_content

'186could feel it vibrating and let go; it hung in midair, unsupported, at exactly the right height for him to mount it. His eyes moved from thegolden registration number at the top of the handle, right down to theperfectly smooth, streamlined birch twigs that made up the tail. "Who sent it to you?" said Ron in a hushed voice."Look and see if there\'s a card," said Harry.Ron ripped apart the Firebolt\'s wrappings."Nothing! Blimey, who\'d spend that much on you?""Well," said Harry, feeling stunned, "I\'m betting it wasn\'t the Dursleys." I bet it was Dumbledore," said Ron, now walking around and around the Firebolt, taking in every glorious inch. "He sent you the InvisibilityCloak anonymously...." "That was my dad\'s, though," said Harry. "Dumbledore was just Passing it on to me. He wouldn\'t spend hundreds of Galleons on me. He can\'t gogiving students stuff like this --" "That\'s why he wouldn\'t say it was from him!" said Ron. "In case some git like Malfoy said it was favoritism. Hey

## Splitter

- Split the text into chunks small enough to be embedded individually and for several of them to fit into the LLM prompt
- [RecursiveCharacterTextSplitter](https://python.langchain.com/en/latest/reference/modules/document_loaders.html?highlight=RecursiveCharacterTextSplitter#langchain.document_loaders.MWDumpLoader)

In [15]:
# Cut every loaded document into pieces of at most chunk_size, with no overlap between
# neighbouring pieces (chunk_size is presumably measured in characters -- TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# Number of chunks that will be embedded in the next step.
len(texts)

8213

## Create vector database

In [16]:
%%time
### this takes ~35 min to run

# Folder (relative to the working directory) where Chroma stores the index on disk.
persist_directory = 'harry-potter-vectordb-chroma'

### download embeddings model
# The embedding model is placed on the GPU; this cell fails on a CPU-only session.
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", 
                                                      model_kwargs={"device": "cuda"})

### create embeddings and DB
# Embeds every chunk in `texts` and stores it in the 'hp_books' collection.
# NOTE(review): the commented-out "Load vector database" cell does not pass
# collection_name -- presumably it must also use 'hp_books' to find these vectors; confirm.
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=instructor_embeddings,
                                 persist_directory=persist_directory,
                                 collection_name='hp_books')



### persist Chroma database
# Write the index to persist_directory so later sessions can reuse it.
vectordb.persist()

load INSTRUCTOR_Transformer
max_seq_length  512
CPU times: user 33min 42s, sys: 15.4 s, total: 33min 57s
Wall time: 34min 6s


## Load vector database

Once the vector database has been persisted, later runs can load it from disk instead of recomputing the embeddings

In [17]:
# %%time

# load_directory = '/kaggle/input/harry-potter-vectordb-chroma'

# ### download embeddings model
# instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", 
#                                                       model_kwargs={"device": "cuda"})

# vectordb = Chroma(persist_directory=load_directory, embedding_function=instructor_embeddings)

## Retriever chain

- Retriever to retrieve relevant passages
- Chain to answer questions
- [RetrievalQA: Chain for question-answering against an index](https://python.langchain.com/en/latest/reference/modules/chains.html?highlight=RetrievalQA#langchain.chains.RetrievalQA)

In [18]:
# search_type is an argument of as_retriever() itself; search_kwargs is what gets
# forwarded to the vector store's search call, so it should only carry search
# parameters such as k (the number of chunks returned per query).
retriever = vectordb.as_retriever(search_type="similarity",
                                  search_kwargs={"k": 3})

# "stuff" puts all retrieved chunks into one prompt. return_source_documents=True
# keeps the retrieved chunks in the response so their sources can be cited.
qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                       chain_type="stuff", 
                                       retriever=retriever, 
                                       return_source_documents=True,
                                       verbose=False)

In [19]:
qa_chain

RetrievalQA(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, combine_documents_chain=StuffDocumentsChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, input_key='input_documents', output_key='output_text', llm_chain=LLMChain(memory=None, callbacks=None, callback_manager=None, verbose=False, tags=None, metadata=None, prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=None, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:", template_format='f-string', validate_template=True), llm=HuggingFacePipeline(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7ed9

## Post-process outputs

- Format llm response
- Cite sources (documents)

In [20]:
def wrap_text_preserve_newlines(text, width=110):
    """
    Wrap `text` to at most `width` columns without losing its existing line breaks.

    Each newline-separated line is wrapped on its own with textwrap.fill and the
    results are joined back with newlines, so blank lines stay blank.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width)
        for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """
    Print the chain's answer followed by the source of each retrieved document.

    `llm_response` is read through its 'result' and 'source_documents' keys; every
    source document must expose a `metadata` mapping with a 'source' entry.
    Nothing is returned.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))

    print('\n\nSources:')
    retrieved_docs = llm_response["source_documents"]
    for doc in retrieved_docs:
        origin = doc.metadata['source']
        print(origin)

In [21]:
def llm_ans(query):
    """
    Run `query` through the retrieval QA chain and print the answer with its sources.

    Returns the result of process_llm_response, which only prints, so this is None.
    """
    return process_llm_response(qa_chain(query))

# Ask questions

- Question Answering from multiple documents
- Run QA Chain

In [22]:
CFG.model_name

'falcon'

In [23]:
model

RWForCausalLM(
  (transformer): RWModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x DecoderLayer(
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
        (self_attention): Attention(
          (maybe_rotary): RotaryEmbedding()
          (query_key_value): Linear8bitLt(in_features=4544, out_features=4672, bias=False)
          (dense): Linear8bitLt(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): MLP(
          (dense_h_to_4h): Linear8bitLt(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear8bitLt(in_features=18176, out_features=4544, bias=False)
        )
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)

In [24]:
query = "What happened to Harry Potter's parents?"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 Harry Potter's parents were killed by Voldemort when he was a baby.


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf


In [25]:
query = "Who is Hagrid? Where does he live?"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 Hagrid is a character in the Harry Potter series. He lives in a small wooden house on the edge of the
forbidden forest.


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 1 - The Sorcerers Stone.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 6 - The Half-Blood Prince.pdf


In [26]:
query = "Who revived Voldemort?"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 Dumbledore


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 4 - The Goblet of Fire.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 6 - The Half-Blood Prince.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 7 - The Deathly Hallows.pdf


In [27]:
query = "How was Voldemort ressurected?"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 Voldemort was resurrected by using Bertha Jorkins’s information, of course. Use my one faithful Death Eater,
stationed at Hogwarts, to ensure that the boy’s name was entered into the Goblet of Fire. Use my Death Eater
to ensure that the boy won the tournament — that he touched the Triwizard Cup first — the cup which my Death
Eater had turned into a Portkey, which would bring him here, beyond the reach of Dumbledore’s help and
protection, and into my waiting arms. And here he is... the boy you all believed had been my downfall....


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 4 - The Goblet of Fire.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 4 - The Goblet of Fire.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 4 - The Goblet of Fire.pdf


In [28]:
query = "What are the four Hogwarts houses??"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 Gryffindor, Hufflepuff, Ravenclaw, and Slytherin.


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 1 - The Sorcerers Stone.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 4 - The Goblet of Fire.pdf


In [29]:
query = "What is the name and species of Harry Potter's pet?"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 The name of Harry Potter's pet is a cat named Crookshanks.


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 1 - The Sorcerers Stone.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 1 - The Sorcerers Stone.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 1 - The Sorcerers Stone.pdf


In [30]:
query = "What does a Mandrake do?"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 A Mandrake is a plant that is used in potions to restore people to their original state. It is also dangerous
because it has a loud, painful cry that can knock someone unconscious.


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 2 - The Chamber of Secrets.pdf


In [31]:
query = "How does Harry Potter got into Hogwarts?"
llm_ans(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


 Harry Potter got into Hogwarts by using the invisibility cloak that he received from Hagrid.


Sources:
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 3 - The Prisoner of Azkaban.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 3 - The Prisoner of Azkaban.pdf
/kaggle/input/harry-potter-books-in-pdf-1-7/HP books/Harry Potter - Book 5 - The Order of the Phoenix.pdf
