In [1]:
# Embedding model identifiers that config["embedding"] can be set to.
# Only EMB_SBERT_MPNET_BASE has a matching factory/dispatch branch in this notebook;
# the other two are declared but not wired up in the cells shown.
EMB_OPENAI_ADA = "text-embedding-ada-002"
EMB_INSTRUCTOR_XL = "hkunlp/instructor-xl"
EMB_SBERT_MPNET_BASE = "sentence-transformers/all-mpnet-base-v2" 

In [2]:
# LLM identifiers that config["llm"] can be set to.
# Only LLM_FLAN_T5_BASE is dispatched to a factory in the cells shown; the three
# FLAN-T5 SMALL/BASE/LARGE names are also used to decide whether the custom T5 prompt is applied.
LLM_OPENAI_GPT35 = "gpt-3.5-turbo"
LLM_FLAN_T5_XXL = "google/flan-t5-xxl"
LLM_FLAN_T5_XL = "google/flan-t5-xl"
LLM_FASTCHAT_T5_XL = "lmsys/fastchat-t5-3b-v1.0"
LLM_FLAN_T5_SMALL = "google/flan-t5-small"
LLM_FLAN_T5_BASE = "google/flan-t5-base"
LLM_FLAN_T5_LARGE = "google/flan-t5-large"
LLM_FALCON_SMALL = "tiiuae/falcon-7b-instruct"

In [3]:
from langchain.document_loaders import PDFPlumberLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
import torch
from transformers import AutoTokenizer
import re
import os


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Run configuration read by the cells below to pick the embedding model and the LLM.
config = {
    # Forwarded to Chroma.from_documents as persist_directory.
    "persist_directory": None,
    # Forwarded to the LLM factory as load_in_8bit.
    "load_in_8bit": False,
    "embedding": EMB_SBERT_MPNET_BASE,
    "llm": LLM_FLAN_T5_BASE,
}

In [5]:
def create_sbert_mpnet():
    """Build the embedding wrapper for the SBERT MPNet model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance created for ``EMB_SBERT_MPNET_BASE``
        with ``model_kwargs={"device": ...}``, where the device is ``"cuda"`` when
        ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbeddings(
        model_name=EMB_SBERT_MPNET_BASE,
        model_kwargs={"device": target_device},
    )


def create_flan_t5_base(load_in_8bit=False):
    """Create a Hugging Face ``text2text-generation`` pipeline for FLAN-T5 base.

    The returned pipeline is intended to be wrapped in ``HuggingFacePipeline``
    for use with LangChain.

    Args:
        load_in_8bit: Value placed under the ``"load_in_8bit"`` key of the
            ``model_kwargs`` handed to ``transformers.pipeline``.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    # Use the shared constant instead of a second hard-coded copy of the checkpoint
    # id, so the factory cannot drift from the value the config dispatch compares against.
    model_name = LLM_FLAN_T5_BASE
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline(
        task="text2text-generation",
        model=model_name,
        tokenizer=tokenizer,
        max_new_tokens=100,
        # NOTE(review): when the chain is queried, the notebook output shows the warning
        # "Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set.
        # `max_new_tokens` will take precedence." -- the "max_length" entry below looks
        # redundant; confirm against the installed transformers version before removing it.
        model_kwargs={
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            "max_length": 512,
            "temperature": 0.0,
        },
    )

def create_falcon_instruct_small(load_in_8bit=False):
    """Create a Hugging Face ``text-generation`` pipeline for Falcon-7B-Instruct.

    Args:
        load_in_8bit: Value placed under the ``"load_in_8bit"`` key of the
            ``model_kwargs`` handed to ``transformers.pipeline``.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    # Use the shared constant instead of a second hard-coded copy of the checkpoint id.
    model_name = LLM_FALCON_SMALL

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    hf_pipeline = pipeline(
        task="text-generation",
        model=model_name,
        tokenizer=tokenizer,
        # The pipeline is explicitly allowed to run code shipped with the checkpoint.
        trust_remote_code=True,
        max_new_tokens=100,
        model_kwargs={
            "device_map": "auto",
            "load_in_8bit": load_in_8bit,
            "max_length": 512,
            "temperature": 0.01,
            "torch_dtype": torch.bfloat16,
        },
    )
    return hf_pipeline

In [6]:
# Build the embedding model selected in config. Fail here, with a clear message,
# rather than letting a later cell hit a NameError on an undefined `embedding`.
if config["embedding"] == EMB_SBERT_MPNET_BASE:
    embedding = create_sbert_mpnet()
else:
    raise ValueError(
        f"Unsupported embedding model {config['embedding']!r}; "
        f"only {EMB_SBERT_MPNET_BASE!r} has a factory in this notebook."
    )

In [7]:
# Build the LLM pipeline selected in config. Fail here, with a clear message,
# rather than letting a later cell hit a NameError on an undefined `llm`.
load_in_8bit = config["load_in_8bit"]
if config["llm"] == LLM_FLAN_T5_BASE:
    llm = create_flan_t5_base(load_in_8bit=load_in_8bit)
else:
    raise ValueError(
        f"Unsupported LLM {config['llm']!r}; "
        f"only {LLM_FLAN_T5_BASE!r} is dispatched in this notebook."
    )

In [48]:
# Load every PDF in the data directory into one flat list of documents.
# (`os` comes from the notebook's import cell.)
path = "./data"
# Sorted so the load order, and therefore the document order, is the same on every run.
dir_list = sorted(os.listdir(path))
documents = []
# Show what was found in the directory.
print(dir_list)
for file_name in dir_list:
    # Skip anything that is not a PDF (hidden files, checkpoints, sub-directories, ...).
    if not file_name.lower().endswith(".pdf"):
        continue
    # os.path.join inserts the separator that plain `path + file_name` would omit
    # ("./data" + "B1.pdf" -> "./dataB1.pdf").
    loader = PDFPlumberLoader(os.path.join(path, file_name))
    documents.extend(loader.load())

['B1.pdf', 'B2.pdf', 'B3.pdf']


In [38]:
# Load a single PDF instead of the whole directory.
# NOTE(review): this rebinds `documents`, replacing the result of the multi-file
# loading cell -- run only one of the two loading cells.
# os.path.join supplies the separator that `path + "B1.pdf"` would omit when
# `path` has no trailing slash.
pdf_path = os.path.join(path, "B1.pdf")
loader = PDFPlumberLoader(pdf_path)
documents = loader.load()

In [49]:
# Quick sanity check on the loaded documents: container type, then element count.
print(type(documents), len(documents), sep="\n")


<class 'list'>
619


In [39]:
# Stage 1: character-based split of the loaded documents.
char_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
char_chunks = char_splitter.split_documents(documents)

# Stage 2: token-based split of the stage-1 chunks.
# "cl100k_base" is the encoding for text-embedding-ada-002.
# NOTE(review): config currently selects the SBERT MPNet embedding, not ada-002 --
# confirm this encoding and chunk size are still the intended ones.
text_splitter = TokenTextSplitter(
    chunk_size=1000,
    chunk_overlap=10,
    encoding_name="cl100k_base",
)
texts = text_splitter.split_documents(char_chunks)


In [10]:
# Embed the chunks and index them in a Chroma vector store.
# The persist directory comes straight from config (currently None).
persist_directory = config["persist_directory"]
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)
 

In [11]:
# Assemble the retrieval-QA chain: a retriever over the vector store (k=4 passed
# as a search kwarg) plus the Hugging Face pipeline wrapped for LangChain.
retriever = vectordb.as_retriever(search_kwargs={"k": 4})
hf_llm = HuggingFacePipeline(pipeline=llm)
qa = RetrievalQA.from_chain_type(
    llm=hf_llm,
    chain_type="stuff",
    retriever=retriever,
)

In [27]:
# Debug check: display the concrete class of the chain built by RetrievalQA.from_chain_type.
type(qa)

langchain.chains.retrieval_qa.base.RetrievalQA

In [12]:
# For the small/base/large FLAN-T5 checkpoints, replace the prompt of the chain's
# inner LLM chain with a terse context/question/answer template.
if config["llm"] in (LLM_FLAN_T5_SMALL, LLM_FLAN_T5_BASE, LLM_FLAN_T5_LARGE):
    # Spelled with explicit escapes so the leading/trailing whitespace of the
    # template is visible and cannot be lost by an editor.
    question_t5_template = (
        "\n"
        "    context: {context}\n"
        "    question: {question}\n"
        "    answer: \n"
        "    "
    )
    QUESTION_T5_PROMPT = PromptTemplate(
        input_variables=["context", "question"],
        template=question_t5_template,
    )
    qa.combine_documents_chain.llm_chain.prompt = QUESTION_T5_PROMPT

In [26]:
# Ask one question; source documents are returned alongside the answer and the
# combine-documents chain is kept quiet.
question = "how to eat mangos?"
qa.return_source_documents = True
qa.combine_documents_chain.verbose = False
qa({"query": question})

Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


{'query': 'how to eat mangos?',
 'result': 'eat them with a spoon',
 'source_documents': [Document(page_content='108 Chapter Four\nSensory and Perceptual Development\nBody growth, sensory and perceptual development, motor skill development,\nand brain development are all undergoing major changes during this period\nof growth and development. This results in more growth during this age\nperiod than at any other time in a child’s life.\nMotor Development\nBetween the ages of 12 and 15 months old, a toddler can hold a single cube\nin one hand while grasping another, can drop a cube into a cup, begin to\ngrasp with forefingers, uncover hidden objects, rattle a spoon in the cup,\nmaintain a standing position with some assistance, begin walking, exploring\nand touching everything, walk flat-footed with a wide gait, enjoy the sense\nof spatial relationship, have increased knowledge of the environment\nthrough walking, enjoy feeling and stroking, and roll over with ease.\nBetween the ages of 1