In [16]:
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader, PdfFileReader
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
from langchain.schema.runnable import RunnablePassthrough
from EmbeddingsPDF import EmbeddingsPDF


In [3]:
vectorstore = EmbeddingsPDF().get_chroma_embeddings()

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, BitsAndBytesConfig
import torch

MODEL_NAME = "susnato/phi-2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, return_token_type_ids=False)
tokenizer.pad_token = tokenizer.eos_token

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=None  # Remove quantization for now, add it back later if needed
)



generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 300
generation_config.temperature = 0.7
generation_config.top_p = 0.95
generation_config.do_sample = True
# generation_config.repetition_penalty = 1.15


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [7]:
from transformers import TextGenerationPipeline
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

# text_generation_pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, generation_config=generation_config)

pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)

In [8]:
from langchain import HuggingFacePipeline

llm = HuggingFacePipeline(
    pipeline=pipeline,
)

In [9]:
retriver = vectorstore.as_retriever(  
        search_type="mmr",  
        search_kwargs={'k': 5, 'fetch_k': 8}  
    )  

In [10]:
from langchain import PromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain

template = """
[INST] <>
Act as an Insurance expert. Use the following information to answer the question at the end.
<>

{context}

{question} [/INST]
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriver,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

In [11]:
from IPython.display import Markdown
prompt = "Are there any restrictions on cover provided by Vitality?"

outputs = pipeline(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
Markdown(outputs[0]["generated_text"])

KeyboardInterrupt: 

In [12]:
from IPython.display import Markdown, display
from tqdm.notebook import tqdm

query = "Are there any restrictions on cover provided by Vitality?"
with tqdm(total=1, desc="Processing queries") as pbar:
    result_ = qa_chain(query)
    result = result_["result"].strip()
    pbar.update(1)


display(Markdown(f"<b>{query}</b>"))
display(Markdown(f"<p>{result}</p>"))

Processing queries:   0%|          | 0/1 [00:00<?, ?it/s]

  warn_deprecated(


KeyboardInterrupt: 

In [21]:
emd = EmbeddingsPDF()
file = "healthInsurance.pdf"
pdf_docs = open(file, "rb")
raw_text = emd.get_pdf_text([pdf_docs])
            # 2 Get the text chunks
pdf_text = emd.get_text_chunks(raw_text)


In [25]:
# type(raw_text)

str

In [26]:
# Assuming tokenizer and model are already loaded

# Maximum sequence length supported by the model
max_sequence_length = 1022

# Step 4: Get the prompt from the user
prompt = "What is Vitality"

# Step 5: Combine PDF text with the prompt
input_text = prompt + " " + raw_text

# Step 6: Feed the data into the model and get the output
with torch.no_grad():
    # Tokenize the input text
    token_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt")

    # Check if the input exceeds the maximum sequence length
    if token_ids.size(1) > max_sequence_length:
        # Truncate or split the input into smaller chunks
        input_chunks = [token_ids[:, i:i+max_sequence_length-2] for i in range(0, token_ids.size(1), max_sequence_length-2)]
        outputs = []

        for chunk in input_chunks:
            output_ids = model.generate(
                chunk.to(model.device),
                max_length=max_sequence_length,  # Adjust the maximum length if needed
                do_sample=True,
                temperature=0.3,
                pad_token_id=tokenizer.eos_token_id  # Use eos_token_id as pad_token_id for open-end generation
            )
            outputs.append(output_ids)

        # Concatenate the outputs and get the last one
        output_ids = torch.cat(outputs, dim=1)
        output_ids = output_ids[:, -max_sequence_length+2:]
    else:
        # Input fits within the maximum sequence length
        output_ids = model.generate(
            token_ids.to(model.device),
            max_length=max_sequence_length,  # Adjust the maximum length if needed
            do_sample=True,
            temperature=0.3,
            pad_token_id=tokenizer.eos_token_id  # Use eos_token_id as pad_token_id for open-end generation
        )

# Decode the generated output
output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Output:")
print(output)

# Optionally, you can use st.write to display the output in Streamlit
# st.text("Generated Output:")
# st.write(output)


KeyboardInterrupt: 

In [None]:
user_question = "What is Vitality?"

In [None]:
vectorstore_as_retriever = vectorstore.as_retriever()# Replace with your actual retriever

In [None]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# Create ConversationalRetrievalChain with Phi-2
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=model,
    # retriever=Phi2Retriever(model, tokenizer),
    memory=memory,
    retriever=vectorstore_as_retriever, verbose=True,
)


ValidationError: 2 validation errors for LLMChain
llm
  instance of Runnable expected (type=type_error.arbitrary_type; expected_arbitrary_type=Runnable)
llm
  instance of Runnable expected (type=type_error.arbitrary_type; expected_arbitrary_type=Runnable)

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

In [None]:
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,    max_lengt=512, temperature=0.6, top_p=0.95, repetition_penalty=1.2)
local_llm=HuggingFacePipeline(pipeline=pipe)
pipe.model.config.pad_token_id=pipe.model.config.eos_token_id