In [3]:
import os 
from dotenv import load_dotenv
from pathlib import Path

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

from pinecone import Pinecone, ServerlessSpec

from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [4]:
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)
PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
os.environ['PINECONE_API_KEY']=PINECONE_API_KEY

In [5]:
## extract data from the pdf

def load_pdf(data):
    loader=DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    documents=loader.load() 

    return documents

In [6]:
extracted_data=load_pdf("data/")

In [7]:
len(extracted_data) #sample data

674

In [8]:
extracted_data[100]

Document(page_content='scores are uniformly distributed, we might then suppose that their average is\nalso uniformly distributed. \nBut is this actually true? \nBegin by considering the\nextremes: there is only one way to obtain a mean test score of 300; both\nindividuals must score 300. \nSimilarly, to obtain a mean of 0, both individuals\nmust score 0. \nBy contrast, consider a mean of 150. \nThis could result from a\nnumber of individual score combinations, for example (\nscore\nA\n, \nscore\nB\n) = :\n(150,150),(100,200),(125,175). \nIntuitively, there are many more ways to\nobtain moderate values for the sample mean than there are for the extremes.\nThis central tendency of the sample mean increases along with sample size,\nsince extreme values then require more individual scores to be\nsimultaneously extreme, which is less likely. \nThis effect is visible in \nFigure\n3.15\n; however, we also see another impact on the probability distribution for\nthe mean: as our sample size inc

In [21]:
#Create text chunks

def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20) ## overlapp b/w embedddings
    text_chunks=text_splitter.split_documents(extracted_data)

    return text_chunks

In [22]:
text_chunks=text_split(extracted_data)
print("length of chunks: ",len(text_chunks))

length of chunks:  2625


In [23]:
#download embedding model , and sentence 
def download_hf_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    return embeddings

In [24]:
embeddings=download_hf_embeddings()

In [25]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [26]:
query_result=embeddings.embed_query("Hello World")
print("Length",len(query_result))

Length 384


In [27]:
import speech_recognition as sr
from gtts import gTTS
import os
import time
import playsound
import random 

def speak(text):
    tts = gTTS(text=text, lang='en')
    ls=[i for i in range(1,100)]
    r1=random.choice(ls)
    r2=random.choice(ls)
    filename = 'voice_'+str(r1)+'_'+str(r2)+'.mp3'
    tts.save(filename)
    playsound.playsound(filename)

#speak("Hi Mayank")


In [28]:
def get_audio():
	r = sr.Recognizer()
	with sr.Microphone() as source:
		audio = r.listen(source)
		said = ""

		try:
		    said = r.recognize_google(audio)
		    print("Query: "+said)
		except Exception as e:
		    print("Exception: " + str(e))

	return said


In [30]:
index_name="mchatbot"
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

In [31]:
## Initializing a given pinecone index/knowledge base
index_name="mchatbot"
#docsearch=PineconeVectorStore.from_documents(text_chunks, embeddings, index_name=index_name)
docs_chunks =[t.page_content for t in text_chunks]
docsearch=PineconeVectorStore.from_texts(docs_chunks ,embeddings, index_name=index_name)

In [34]:
#query = "What is Bayesian inference?"
query=get_audio()
#query="For highly viscous droplets, how much charge is lost by the parent droplet during Rayleigh fission?"
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

Query: for highly precious droplets how much charge is lost by the parent droplet during rally fission
is observed, accompanied by ion emission which is however insufﬁcient to prevent theCoulomb explosion. Ion emission and the smaller progeny droplets account for 24 % and
16 % of the initial charge
, respectively.
Key words: breakup/coalescence, electrohydrodynamic effects
†Email address for correspondence: mgameroc@uci.edu
© The Author(s), 2023. Published by Cambridge University Press. This is an Open Access article,


In [35]:
prompt_template=""" 
Use the following pieces of information to answer the user's question.
If you don't know the answer, just state that you don't know, don't try to make up an answer.


Context:{context}
Question:{question}

Only return the helpful answer below and nothing else.

Helpful answer: 
"""

In [36]:
PROMPT=PromptTemplate(template=prompt_template,input_variables=["context","question"])
chain_type_kwargs={"prompt":PROMPT}

In [37]:
llm=CTransformers(model="model/llama-7b.ggmlv3.q4_1.bin",
                 model_type="llama",
                 config={'max_new_tokens':512,
                        'temperature':0.8})

In [38]:

qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={'k':2}),
    chain_type_kwargs=chain_type_kwargs)

In [41]:
while True:
    #user_input=input(f"Input Prompt:")
    user_input=get_audio()
    print("".join(["*"]*100))
    if(user_input=='exit' or user_input=='Exit'):
        print('Shutting down RAG')
        break
    result=qa({"query":user_input})
    print("Response : ",result["result"])
    print("".join(["*"]*100))
    speak(result["result"])
    

****************************************************************************************************
Response :  

Ionic liquids with very large radii (e.g., C6H5O5-CH3OH) are known to be unstable at low ow rates, because they can undergo Rayleigh fission when a droplet is in ight.
This instability is well described by ρV(<) where V is the ow rate of liquid and R is the radius
of ionic liquid. 

In the ionic liquids EMI-Im,MIs that do not I-Im, MI/F2A-Im (CMI-Im-Im-Im-Im-Im(NOIM-10403-Im and C6HMI-Im, M IMI-IMiMe-Im-im or -Im–TGIL, N , R 
mi-MI-MIm and Im, which have notably to J MI-O/O21405OH-Im-Im-Im, MI, HIM-EMI-IM (EMIm-Im-Im, M IMI-Im, the ionic-Im, M IMI-Im and EMI-MII-MI-Im, M IMI-AL (HIOA (em Im and F 09 and CH3O33S-Im-Im and BMlEMI-EMiMIm and TMSI and DMS-Im, M IMI-Im, the ion and C6HIM-Im+27HT-Im,MI-MI-102-MI-C-Im, M IMI-Im, M IMI-L403-Im(im− and F MI-EM-Im-MI-MI-Im (CH-N3MII-Im, the radius is Im-Im–F3 [EI-Im with very small radii that under test electrospontheIM-ImiO7026HMDL