In [2]:
import os 
from dotenv import load_dotenv
from pathlib import Path

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [8]:
# Load secrets from the local .env file so the API key never appears in the notebook.
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# os.environ only accepts strings: assigning None would fail with an opaque
# TypeError, so surface a clear, actionable message when the key is missing.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the .env file or export it "
        "in the environment before running this notebook."
    )
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

In [3]:
## extract data from the pdf

def load_pdf(data):
    """Load every ``*.pdf`` file found in the directory ``data``.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
#!pip install pypdf

In [6]:
# Load all PDFs from the relative "data/" directory; the result feeds the chunking step.
extracted_data=load_pdf("data/")

In [7]:
# Sanity check of the extraction. Index 100 is arbitrary and raises IndexError
# if fewer than 101 documents were loaded.
extracted_data[100] # sample data

Document(page_content='scores are uniformly distributed, we might then suppose that their average is\nalso uniformly distributed. \nBut is this actually true? \nBegin by considering the\nextremes: there is only one way to obtain a mean test score of 300; both\nindividuals must score 300. \nSimilarly, to obtain a mean of 0, both individuals\nmust score 0. \nBy contrast, consider a mean of 150. \nThis could result from a\nnumber of individual score combinations, for example (\nscore\nA\n, \nscore\nB\n) = :\n(150,150),(100,200),(125,175). \nIntuitively, there are many more ways to\nobtain moderate values for the sample mean than there are for the extremes.\nThis central tendency of the sample mean increases along with sample size,\nsince extreme values then require more individual scores to be\nsimultaneously extreme, which is less likely. \nThis effect is visible in \nFigure\n3.15\n; however, we also see another impact on the probability distribution for\nthe mean: as our sample size inc

In [8]:
#Create text chunks

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf``).
        chunk_size: Maximum size of each chunk passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            previously hard-coded here.
        chunk_overlap: Amount of text shared between consecutive chunks so
            that context is not lost at chunk boundaries. Defaults to 20.

    Returns:
        The list of chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,  # overlap between neighbouring chunks
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk every loaded document, then report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"length of chunks:  {len(text_chunks)}")

length of chunks:  2443


In [25]:
#download embedding model
def download_hf_embeddings():
    """Build the HuggingFace embedding wrapper used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [26]:
# Instantiate the embedding model once; later cells reuse it for indexing and querying.
embeddings=download_hf_embeddings()

In [27]:
# Display the embedding object's repr to inspect the loaded model configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [28]:
# Smoke test: embed a short string and show the length of the resulting vector.
query_result = embeddings.embed_query("Hello World")
print(f"Length {len(query_result)}")

Length 384


In [49]:
import speech_recognition as sr
from gtts import gTTS
import os
import time
import playsound
import random 

def speak(text):
    """Convert ``text`` to speech with gTTS and play it through the speakers.

    The audio is written to a uniquely named temporary mp3 file, which is
    removed once playback has finished. The previous implementation picked a
    name from 99 x 99 random combinations in the working directory, which
    could collide and left every generated file behind.

    Args:
        text: The sentence(s) to read out. Empty or whitespace-only text is
            skipped because there is nothing to synthesise.

    Returns:
        None.
    """
    import tempfile

    # Nothing to say: avoid calling the TTS service with empty input.
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='en')

    # mkstemp guarantees a fresh file name; close the descriptor right away so
    # that gTTS and the audio backend can open the path themselves.
    fd, filename = tempfile.mkstemp(prefix='voice_', suffix='.mp3')
    os.close(fd)
    try:
        tts.save(filename)
        playsound.playsound(filename)
    finally:
        # Best-effort cleanup: some audio backends may keep the file locked.
        try:
            os.remove(filename)
        except OSError:
            pass
    return None

#speak("Hi Mayank")


In [35]:
def get_audio():
    """Record one utterance from the default microphone and transcribe it.

    The microphone is only held open while recording; the transcription
    request to the Google Web Speech API happens after it is released.

    Returns:
        The recognised text, or an empty string when recognition fails.
        Failures are reported on stdout instead of being raised, so callers
        must treat ``""`` as "nothing understood".
    """
    recognizer = sr.Recognizer()

    # Keep the audio device open only for the duration of the recording.
    with sr.Microphone() as source:
        audio = recognizer.listen(source)

    said = ""
    try:
        said = recognizer.recognize_google(audio)
        print("Query: "+said)
    except sr.UnknownValueError:
        # This exception carries no message, so printing str(e) would show
        # nothing useful to the user.
        print("Exception: could not understand the audio")
    except Exception as e:
        # Deliberate best-effort: any other failure (e.g. network problems)
        # is reported and turned into an empty result.
        print("Exception: " + str(e))

    return said


In [21]:
# Manual microphone test: capture one utterance and echo the transcription.
# get_audio() already prints "Query: ..." itself, so the text appears twice.
text_gen=get_audio()
print(text_gen)

Delhi's capital of India
Delhi's capital of India


In [29]:
# Wrap the Pinecone index called `index_name` in a LangChain vector store that
# uses `embeddings` for its embedding function.
# NOTE(review): presumably the index must already exist and match the
# embedding dimension (384 in the output above) - confirm.
index_name="mchatbot"
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

In [12]:
## Populate the Pinecone index (knowledge base) with the embedded text chunks
index_name="mchatbot"
# Alternative kept for reference: from_documents takes the chunk objects directly,
# whereas the call below only passes their page_content strings.
#docsearch=PineconeVectorStore.from_documents(text_chunks, embeddings, index_name=index_name)
docs_chunks =[t.page_content for t in text_chunks]
# NOTE(review): re-running this cell presumably uploads the same chunks again - confirm
# whether duplicates are created before using Restart & Run All.
docsearch=PineconeVectorStore.from_texts(docs_chunks ,embeddings, index_name=index_name)

In [36]:
# Retrieval smoke test: ask a spoken question and show the best-matching chunk.
# For a typed query use e.g.: query = "What is Bayesian inference?"
query = get_audio()

if not query:
    # get_audio() returns "" when nothing was recognised; searching with an
    # empty query would be meaningless.
    print("No query recognised; nothing to search for.")
else:
    docs = vectorstore.similarity_search(query)
    if docs:
        print(docs[0].page_content)
    else:
        # Guard against IndexError when the index returns no matches.
        print("No matching documents found.")

Query: explain posterior in detail
the posterior distribution:
The posterior is the synthesis of past experience and information from
observed data and represents our updated state of knowledge. 
The uncertainty
in the posterior is usually (although not always) reduced compared to the
prior because the data allows us to better understand the world.


In [40]:
# Prompt given to the LLM. {context} and {question} are the two placeholders
# declared as input_variables when the PromptTemplate is built.
prompt_template=""" 
Use the following pieces of information to answer the user's question.
If you don't know the answer, just state that you don't know, don't try to make up an answer.


Context:{context}
Question:{question}

Only return the helpful answer below and nothing else.

Helpful answer: 
"""

In [41]:
# Build the prompt object from the template string and package it in the
# kwargs dict that is later passed to RetrievalQA.from_chain_type.
PROMPT=PromptTemplate(template=prompt_template,input_variables=["context","question"])
chain_type_kwargs={"prompt":PROMPT}

In [42]:
# Load the local model file through CTransformers. The path is relative to the
# notebook's working directory; generation settings are passed via `config`.
llm=CTransformers(model="model/llama-7b.ggmlv3.q4_1.bin",
                 model_type="llama",
                 config={'max_new_tokens':256,
                        'temperature':0.8})

In [53]:

# Assemble the retrieval-augmented QA chain: the Pinecone-backed retriever
# supplies context and the custom PROMPT (via chain_type_kwargs) formats it
# for the LLM. search_kwargs={'k':1} asks the retriever for a single result.
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={'k':1}),
    chain_type_kwargs=chain_type_kwargs)

In [54]:
# Voice chat loop: listen for a question, answer it with the RAG chain, print
# the answer and read it aloud. Saying "exit" ends the session.
while True:
    # For keyboard input instead of the microphone: user_input = input("Input Prompt:")
    user_input = get_audio()
    print("=" * 100)

    # Normalise once so that "Exit", "EXIT" or " exit " all stop the loop.
    command = user_input.strip().lower()
    if command == 'exit':
        print('Shutting down RAG')
        break

    if not command:
        # get_audio() returns "" when recognition fails; do not spend an LLM
        # call (and a spoken reply) on an empty question.
        print("No speech recognised, please try again.")
        continue

    result = qa({"query": user_input})
    answer = result["result"]
    print("Response : ", answer)
    speak(answer)
    

Query: what is central limit theorem
Response :  The Central Limit Theorem states that, if we have enough observations in a sample, then the sampling distribution of the mean will look approximately normal. 
This statement implies that an increase in size of the sample increases the probability that the sampling distribution of the mean will be normally distributed. 
Based on this fact and Figure 3.15 (which shows the histogram for
an unweighted random sample with 20 observations), we can say that the sampling
distribution will probably not have a normal distribution if the size of the
sample is less than or equal to about 20. 


"""
from numpy import linspace, randint, array, pi, cos, sin, pi / 4, pi * sqrt(3)
from scipy.stats import gamma
import matplotlib.pyplot as plt


def get_answer():
    """Return the answer."""
    return "The Central Limit Theorem states that, if we have enough observations in a sample, then the sampling distribution of the mean will look approximately normal