In [22]:
from dotenv import dotenv_values
import os,openai

# Parse the local .env file into a mapping of variable name -> value.
env_vars = dotenv_values(".env")

# Only the variable names are printed; the values are not echoed to the output.
for key in env_vars:
    print(key)

HUGGINGFACEHUB_API_TOKEN
PINECONE_API_KEY
PINECONE_ENV


## Reading data from a URL and writing it to a file

In [17]:
import requests

def get_data_from_url(url, fileName, timeout=30):
    """Download the text served at ``url`` and save it to ``fileName``.

    Args:
        url: Address fetched with an HTTP GET request.
        fileName: Path of the file that is created or overwritten with the
            response text.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        None. The only result is the file written to disk; load that file
        afterwards if its contents are needed.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as data.
    res.raise_for_status()

    with open(fileName, "w") as f:
        f.write(res.text)

## Document Loader

In [3]:
#https://python.langchain.com/en/latest/modules/indexes/getting_started.html
from langchain.document_loaders import TextLoader

def load_textFile(fileName):
    """Load a single file through ``TextLoader``.

    Args:
        fileName: Path handed to ``TextLoader``.

    Returns:
        The result of ``TextLoader(fileName).load()``.
    """
    return TextLoader(fileName).load()
    
    
# Load a directory and create documents from it
#https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/file_directory.html
from langchain.document_loaders import DirectoryLoader

def load_dir(directory):
    """Load the contents of a directory through ``DirectoryLoader``.

    Args:
        directory: Path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader(directory).load()``.
    """
    return DirectoryLoader(directory).load()

## Document splitter

In [4]:
#https://python.langchain.com/en/latest/modules/indexes/text_splitters/examples/huggingface_length_function.html

from transformers import GPT2TokenizerFast
from langchain.text_splitter import CharacterTextSplitter

# Module-level "gpt2" tokenizer; split_docs passes it to
# CharacterTextSplitter.from_huggingface_tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def split_docs(documents, chunk_size, chunk_overlap):
    """Split documents into chunks with a tokenizer-backed text splitter.

    Args:
        documents: Documents handed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    # The splitter is built around the module-level ``tokenizer``.
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, **splitter_options
    )
    return splitter.split_documents(documents)


## Creating Embeddings and storing in vectorstore

In [6]:
#https://python.langchain.com/en/latest/modules/models/text_embedding/examples/instruct_embeddings.html
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model created with HuggingFaceEmbeddings' default arguments;
# create_faiss_index passes it to FAISS.from_documents.
embeddings = HuggingFaceEmbeddings()

# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
def create_faiss_index(docs):
    """Build a FAISS vector store from ``docs``.

    Args:
        docs: Documents handed to ``FAISS.from_documents``.

    Returns:
        The store returned by ``FAISS.from_documents``, built with the
        module-level ``embeddings`` object.
    """
    return FAISS.from_documents(docs, embeddings)

## Search the vectorstore

In [7]:
def find_similar_doc_from_index(index, query, k):
    """Run a similarity search on ``index``.

    Args:
        index: Object exposing ``similarity_search(query, k=...)``.
        query: Query forwarded to the search.
        k: Forwarded to the search as the keyword argument ``k``.

    Returns:
        Whatever ``index.similarity_search`` returns.
    """
    return index.similarity_search(query, k=k)

## Create QA chain

In [13]:
##https://python.langchain.com/en/latest/use_cases/question_answering.html
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

# Hosted model used to answer questions; the API token comes from the
# values read out of .env (env_vars).
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 0.8, "max_length": 64},
    huggingfacehub_api_token=env_vars["HUGGINGFACEHUB_API_TOKEN"],
)

# Question-answering chain built on that model with chain_type="stuff".
chain = load_qa_chain(llm, chain_type="stuff")

def get_answer(index, query, k=3):
    """Answer ``query`` from the documents most similar to it in ``index``.

    Args:
        index: Vector store searched via ``find_similar_doc_from_index``.
        query: The question; used both for the similarity search and as the
            ``question`` passed to the QA chain.
        k: Passed to the similarity search as ``k``. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        The result of ``chain.run`` for the retrieved documents and question.
    """
    ip_docs = find_similar_doc_from_index(index, query, k=k)
    # The module-level ``chain`` receives the retrieved documents as input.
    answer = chain.run(input_documents=ip_docs, question=query)
    return answer

## Test the above


In [18]:
# For data from a url
url = "https://raw.githubusercontent.com/hwchase17/langchain/master/docs/modules/state_of_the_union.txt"
fileName = "state_of_the_union.txt"
# get_data_from_url only writes the file and returns None, so download first
# and then load the saved file to obtain the documents.
get_data_from_url(url, fileName)
documents = load_textFile(fileName)

In [19]:
# For data from directory
data_dir = "data"
documents = load_dir(data_dir)

# Chunk the documents (chunk_size=300, no overlap) and index the chunks.
docs = split_docs(documents, chunk_size=300, chunk_overlap=0)
index = create_faiss_index(docs)

# The bare call on the last line makes the notebook display the answer.
query = "what is machine learning"
get_answer(index, query)

'Machine learning is the foundation of countless important applications, including web search, email anti-spam'