#### Install some necessary libraries

In [None]:
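# Uncomment and run once per environment to install the notebook's dependencies.
# %pip install streamlit
# %pip install langchain
# %pip install unstructured
# %pip install openai
# %pip install tiktoken
# On machines without a CUDA GPU, install faiss-cpu instead of faiss-gpu.
# %pip install faiss-gpu
# %pip install dill
# python-dotenv provides the `dotenv` module imported in the next cell.
# %pip install python-dotenv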

In [2]:
import os
import streamlit as st
import pickle
import time
import langchain

from dotenv import load_dotenv
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS


In [None]:
# Load variables from a local .env file (if one exists) into the process
# environment, then read the OpenAI key from it.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Never echo the secret itself: cell outputs are stored inside the notebook
# file and are easily committed or shared. Report only whether a key was found.
print("api key loaded:", bool(api_key))

In [6]:
# Build the OpenAI LLM wrapper used by the QA chain later in the notebook,
# with a sampling temperature of 0.9 and a 500-token limit per completion.
llm = OpenAI(
    temperature=0.9,
    max_tokens=500,
)

## (1) Load data

In [None]:
# Source articles (Moneycontrol) whose contents are loaded below.
article_urls = [
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
]

# Hand the URLs to UnstructuredURLLoader and load them; the cell ends with
# `data` so the loaded result is displayed.
loaders = UnstructuredURLLoader(urls=article_urls)
data = loaders.load()
data

## (2) Split data to create chunks

```RecursiveCharacterTextSplitter()```

This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

In [None]:
# Configure the splitter: chunks of at most 1000 characters, with 200
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# `data` already holds documents, so split_documents (rather than split_text)
# can be applied to it directly to obtain the chunks.
docs = text_splitter.split_documents(data)

print(docs)

In [None]:
# Number of chunks produced by the splitter.
len(docs)

In [None]:
# Inspect a single chunk. The hard-coded index 9 assumes the split produced
# at least 10 chunks; it raises IndexError otherwise.
docs[9]

## (3) Create embeddings for these chunks and save them to FAISS index

#### OpenAI’s text embeddings measure the relatedness of text strings. Embeddings are commonly used for:
* Search (where results are ranked by relevance to a query string)
* Clustering (where text strings are grouped by similarity)
* Recommendations (where items with related text strings are recommended)
* Anomaly detection (where outliers with little relatedness are identified)
* Diversity measurement (where similarity distributions are analyzed)
* Classification (where text strings are classified by their most similar label)

In [None]:
# Embedding model used to vectorise the chunks (constructed with defaults).
embeddings = OpenAIEmbeddings()

# Build a FAISS vector index from the chunked documents and that embedding model.
vectorindex_openai = FAISS.from_documents(
    docs,
    embeddings,
)

In [None]:
# Reuse the index built in the previous cell instead of calling
# FAISS.from_documents(docs, embeddings) a second time: rebuilding would
# re-embed every chunk through the OpenAI API (extra latency and cost) only
# to produce an equivalent index.
db = vectorindex_openai
db

In [None]:
# Persist the FAISS index to local storage under the relative folder
# "faiss_index" so it can be reloaded later without rebuilding it.
db.save_local("faiss_index")

In [None]:
# Show the Python type of the object returned by FAISS.from_documents.
type(vectorindex_openai)

In [None]:
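# Disabled alternative: store the created vector index locally with pickle.
# The notebook persists the index with `db.save_local("faiss_index")` instead.
# file_path = "vector_index.pkl"

# with open(file_path, "wb") as f:
#   pickle.dump(vectorindex_openai, f)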

In [None]:
# Load the faiss index from local storage, using the same embedding model
# that was used to build it.
# NOTE(review): newer LangChain releases reportedly require
# `allow_dangerous_deserialization=True` here — confirm against the installed
# version before upgrading.
new_db = FAISS.load_local("faiss_index", embeddings)

# Sanity-check the reloaded index with a question the indexed articles can
# actually answer. The previous query (about Ketanji Brown Jackson) had
# nothing to do with the two Moneycontrol articles in the index.
# NOTE: this rebinds `docs` from the full chunk list to the search results;
# re-run the splitting cell before rebuilding the index.
docs = new_db.similarity_search("What is the price of Tiago iCNG?")

In [None]:
# Text of the first result returned by the similarity search above.
docs[0].page_content

## (4) Retrieve similar embeddings for a given question and call the LLM to retrieve the final answer

`RetrievalQAWithSourcesChain`

* Question-answering with sources over an index.
* Create a new model by parsing and validating input data from keyword arguments.
* Raises ValidationError if the input data cannot be parsed to form a valid model.

In [None]:
# Wire the LLM to a retriever backed by the reloaded FAISS index, producing a
# question-answering chain that also reports its sources.
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=new_db.as_retriever(),
)
chain



In [None]:
# Enable LangChain's global debug flag so the chain's intermediate steps are
# traced in the cell output.
langchain.debug = True

# Ask the question; return_only_outputs=True limits the result to the chain's
# outputs (the answer and its sources).
query = "What is the price of Tiago iCNG?"
chain({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQAWithSourcesChain > 3:chain:MapReduceDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nTags:\n\n#Business\n\n#Companies\n\nfi

{'answer': ' The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.\n',
 'sources': 'https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html'}