In [1]:
import os
import time 
import pickle
import langchain
import streamlit as st
from langchain.vectorstores import FAISS
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
llm = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    temperature=0.9
)

In [4]:
loader = UnstructuredURLLoader(urls=[
    "https://www.worldometers.info/coronavirus/",
    "https://en.wikipedia.org/wiki/Coronavirus",
    "https://my.clevelandclinic.org/health/diseases/21214-coronavirus-covid-19",
])

data = loader.load()
len(data)

3

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n","."," "],
    chunk_size=1000
)

docs = text_splitter.split_documents(data)
len(docs)

119

In [7]:
docs[88]

Document(metadata={'source': 'https://www.worldometers.info/coronavirus/'}, page_content='3,155 259,953 4,080 11 1,233 15 5,708,974 26,339 216,746,934 Africa 811 68,700 38 19 109 Zimbabwe 266,359 5,740 258,888 1,731 12 17,373 374 2,525,756 164,744 15,331,428 Africa 58 2,671 6 113 110 Uzbekistan 253,662 1,637 241,486 10,539 23 7,378 48 1,377,915 40,077 34,382,084 Asia 136 21,003 25 307 111 Afghanistan 234,174 7,996 211,080 15,098 5,746 196 1,390,730 34,125 40,754,388 Asia 174 5,097 29 370 112 Mozambique 233,731 2,250 228,805 2,676 11 7,064 68 1,371,127 41,437 33,089,461 Africa 142 14,706 24 81 113 Martinique 230,354 1,102 N/A N/A N/A N/A 615,777 2,946 828,928 2,215,870 374,087 North America 2 339 0 612,553 114 Laos 218,970 758 N/A N/A N/A 29,270 101 1,233,207 164,845 7,481,023 Asia 34 9,869 6 28,145 115 Iceland 209,906 229 N/A N/A N/A 607,731 663 1,996,384 5,780,036 345,393 Europe 2 1,508 0 387,941 116 Kyrgyzstan 206,897 2,991 196,406 7,500 131 30,750 445 1,907,195 283,460 6,728,271 Asi

In [9]:
embeddings = HuggingFaceEmbeddings()

vector_index = FAISS.from_documents(docs, embeddings)



In [10]:
embeddings.model_name

'sentence-transformers/all-mpnet-base-v2'

In [11]:
file_path = "./stored_vectors/vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vector_index, f)

In [12]:
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vector_index = pickle.load(f)

In [13]:
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vector_index.as_retriever())
chain



In [14]:
query = "What are the symptops of corona virus?"

langchain.debug = True

chain({"question":query}, return_only_outputs=True)

  chain({"question":query}, return_only_outputs=True)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What are the symptops of corona virus?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Locations:\n\nAbu Dhabi|Canada|Florida|London|Nevada|Ohio|\n\nGray gradient\n\nGray gradient\n\nCleveland Clinic logo\n\nHome/\n\nHealth Library/\n\nDiseases & Conditions/\n\nCOVID-19 (Coronavirus)\n\nAdvertisement\n\nAdvertisement\n\nCOVID-19 (Coronavirus)\n\nCOVID-19 is an illness caused by the SARS-CoV-2 virus. It spreads through respiratory droplets. Symptoms include fever, cough, shortness of breath, runny or stuffy nose, body aches and more. COVID can cause mild to severe illnes

{'answer': 'The symptoms of COVID-19 include fever, cough, shortness of breath, runny or stuffy nose, body aches, sore throat, headache, tiredness (fatigue), chills, loss of or altered sense of smell and taste, difficulty thinking and focusing (brain fog), and digestive symptoms such as diarrhea, nausea, and vomiting. Symptoms can range from mild and cold-like to severe and life-threatening. Some infected individuals may not show any symptoms but can still spread the virus.\n',
 'sources': 'https://my.clevelandclinic.org/health/diseases/21214-coronavirus-covid-19'}

In [15]:
query = "What is the total death from COVID 19?"

#langchain.debug = True

chain({"question":query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What is the total death from COVID 19?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Population\n\nCoronavirus\n\nNOTE: As of April 13, 2024, the Coronavirus Tracker is no longer being updated due to the unfeasibility of providing statistically valid global totals, as the majority of countries have now stopped reporting. However, historical data remain accessible. Worldometer delivered the most accurate and timely global statistics to users and institutions around the world at a time when this was extremely challenging. We thank everyone who participated in this extra

{'answer': 'The total number of deaths from COVID-19 is 7,010,681.\n',
 'sources': 'https://www.worldometers.info/coronavirus/'}