In [41]:
import streamlit as st
import langchain, os, openai, dotenv

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage

import datetime

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma

import chromadb
import dotenv, time

import os, glob
from pathlib import Path

import warnings
# Suppress every warning for the rest of the session to keep cell output readable.
warnings.filterwarnings(action='ignore')

# Record which openai client version produced the outputs below.
print(openai.__version__)



1.3.7


In [2]:
# Embedding client; the Azure deployment name is taken from the
# OPENAI_DEPLOYMENT_NAME_EMBED environment variable (None when unset).
embeddings = AzureOpenAIEmbeddings(deployment=os.environ.get("OPENAI_DEPLOYMENT_NAME_EMBED"))

In [3]:
# Directory in which the Chroma vector store is persisted.
base_path = "./vectorDB/"

In [4]:
from langchain.document_loaders import DataFrameLoader
import pandas as pd

In [5]:
# Load the source table and print its row count.
df = pd.read_parquet("TEIs.parquet.gzip")
print(len(df))

# Turn each row of the two selected columns into a LangChain document.
loader = DataFrameLoader(df[["article","text"]])
documents = loader.load()

# Split on newlines into pieces of roughly 2000 characters with 100 overlap.
# The "Created a chunk of size ..." messages in the output show that some
# pieces still exceed chunk_size (presumably single lines longer than 2000
# characters, which a newline-only separator cannot split).
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=2000,
    chunk_overlap=100,
)
chunked_documents = text_splitter.split_documents(documents)

Created a chunk of size 2014, which is longer than the specified 2000
Created a chunk of size 2610, which is longer than the specified 2000
Created a chunk of size 2120, which is longer than the specified 2000
Created a chunk of size 2502, which is longer than the specified 2000
Created a chunk of size 2064, which is longer than the specified 2000
Created a chunk of size 2219, which is longer than the specified 2000
Created a chunk of size 2257, which is longer than the specified 2000
Created a chunk of size 2709, which is longer than the specified 2000


327


Created a chunk of size 2264, which is longer than the specified 2000
Created a chunk of size 3219, which is longer than the specified 2000
Created a chunk of size 2281, which is longer than the specified 2000
Created a chunk of size 2432, which is longer than the specified 2000
Created a chunk of size 2108, which is longer than the specified 2000
Created a chunk of size 2304, which is longer than the specified 2000
Created a chunk of size 2349, which is longer than the specified 2000
Created a chunk of size 2075, which is longer than the specified 2000
Created a chunk of size 2133, which is longer than the specified 2000
Created a chunk of size 3492, which is longer than the specified 2000
Created a chunk of size 2172, which is longer than the specified 2000
Created a chunk of size 3917, which is longer than the specified 2000
Created a chunk of size 2098, which is longer than the specified 2000
Created a chunk of size 2342, which is longer than the specified 2000
Created a chunk of s

In [6]:
# Number of source documents, then number of chunks after splitting.
print(f"{len(documents)} {len(chunked_documents)}")

327 6706


In [19]:
# Flip the chunk order in place (same list object, so later cells see it).
# NOTE: not idempotent - every additional run of this cell flips the order back.
chunked_documents[:] = chunked_documents[::-1]



In [23]:

# Create the Chroma store on the first run, otherwise reopen it, then embed
# every chunk whose text is not stored yet. Both paths now go through the same
# loop, so a brand-new store is filled in a single run instead of holding only
# its first chunk until the cell is executed a second time.
db_file = os.path.join(base_path, "chroma.sqlite3")

if not os.path.isfile(db_file):
    print("Start a new DB")
    # Seed the store with the first chunk; the loop below adds the rest.
    vectordb = Chroma.from_documents(
        documents=[chunked_documents[0]],
        embedding=embeddings,
        persist_directory=base_path
    )
    vectordb.persist()
    LSDOCS = [chunked_documents[0].page_content]
else:
    print("Continue on the DB")
    vectordb = Chroma(persist_directory=base_path,embedding_function=embeddings)
    # Fetch the stored records once; each get() call reads the whole collection.
    stored = vectordb.get()
    print(len(stored["ids"]),"elements already stored.")
    LSDOCS = stored["documents"]

# A set gives constant-time membership tests and is updated as chunks are
# added, so a text repeated inside chunked_documents is embedded only once.
stored_texts = set(LSDOCS)

for doc in chunked_documents:
    # Skip any chunk whose text is already in the store.
    if doc.page_content in stored_texts:
        continue
    # The store already knows its embedding function and directory, so only
    # the document itself is passed.
    vectordb.add_documents(documents=[doc])
    stored_texts.add(doc.page_content)
    # Persist after every chunk so an interrupted run can resume where it stopped.
    vectordb.persist()
    # Throttle to stay under the tokens-per-minute limit: pause 0.2 s per chunk.
    time.sleep(0.2)


Continue on the DB
6643 elements already stored.


In [24]:
def create_agent_chain(llm):
    """Return a question-answering chain of type "stuff" built around `llm`."""
    return load_qa_chain(llm, chain_type="stuff")

In [44]:
def get_llm_response(query,vectordb,temperature=0.1,k=10):
    """Answer `query` from the `k` chunks of `vectordb` most similar to it.

    The retrieved chunks and the question are run through the chain returned
    by `create_agent_chain`, backed by an Azure chat model whose deployment
    name comes from the OPENAI_DEPLOYMENT_NAME environment variable. The
    question and answer are also written to a timestamped Markdown file
    under ``cache/``.

    Args:
        query: Question text; used both for retrieval and as the chain's question.
        vectordb: Vector store providing `similarity_search(query, k=...)`.
        temperature: Temperature passed to the chat model.
        k: Number of chunks to retrieve.

    Returns:
        A tuple `(answer, matching_docs)`: the chain's output and the
        retrieved chunks it was given.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    llm = AzureChatOpenAI(
        deployment_name=os.getenv("OPENAI_DEPLOYMENT_NAME"),
        temperature=temperature,
        openai_api_version="2023-05-15"
    )
    chain = create_agent_chain(llm)
    matching_docs = vectordb.similarity_search(query, k=k)
    answer = chain.run(input_documents=matching_docs, question=query)

    # Create the cache folder on demand: without it the write below raises
    # FileNotFoundError and the answer just obtained is lost.
    os.makedirs("cache", exist_ok=True)
    # Explicit UTF-8 so non-ASCII characters in the answer do not depend on
    # the platform's default encoding.
    with open("cache/"+timestamp+".md","w",encoding="utf-8") as f:
        f.write(">Q:\n"+query + "\n\n---\n\n>A:\n"+answer)
    return answer, matching_docs

In [45]:
# Ask about disease vectors in cities, retrieving 10 chunks at temperature 0.2.
answer, matching_docs = get_llm_response(
    "What are the different vectors of contagious diseases in cities? Be as detailed as possible.",
    vectordb,
    temperature=0.2,
    k=10,
)
print(answer)

The different vectors of contagious diseases in cities include:

1. Mosquitoes: Mosquitoes are the most common vectors of contagious diseases in urban areas. Species such as Aedes aegypti and Aedes albopictus are responsible for spreading diseases like dengue fever, Zika virus, chikungunya, and yellow fever. These mosquitoes breed in manmade containers like tires, pots, and water storage containers.

2. Triatomine bugs: Triatomine bugs, also known as kissing bugs, transmit Chagas disease. They are commonly found in urban areas, especially in houses with poor housing quality and land tenure security.

3. Ticks: Ticks are vectors for diseases like Lyme disease and tick-borne encephalitis. While they are more commonly associated with rural areas, ticks can also be found in urban parks and green spaces where they can come into contact with humans.

4. Sandflies: Sandflies transmit diseases such as leishmaniasis. They are usually found in areas with poor sanitation and crowded living condit

In [46]:
# Ask about pandemic mitigation strategies, retrieving 10 chunks at temperature 0.2.
answer, matching_docs = get_llm_response(
    "What are the main mitigation strategies against pandemics? Be as detailed as possible, and only answers with a bullet point list.",
    vectordb,
    temperature=0.2,
    k=10,
)
print(answer)

- Tracking and tracing transmissions
- Mass testing
- Social distancing
- Vaccine formulation
- Strengthening public health and hygiene
- Accumulating ample stocks
- Planning pathogen testing drills
- Mobilizing mass health workers
- Adopting advanced monitoring systems (e-healthcare analytics)
- Factors influencing PPE production, distribution, and usage
- Automated intensive care setup
- Patient handling protocols
- Social and healthcare sustainability protocols
- Circular healthcare business models
- Medical reverse logistics
- PPE reuse/disposal protocols
- Non-conventional semi-automated medical facilities
- Healthcare humanitarian logistics
- Sustainable medical supply chains
- Innovative governmental backup
- Frequent hand hygiene (using alcohol-based hand sanitizer or washing hands with soap and water)
- Practicing social distancing (maintaining a distance of at least 1 meter/3 feet from others)
- Staying at home unless necessary to go out
- Avoiding touching eyes, nose, and mo

In [51]:
# Ask about stakeholder groups, retrieving 10 chunks at temperature 0.2.
# NOTE(review): the question does not say "involved" in what, so retrieval
# may drift off-topic - consider naming the subject explicitly.
answer, matching_docs = get_llm_response(
    "What are the main stakeholders groups involved? Be as detailed as possible, and only answers with a bullet point list.",
    vectordb,
    temperature=0.2,
    k=10,
)
print(answer)

- Construction workers
- Engineers and managers
- Suppliers and subcontractors
- Public agencies
- Project team
- Construction company
- Owner
- Aid donors
- International aid networks
- Legislative and regulatory arms of the state
- Logistics organizations
- Private health care sectors
- Direct suppliers
- Media
- Social media
- Local aid networks
- Private insurance companies
- Military and para-military forces
- Government and inter-government organizations


In [52]:
# Ask about technologies used against pandemic risks, retrieving 10 chunks at temperature 0.2.
answer, matching_docs = get_llm_response(
    "What are the main types of technology used to mitigate pandemic risks? Be as detailed as possible, and only answers with a bullet point list.",
    vectordb,
    temperature=0.2,
    k=10,
)
print(answer)

The main types of technology used to mitigate pandemic risks include:

- Information communication technologies:
  - Media and social media platforms for effective communication and dissemination of information to the public.
  - Chatbots and apps for tracking infected individuals and providing real-time updates.
  - Database systems for accumulating and analyzing SARS-CoV-2 related research.

- Artificial Intelligence (AI) and Machine Learning (ML) technologies:
  - Prediction models for outbreak forecasting.
  - Diagnosis of diseases through scans and classification of medical data.
  - Handling and analysis of big data related to patients.
  - Assisting in treatments and developing personalized treatment plans.

- Supply chain technologies:
  - Optimization algorithms for efficient production and transportation of essential medical supplies.
  - Adaptation of production lines to produce necessary medical equipment and supplies.

- Work from home technologies:
  - Remote collaboratio