In [20]:
from qdrant_client import QdrantClient
import os 
from dotenv import load_dotenv

load_dotenv()
QDRANT_CLOUD_URL = os.environ.get("QDRANT_CLOUD_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

In [2]:
## Qdrant Connection
URL_DATABASE="http://localhost:6333"
COLLECTION_NAME ="mpox_collection"

In [3]:
#Create a client
client = QdrantClient(url=URL_DATABASE)

In [43]:
# create a collection
from qdrant_client.models import Distance, VectorParams

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)


True

In [4]:
## Get text and chunk from document
from langchain_community.document_loaders import PyPDFLoader

PATH_DOCUMENT = "Mpox_information.pdf"


loader = PyPDFLoader(PATH_DOCUMENT)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [5]:
## Split pages in small chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 150,
    chunk_overlap  = 20,
    length_function = len,
)
docs = text_splitter.split_documents(pages)


In [6]:
docs[0:5]

[Document(metadata={'source': 'Mpox_information.pdf', 'page': 0}, page_content='Mpox\n26 August 2024ا中⽂ Français\nРусский Español\nKey facts \xa0\nMpox, previously known as monkeypox, is a viral illness caused by the'),
 Document(metadata={'source': 'Mpox_information.pdf', 'page': 0}, page_content='monkeypox virus, a species of the genus Orthopoxvirus . There are two'),
 Document(metadata={'source': 'Mpox_information.pdf', 'page': 0}, page_content='distinct clades of the virus: clade I (with subclades Ia and Ib) and clade II (with'),
 Document(metadata={'source': 'Mpox_information.pdf', 'page': 0}, page_content='subclades IIa and IIb). In 2022–2023 a global outbreak of mpox was caused by\nthe clade IIb strain.'),
 Document(metadata={'source': 'Mpox_information.pdf', 'page': 0}, page_content='Mpox continues to be a threat today , and an upsurge of cases in the\nDemocratic Republic of the Congo and other countries caused by clades Ia')]

In [10]:
## save this documents in Qdrant

from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

## embedding model from openai
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
from langchain_qdrant import RetrievalMode

# load data to collection
vector_store_qdrant = QdrantVectorStore.from_documents(
    docs,
    embeddings,
    url=URL_DATABASE,
    prefer_grpc=False,
    collection_name=COLLECTION_NAME,
    retrieval_mode=RetrievalMode.DENSE
)
print("DOCUMENTS DATA SAVED SUCCESFULLY")

DOCUMENTS DATA SAVED SUCCESFULLY


In [23]:
#QDRANT CLOUD
doc_store = QdrantVectorStore.from_documents(
    docs, embeddings, 
    url=QDRANT_CLOUD_URL,
    api_key=QDRANT_API_KEY,
    collection_name="mpox_collection"
)

In [30]:
context = doc_store.similarity_search(query="What mpox is transmited?", k=5,)
context

[Document(metadata={'source': 'Mpox_information.pdf', 'page': 1, '_id': 'b1412ff9-a7e5-49aa-8b70-aa1be02021c8', '_collection_name': 'mpox_collection'}, page_content='Mpox can be transmitted through close contact with someone who has mpox,'),
 Document(metadata={'source': 'Mpox_information.pdf', 'page': 8, '_id': '462a07df-6ec5-4aca-9b2b-103932647730', '_collection_name': 'mpox_collection'}, page_content='transmission can reduce the risk of getting mpox. Those who have had contact'),
 Document(metadata={'source': 'Mpox_information.pdf', 'page': 2, '_id': 'e0ff17d8-1c04-4220-86bc-ba391d91fb28', '_collection_name': 'mpox_collection'}, page_content='Transmission\xa0\nMpox spreads from person to person mainly through close contact with someone'),
 Document(metadata={'source': 'Mpox_information.pdf', 'page': 5, '_id': 'bdaf9088-797e-455d-b043-db10250e715e', '_collection_name': 'mpox_collection'}, page_content='https://www.who.int/news-room/fact-sheets/detail/mpox#:~:text=Common symptoms of m

In [31]:
## test new information
new_information="""
Title of the page: Esquema das joias movimentou R$ 6,8 mi; Bolsonaro recebeu dinheiro vivo, diz PF
Ex-presidente Jair Bolsonaro é indiciado pela PF no caso das joias
Moraes retira sigilo do inquérito das joias e manda PGR se manifestar
DÃª a sua opiniÃ£o! O Correio tem um espaÃ§o na ediÃ§Ã£o impressa para publicar a opiniÃ£o dos leitores pelo e-mail sredat.df@dabr.com.br
RepÃ³rter de polÃ­tica, setorista do Supremo Tribunal Federal (STF). Vencedor do PrÃªmio CNT de Jornalismo, possuÃ­ passagens pelo SBT, Record/R7 e estÃ¡ entre os jornalistas mais influentes do Twitter no Brasil.
Formada em Jornalismo pela UFSM, no RS, onde pesquisou sobre Jornalismo Internacional, focado em AmÃ©rica Latina. JÃ¡ trabalhou com Cultura, Entretenimento, Redes Sociais, GÃªnero e Minorias Sociais.
"""

In [33]:
fact_information = " ".join([i.page_content for i in context])
fact_information

'Mpox can be transmitted through close contact with someone who has mpox, transmission can reduce the risk of getting mpox. Those who have had contact Transmission\xa0\nMpox spreads from person to person mainly through close contact with someone https://www.who.int/news-room/fact-sheets/detail/mpox#:~:text=Common symptoms of mpox are,materials%2C or with infected animals. 6/11 https://www.who.int/news-room/fact-sheets/detail/mpox#:~:text=Common symptoms of mpox are,materials%2C or with infected animals. 5/11'

In [51]:
##setting the parser
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI


class NewChecker(BaseModel):
    info: str = Field(description="a score between 0 and 1, where 1 means the news is completely true and closer to 0 means it is false")

parser_new = PydanticOutputParser(pydantic_object=NewChecker)


In [63]:
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field, model_validator


model = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.0)


# Define your desired data structure.
class NewsChecker(BaseModel):
    score: str = Field(description="a score between 0 and 1, where 1 means the news is completely true and closer to 0 means it is false")
    justification: str = Field(description="a justification about the score", default="Ninguna")

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=NewsChecker)

template = """
Eres un asistente experto en verificación de información. Te voy a proporcionar un conjunto de hechos verificados que se consideran verdaderos. Luego te proporcionaré una noticia y tu tarea será comparar los hechos de la noticia con la información verdadera proporcionada, y determinar cuán verídica es la noticia en una escala de 0 a 1, donde 0 significa que la noticia es completamente falsa y 1 significa que es completamente verdadera. Proporciona tu evaluación explicando brevemente las razones de tu puntuación, y justifica por qué asignaste ese número de veracidad.

### Información verificada:
{fact_information}

### Noticia a evaluar:
{new_information}

### Resultado esperado:
- **Puntuación de veracidad (0 a 1):**
- **Razón de la puntuación:**
- **Justificación de la puntuación:**
{format_instructions}

"""

prompt = PromptTemplate(
    template=template,
    input_variables=["new_information","fact_information"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# And a query intended to prompt a language model to populate the data structure.
prompt_and_model = prompt | model
output = prompt_and_model.invoke({"new_information":new_information, "fact_information":fact_information})
result = parser.invoke(output)

In [66]:
result.justification

'La noticia menciona que Bolsonaro recibió dinero vivo en el caso de las joias, lo cual es cierto según la información verificada. Sin embargo, la noticia no proporciona suficiente evidencia para respaldar el hecho de que Bolsonaro fue indiciado por la PF o que Moraes retiró el sigilo del inquérito. Además, la noticia incluye información irrelevante sobre el autor y su formación, lo cual puede ser considerado como una táctica de distracción. Por lo tanto, la noticia tiene ciertos elementos de verdad, pero también contiene información inexacta y distracciones, lo que la hace poco confiable. Por lo tanto, se le asigna una puntuación de veracidad de 0.5.'