# Fact Checking Data Generation

This notebook generates synthetic data for building a fact-checking dataset based on the Maldita dataset.


## Requirements

This notebook requires the dependencies listed in `requirements_minimal.txt`. Install them with `pip install -r requirements_minimal.txt`.

In [35]:
import asyncio
import json
import os
from typing import Dict, List

from pydantic import BaseModel
from tqdm.auto import tqdm

from factchecking_api import (
    answer_questions,
    generate_article,
    generate_metadata,
    generate_questions_and_searches,
)
from src.api_calls.api_manager import ApiManager
from src.citation.cication_manager import CitationManager
from src.pipelines.rag import RAG

# API KEYS

In this notebook we will use OpenAI GPT-4o as the LLM and Serper as the Google search results provider. You should set your own API keys.

- Get the OpenAI API KEY here: https://platform.openai.com/
- Get the Serper API Key here (2500 free searches): https://serper.dev/api-key

The API also supports the following LLMs:
- `gpt3`, `gpt4`, `gpt4o`, `gpt4omini` from OpenAI
- `haiku`, `sonnet`, `opus` (Claude models) from Anthropic
- `gemini-flash`, `gemini-flash-thinking`, `gemini-pro` from Google
- `llama3-8b`, `llama3-70B`, `llama3-405b` via Groq
- `deepseek` from Deepseek-Chat

It supports the following embedding models
- `openai_embeddings_small`, `openai_embeddings_large` from OpenAI
- `cohere_embeddings`, `cohere_embeddings_light`, `cohere_rerank_english`, `cohere_rerank_multilingual` from Cohere
- `NV-Embed`, `E5-Multilingual-Large`, `E5-Mistral` using the Sentence Transformers library (loads the model in a GPU)

It supports the following web search results providers:
- `serper` Google search results, cheap and fast: https://serper.dev/
- `serpapi` same as serper but more expensive: https://serpapi.com
- `you` another search engine https://you.com
- `local_google` searches Google locally; it is free, but Google will eventually detect that you are a robot and refuse your requests
- `newsapi` only news https://newsapi.org

If you want to implement new models, modify the code, or fix anything, there is a `.py` file for each API in `src/api_calls/*.py`. If you want to check that the APIs work correctly in your environment, you can run the tests in `src/tests/test_*.py`. Before running the tests, remember to export the API keys as environment variables, e.g. `export OPENAI_API_KEY=""`.

In [13]:
# Fill in your keys below. A value left empty keeps whatever is already
# exported in your environment, so running this cell never replaces a real key
# with an empty string. Every variable still ends up defined (possibly empty).
# Do not commit the notebook with real keys filled in.
api_keys = {
    # These API KEYS are REQUIRED to run the notebook
    "OPENAI_API_KEY": "",
    "SERPER_API_KEY": "",
    # These API KEYS are NOT REQUIRED to run the notebook, set them if you want to experiment with other models
    "SerpAPI_KEY": "",  # https://serpapi.com/dashboard
    "YOU_API_KEY": "",  # https://api.you.com
    "ANTHROPIC_API_KEY": "",  # https://console.anthropic.com/settings/keys
    "COHERE_API_KEY": "",  # https://dashboard.cohere.com/api-keys
    "GROQ_API_KEY": "",  # https://console.groq.com/keys
    "NEWS_API_KEY": "",  # https://newsapi.org/docs/get-started#search
    "REPLICATE_API_TOKEN": "",  # https://replicate.com/account/api-tokens
    "GEMINI_API_KEY": "",  # https://aistudio.google.com/app/apikey
    "DEEPSEEK_API_KEY": "",  # https://platform.deepseek.com/api_keys
}

for key_name, key_value in api_keys.items():
    # A non-empty value typed above wins; otherwise keep the exported value.
    os.environ[key_name] = key_value or os.environ.get(key_name, "")

am = ApiManager()

# Get all the unique claims

By running this code we will get all the unique claims in `CONSULTA_PV-Validated-title-edited.json` by `id`

In [None]:
with open("maldita_dataset/CONSULTA_PV-Validated-title-edited.json", "r", encoding="utf8") as f:
    data = json.load(f)

# Keep only the entries flagged as claims

print(f"Total elements: {len(data)}")
data = [x for x in data if x["claim"] is True]
print(f"Total elements that are claims: {len(data)}")

# Remove duplicates by id, keeping the first occurrence of each id

ids = set()
unique_data = []
for entry in data:
    if entry["id"] not in ids:
        ids.add(entry["id"])
        unique_data.append(entry)
data = unique_data
print(f"Total elements that are claims and are unique: {len(data)}")

# Write to file

with open("maldita_dataset/unique_claims.json", "w", encoding="utf8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

# Get all the unique articles

By running this code we will get all the unique articles in `CONSULTA_PV-Validated-title-edited.json`, deduplicated by `article_id`.

In [None]:
with open("maldita_dataset/CONSULTA_PV-Validated-title-edited.json", "r", encoding="utf8") as f:
    data = json.load(f)

# Keep only the entries flagged as claims

print(f"Total elements: {len(data)}")
data = [x for x in data if x["claim"] is True]
print(f"Total elements that are claims: {len(data)}")

# Remove duplicates by article_id, keeping the first entry of each article

ids = set()
unique_data = []
for entry in data:
    if entry["article_id"] not in ids:
        ids.add(entry["article_id"])
        unique_data.append(entry)
data = unique_data
print(f"Total articles that are claims and are unique: {len(data)}")

# Write to file

with open("maldita_dataset/unique_articles.json", "w", encoding="utf8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

# Get questions and searches based on claim

For every unique claim in `unique_claims.json` we will generate a set of `critical_questions` and `web_searches` that will allow us to verify the claim. We will generate them based on the `claim` field in the dataset. The prompt we will use is defined in `src/prompts/factchecking.py` in `questions_and_searches_prompt`.

The process will output the current API cost, the estimated total API cost and the estimated remaining time. If the process is stopped, you can run the cells again and it will resume from the last item that was processed. The process will generate the file `unique_claims_qs.json`.

We use `GPT4o`, but you can experiment with any of the other models mentioned above.

[MANUAL LABOUR]: Validate the Questions and Searches. 

In [None]:
with open("maldita_dataset/unique_claims.json", "r", encoding="utf8") as f:
    data = json.load(f)

# Load the annotations produced by a previous (possibly interrupted) run
if os.path.exists("maldita_dataset/unique_claims_qs.json"):
    with open("maldita_dataset/unique_claims_qs.json", "r", encoding="utf8") as f:
        annotated_data = json.load(f)
else:
    annotated_data = {}

print(f"Total elements: {len(data)}")
print(f"Already annotated elements: {len(annotated_data)}")
# Resume by claim id rather than by position, so the result stays correct even
# if the order of `unique_claims.json` changes between runs. JSON object keys
# are always strings once reloaded, hence the str() on the id.
data = [x for x in data if str(x["id"]) not in annotated_data]
print(f"Elements to annotate: {len(data)}")

In [None]:
# Peek at the first pending claim; shows nothing when every claim is already annotated
data[0] if data else None

In [None]:
# Generate web searches and critical questions for every pending claim.
# The accumulated annotations are written to disk after each claim so that an
# interrupted run can be resumed.
am.reset_cost()
output_path = "maldita_dataset/unique_claims_qs.json"
with tqdm(data, desc="Total cost: $0.0. Estimated cost: $0.0", total=len(data)) as pbar:
    for i, fc in enumerate(data):
        claim_id = fc["id"]
        claim = fc["new_title"]
        tqdm.write(f"\nClaim: {claim}, ID: {claim_id}")

        searches, questions = generate_questions_and_searches(
            am=am, fact_checking_topic=claim, model="gpt4o", language="es"
        )
        annotated_data[claim_id] = dict(
            claim=claim, searches=searches, questions=questions
        )

        with open(output_path, "w", encoding="utf8") as f:
            json.dump(annotated_data, f, indent=4, ensure_ascii=False)

        # Extrapolate the final cost from the average cost per processed claim
        total_cost = am.total_cost
        estimated_cost = (total_cost / (i + 1)) * len(data)
        progress_label = (
            f"Total cost: ${total_cost:.2f}. Estimated cost: ${estimated_cost:.2f}"
        )
        pbar.set_description(progress_label)
        pbar.update(1)

# Get questions and evidences from text

*We decided that we will not use this data, so you can skip this*

For every unique article, these cells extract the questions that the article answers and the evidence provided in the article. We will use the prompt defined in the cell below. The process will generate the file `unique_articles_eq.json`.

In [19]:
def get_prompt_ft(fact_checking_article, claim):
    """Build the prompt that asks the model for the evidences and critic questions of an article.

    Args:
        fact_checking_article: Full text of the fact-checking article.
        claim: The claim the article is about.

    Returns:
        The prompt as a string with surrounding whitespace stripped. It asks for
        a JSON answer with the keys "evidences" and "questions", in Spanish.
    """
    return f"""
You will be analyzing a fact-checking article to extract two key elements: evidences and critic questions. 

The article is about the following claim: "{claim}".
The article is the following:

{fact_checking_article}

---

First retrieve a list of main evidences from the article. Evidences can support, refute or be neutral to the claim. All the evidences should be extracted from the article. Evidences should be short and concise. Generate as many evidences as you can find in the article, ensure that all the evidences are extracted from the article, do not generate evidences that are not in the article. Evidences should be self-contained and understandable without the need to read the article.

Second, retrieve the critic questions that the article uses to challenge the claim. Critic questions are questions that can be asked to challenge the claim. These questions should be critical and aimed at uncovering various aspects of the claim. You should read the article and infer which critical questions the author uses to challenge the claim. These questions might not be explicitly stated in the article, but you should infer them from the content of the article. The questions must be self-contained and understandable without the need to read the article. Questions should be neutral and not biased towards the information already presented in the article. You should generate the questions that the author made to himself before writing the article.

Present your response in the following JSON format. The JSON should include two main keys: "evidences" and "questions". Both "evidences" and "questions" should be arrays containing the items you generated in each step. 


Your answer should be in Spanish.

""".strip()

In [None]:
with open("maldita_dataset/unique_articles.json", "r", encoding="utf8") as f:
    data = json.load(f)

# Load the annotations produced by a previous (possibly interrupted) run
if os.path.exists("maldita_dataset/unique_articles_eq.json"):
    with open("maldita_dataset/unique_articles_eq.json", "r", encoding="utf8") as f:
        annotated_data = json.load(f)
else:
    annotated_data = {}

print(f"Total elements: {len(data)}")
print(f"Already annotated elements: {len(annotated_data)}")
# Resume by article id rather than by position, so the result stays correct
# even if the order of `unique_articles.json` changes between runs. JSON object
# keys are always strings once reloaded, hence the str() on the id.
data = [x for x in data if str(x["article_id"]) not in annotated_data]
print(f"Elements to annotate: {len(data)}")

In [22]:
# Schema handed to `am.structured_completion` (as `pydanctic_model`) by `retrieve`.
# Documented with comments instead of a docstring so the class definition
# itself stays untouched.
class Retrieve(BaseModel):
    # Evidences extracted from the article; read back as `search_dict["evidences"]`
    evidences: List[str]
    # Critical questions inferred from the article; read back as `search_dict["questions"]`
    questions: List[str]


def retrieve(
    am: ApiManager,
    fact_checking_article: str,
    claim: str,
    model: str = "gpt4o",
):
    """Extract the evidences and critical questions of a fact-checking article.

    The prompt is built with `get_prompt_ft` and sent through
    `am.structured_completion` using the `Retrieve` schema.

    Args:
        am: API manager used to run the completion.
        fact_checking_article: Text of the fact-checking article.
        claim: Claim the article is about.
        model: Name of the LLM to use.

    Returns:
        A tuple `(evidences, questions)` taken from the completion result.
    """
    completion = am.structured_completion(
        model=model,
        prompt=get_prompt_ft(fact_checking_article=fact_checking_article, claim=claim),
        max_tokens=4096,
        pydanctic_model=Retrieve,
    )
    return completion["evidences"], completion["questions"]

In [None]:
# Extract evidences and critical questions from every pending article.
# The accumulated annotations are written to disk after each article so that an
# interrupted run can be resumed.
am.reset_cost()
output_path = "maldita_dataset/unique_articles_eq.json"
with tqdm(data, desc="Total cost: $0.0. Estimated cost: $0.0", total=len(data)) as pbar:
    for i, fc in enumerate(data):
        article_id = fc["article_id"]
        article = fc["content"]
        claim = fc["title"]

        evidences, questions = retrieve(
            am=am, fact_checking_article=article, claim=claim, model="gpt4o"
        )
        annotated_data[article_id] = dict(
            claim=claim, article=article, evidences=evidences, questions=questions
        )

        with open(output_path, "w", encoding="utf8") as f:
            json.dump(annotated_data, f, indent=4, ensure_ascii=False)

        # Extrapolate the final cost from the average cost per processed article
        total_cost = am.total_cost
        estimated_cost = (total_cost / (i + 1)) * len(data)
        progress_label = (
            f"Total cost: ${total_cost:.2f}. Estimated cost: ${estimated_cost:.2f}"
        )
        pbar.set_description(progress_label)
        pbar.update(1)

# Retrieve context

Run the web searches and the RAG pipeline. For every `claim` we have a set of `questions` and `searches`. We will:
- Retrieve the top 5 Google search results for every `search`
- Split the webpages by `\n` characters to generate chunks
- Generate an embedding for each `question` and `chunk`
- Rank the `chunks` by their similarity to the `questions`.

As before, you can stop the process and resume it later. This process will generate the `unique_claims_qs_context.json` file. In this file, for each `question` we will store all the `chunks` ranked from most similar to least similar. Later you should decide how many chunks you will use and keep the top_k ones.

[MANUAL LABOUR] Validate a set of top_k chunks. 


In [None]:
# Load every claim with its questions/searches, plus any context already
# retrieved by a previous run, and keep only the claims still missing context.
with open("maldita_dataset/unique_claims_qs.json", "r", encoding="utf8") as f:
    data_all = json.load(f)

context_path = "maldita_dataset/unique_claims_qs_context.json"
annotated_data = {}
if os.path.exists(context_path):
    with open(context_path, "r", encoding="utf8") as f:
        annotated_data = json.load(f)

all_keys = data_all.keys()
annotated_keys = annotated_data.keys()
remaining_keys = set(all_keys) - set(annotated_keys)

# Claims that still need their context retrieved
data = {}
for k in remaining_keys:
    data[k] = data_all[k]

print(f"Total elements: {len(data_all)}")
print(f"Already annotated elements: {len(annotated_data)}")
print(f"Elements to annotate: {len(data)}")

In [24]:
def retrieve_relevant_context(
    am: ApiManager,
    searches: List[str],
    questions: List[str],
    top_k_websites: int,
    embedding_model: str,
    web_search_provider: str,
    chunk_size: int,
    language: str,
    location: str = None,
):
    """Run the web searches and rank the retrieved chunks against the questions.

    Args:
        am: API manager handed to the `RAG` pipeline.
        searches: Web search queries to run.
        questions: Questions the retrieved chunks are ranked against.
        top_k_websites: How many websites to retrieve for each search.
        embedding_model: Name of the embedding model used for ranking.
        web_search_provider: Name of the web search results provider.
        chunk_size: Minimum chunk size for the RAG model. Chunks are generated
            by splitting the websites by newlines, but text is not split until
            a chunk reaches at least this size.
        language: Language of the search, "es" for Spanish.
        location: Location of the search, "es" for retrieving results as if the
            user was in Spain. May be None.

    Returns:
        Whatever `RAG.generate_w_ranks` returns for the given questions.
    """
    rag = RAG(api_manager=am)

    rag.search_questions(
        searches,
        top_k_websites,
        web_search_provider,
        language,
        location,
    )

    # NOTE(review): these two values are passed positionally and their meaning
    # is defined by `RAG.generate_w_ranks`; presumably a batch size and a
    # maximum length -- TODO confirm against src/pipelines/rag.py.
    third_positional_arg = 16
    fifth_positional_arg = 512

    return rag.generate_w_ranks(
        embedding_model,
        chunk_size,
        third_positional_arg,
        questions,
        fifth_positional_arg,
    )

In [None]:
# Retrieve and rank web context for every pending claim.
# The accumulated results are written to disk after each claim so that an
# interrupted run can be resumed.
am.reset_cost()
output_path = "maldita_dataset/unique_claims_qs_context.json"
with tqdm(data, desc="Total cost: $0.0. Estimated cost: $0.0", total=len(data)) as pbar:
    for i, (key, elem) in enumerate(data.items()):
        claim = elem["claim"]
        searches = elem["searches"]
        questions = elem["questions"]

        retrieval_settings = dict(
            top_k_websites=5,
            embedding_model="openai_embeddings_large",
            web_search_provider="serper",
            chunk_size=128,
            language="es",
            location="es",
        )
        context = retrieve_relevant_context(
            am=am, searches=searches, questions=questions, **retrieval_settings
        )

        elem["context"] = context
        annotated_data[key] = elem

        with open(output_path, "w", encoding="utf8") as f:
            json.dump(annotated_data, f, indent=4, ensure_ascii=False)

        # Extrapolate the final cost from the average cost per processed claim
        total_cost = am.total_cost
        estimated_cost = (total_cost / (i + 1)) * len(data)
        progress_label = (
            f"Total cost: ${total_cost:.2f}. Estimated cost: ${estimated_cost:.2f}"
        )
        pbar.set_description(progress_label)
        pbar.update(1)

# Answer questions and get article

Generate synthetic answers for each question based on the retrieved context, and generate a fact-checking article with three paragraphs ("supporting evidence", "counter evidence" and "summary") based on all the retrieved context.

The model will generate citations for every claim.



In [None]:
with open("maldita_dataset/unique_claims_qs_context.json", "r", encoding="utf8") as f:
    data_all = json.load(f)

# Load the synthetic fact-checks produced by a previous (possibly interrupted) run
if os.path.exists("maldita_dataset/unique_claims_qs_context_synthetic.json"):
    with open("maldita_dataset/unique_claims_qs_context_synthetic.json", "r", encoding="utf8") as f:
        annotated_data = json.load(f)
else:
    annotated_data = {}

# Get remaining data. The full set is kept in `data_all` so that the total
# printed below is the real total and not the number of remaining elements.
# Iterating `data_all` also keeps the remaining claims in file order.
data = {k: v for k, v in data_all.items() if k not in annotated_data}

print(f"Total elements: {len(data_all)}")
print(f"Already annotated elements: {len(annotated_data)}")
print(f"Elements to annotate: {len(data)}")

In [38]:
def find_text(url, rag_results):
    """Return the text of the first chunk whose metadata URL equals `url`.

    Args:
        url: URL to look for, compared with each chunk's `["metadata"]["url"]`.
        rag_results: Mapping from question to a list of chunk dictionaries.

    Returns:
        The `"text"` of the first matching chunk, or "" when nothing matches.
    """
    matching_texts = (
        chunk["text"]
        for question in rag_results
        for chunk in rag_results[question]
        if chunk["metadata"]["url"] == url
    )
    return next(matching_texts, "")

def run_async(func, *args, **kwargs):
    """Schedule `func(*args, **kwargs)` to run in a worker thread and return its task.

    Must be called while an event loop is running, since it creates a task.
    """
    threaded_call = asyncio.to_thread(func, *args, **kwargs)
    return asyncio.create_task(threaded_call)


def _strip_citation_suffix(url):
    """Remove a trailing "?<digits>" uniqueness marker from `url`, if present."""
    base, separator, suffix = url.rpartition("?")
    if separator and suffix.isdigit():
        return base
    return url


def generate_factcheking_response(
    cmg: CitationManager,
    fact_checking: str,
    qa_results: List[str],
    metadata: Dict[str, str],
    rag_results: Dict[str, List[Dict[str, str]]],
):
    """Assemble the final fact-checking response with renumbered citations.

    Args:
        cmg: Citation manager used to reorder the citations and look up their
            metadata.
        fact_checking: Generated fact-checking article.
        qa_results: Sequence of `(question, answer)` pairs.
        metadata: Metadata generated for the fact-check; stored as-is.
        rag_results: Mapping from question to the chunks used as context; used
            to recover the text behind each cited URL.

    Returns:
        A dictionary with the keys "automated_factchecking", "sources",
        "questions_answers" and "metadata".
    """
    # The article and all the answers are reordered together so that citation
    # numbers are shared across them.
    fixed_texts, citations = cmg.reorder_citations(
        [fact_checking] + [answer for _, answer in qa_results]
    )

    fact_checking = fixed_texts[0]
    # fixed_texts[0] is the article, so answer i lives at position i + 1
    qa_results = {
        question: fixed_texts[i + 1] for i, (question, _) in enumerate(qa_results)
    }

    citations_dict = {
        n: {
            # Only the trailing "?<uniqueid>" marker is removed. The previous
            # expression dropped every "?" and a single trailing character,
            # which broke URLs with a query string and ids of two or more digits.
            "url": _strip_citation_suffix(x.url),
            "source": x.source,
            "favicon": x.favicon,
            # The lookup needs the URL including its marker, which is what
            # identifies the individual chunk.
            "text": find_text(x.url, rag_results),
        }
        for n, x in zip(citations, cmg.get_metadata(citations))
    }

    response = {
        "automated_factchecking": fact_checking,
        "sources": citations_dict,
        "questions_answers": qa_results,
        "metadata": metadata,
    }

    return response


async def generate_data(
    am: ApiManager,
    rag_results,
    claim: str = None,
):
    """Generate the synthetic answers, article and metadata for one claim.

    Note that `rag_results` is modified in place: every chunk URL gets a
    "?<uniqueid>" marker appended so that each chunk has a distinct URL.

    Args:
        am: API manager used for every model call.
        rag_results: Mapping from question to the list of chunks used as
            context. Each chunk has a "metadata" dictionary with a "url".
        claim: Claim being fact-checked. When omitted, the notebook-level
            variable `claim` is used, which keeps existing calls working.

    Returns:
        The response dictionary built by `generate_factcheking_response`.

    Raises:
        NameError: If `claim` is omitted and no notebook-level `claim` exists.
    """
    if claim is None:
        # Backward compatibility: older calls relied on the global `claim`.
        try:
            claim = globals()["claim"]
        except KeyError:
            raise NameError("name 'claim' is not defined") from None

    # Make every chunk URL unique by appending a running counter
    uniqueid = 0
    for snippets in rag_results.values():
        for snippet in snippets:
            snippet["metadata"]["url"] = snippet["metadata"]["url"] + f"?{uniqueid}"
            uniqueid += 1

    metadatas = [
        snippet["metadata"] for snippets in rag_results.values() for snippet in snippets
    ]

    cmg = CitationManager(metadatas=metadatas)

    # Both generations are started before either is awaited, so they run
    # concurrently in worker threads.
    qa_task = run_async(answer_questions, am, cmg, rag_results, "es", "gpt4omini")

    article_task = run_async(
        generate_article,
        am,
        cmg,
        rag_results,
        "es",
        claim,
        "gpt4o",
        None,
    )

    qa_results = await qa_task
    fact_checking = await article_task

    metadata = generate_metadata(
        am,
        claim,
        fact_checking,
        "gpt4omini",
        "es",
        "es",
    )

    response = generate_factcheking_response(
        cmg=cmg,
        fact_checking=fact_checking,
        qa_results=qa_results,
        metadata=metadata,
        rag_results=rag_results,
    )
    return response

In [None]:
from copy import deepcopy

top_k_chunks = 3

am.reset_cost()
with tqdm(data, desc="Total cost: $0.0. Estimated cost: $0.0", total=len(data)) as pbar:
    for i, (key, elem) in enumerate(data.items()):
        cmg = CitationManager()
        elem_tmp = deepcopy(elem)
        claim = elem_tmp["claim"]
        questions = elem_tmp["questions"]
        rag_results = elem_tmp["context"]

        for q in rag_results:
            rag_results[q] = rag_results[q][:top_k_chunks]

        response = await generate_data(am, rag_results)

        elem["synthetic_factchecking"] = response
        annotated_data[key] = elem

        with open("maldita_dataset/unique_claims_qs_context_synthetic.json", "w", encoding="utf8") as f:
            json.dump(annotated_data, f, indent=4, ensure_ascii=False)

        total_cost = am.total_cost
        estimated_cost = total_cost / (i + 1) * len(data)
        pbar.set_description(
            f"Total cost: ${total_cost:.2f}. Estimated cost: ${estimated_cost:.2f}"
        )
        pbar.update(1)