# Datasets

In [None]:
import os
import json
import pandas as pd
from google.colab import drive
drive.mount("/content/drive/")

# Root folders on the mounted Google Drive for input datasets and experiment outputs.
DATASET_DIR = os.path.join("/content/drive/MyDrive/TCC/workspace/datasets")
RESULTS_DIR =  os.path.join("/content/drive/MyDrive/TCC/workspace/experiments/results")

# Whitelist part 1: JSON file with the trusted news websites.
realNewsWebsites_path = os.path.join(DATASET_DIR, "realNewsWebsites.json") # 1368 sites
with open(realNewsWebsites_path) as file:
    realNewsWebsites = json.load(file)

# Whitelist part 2: plain-text file, one site per line.
realbrsites_path = f"{DATASET_DIR}/realbrsites.txt" #73 sites
with open(realbrsites_path) as file:
    # Strip the line terminator: web_search() tests `site in link`, and an entry
    # that still ends in "\n" can never be a substring of a URL. Blank lines are
    # dropped because an empty string is a substring of every URL.
    realbrsites = [line.strip() for line in file if line.strip()]
LEGITM_SITES = realNewsWebsites + realbrsites

# Evaluation corpora: Fake.br (pt_BR) and LIAR (en_US).
full_text_fakebrcorpus = pd.read_csv(f"{DATASET_DIR}/Fakebr/full_text_fakebrcorpus.csv")
liar_full = pd.read_csv(f'{DATASET_DIR}/LIAR/liar_train.csv')

# Environment Setup

In [None]:
!python --version

In [None]:
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth==2025.1.1
!pip install pyarrow==17.0.0
!pip install transformers==4.47.1
!pip install torch==2.5.1
!pip install sklearn==1.6.0

In [None]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay

# Credentials read by web_search(). setdefault only fills in an empty placeholder
# when the variable is not already set, so real values must be provided through
# the environment before running the experiments.
os.environ.setdefault("GOOGLE_API_KEY", "")
os.environ.setdefault("CSE_ID", "")

# Functions

In [None]:
def plot(confusion_matrix, title):
    """Render a 2x2 confusion matrix with "Fake News"/"True News" tick labels.

    Parameters:
    - confusion_matrix: matrix handed to ConfusionMatrixDisplay.
    - title (str): title drawn above the figure.
    """
    class_names = ["Fake News", "True News"]
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix,
        display_labels=class_names,
    ).plot()
    plt.title(title)
    plt.show()


def evaluate_assistant_answer(true_labels, predicted_labels):
    """Score predicted labels against the reference labels.

    Parameters:
    - true_labels: object whose 'label' entry holds the reference labels.
    - predicted_labels: object whose 'label' entry holds the predicted labels.

    Returns:
    - cm: confusion matrix over labels [0, 1], normalized over the true rows.
    - metrics (dict): accuracy, per-class precision/recall/F1 (suffix "_f" for
      class 0 = fake news, "_t" for class 1 = true news) and micro/macro F1,
      all rounded to 4 decimals.
    """
    expected = list(true_labels['label'])
    predicted = list(predicted_labels['label'])

    cm = confusion_matrix(expected, predicted, labels=[0,1], normalize='true')

    scores = {"accuracy": accuracy_score(expected, predicted)}

    # pos_label selects which class counts as "positive" for the per-class scores.
    for suffix, positive_class in (("f", 0), ("t", 1)):
        scores[f"precision_{suffix}"] = precision_score(expected, predicted, pos_label=positive_class)
        scores[f"recall_{suffix}"] = recall_score(expected, predicted, pos_label=positive_class)
        scores[f"f1_{suffix}"] = f1_score(expected, predicted, pos_label=positive_class)

    for averaging in ("micro", "macro"):
        scores[f"f1_{averaging}"] = f1_score(expected, predicted, average=averaging)

    metrics = {name: round(value, 4) for name, value in scores.items()}

    return cm, metrics

In [None]:
import requests
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, TextIteratorStreamer
from unsloth import FastLanguageModel

# Patterns used to pull the labelled fields out of the model's free-text answer.
# Group 1 is the field label (English or Portuguese); group 2 is the value that
# the callers read with .group(2). `.` does not match newlines, so each value
# stops at the end of its line.
REGEX_EXPLANATION = r"(Explanation|Explicação):\s*(.*)"
REGEX_ANSWER = r"(Answer|Resposta):\s*(\d+)"
REGEX_QUERY = r"(Query|Consulta):\s*(.*)"
REGEX_COMPRESSED_INFO = r"(Compressed information|Informação comprimida):\s*(.*)"
# Only matches when the number is followed by a percent sign.
REGEX_CONFIDENCE = r"(Confidence|Confiança):\s*(\d+)%"
# Same pattern as REGEX_QUERY; used for the output of the HyDE paraphrase prompt.
REGEX_PARAPHRASED = r"(Query|Consulta):\s*(.*)"


# Reasoning prompt templates keyed by language code. Each template is filled
# with {search_text} (the claim) and {context_str} (the retrieved evidence) and
# asks for the fields parsed by the REGEX_* patterns.
STEP_CHECK_PROMPT = {
"en_US": '''Instructions: Based on the provided web search results, analyze in english whether the
information has enough evidence to decide whether the claim is true or false.
Claim: {search_text}
Web search result: {context_str}
Respond "1" if this claim is true, respond "0" if this claim is false.
If you think you need more information, respond "2".
You must to provide your evaluation of the claim in following format:
Explanation: [Explain why you make this judgement.]
Answer: [0/1/2]
Query: [paraphrase the statement so that it is more suitable for websearch engines to search for evidence.]
Compressed information: [Compressed information]
Confidence: [0~100%]
Don't forget to provide the Confidence.
''',

"pt_BR": '''Instruções: Com base nos resultados da pesquisa na web fornecidos, analise em português
se as informações possuem evidências suficientes para decidir se a notícia é verdadeira ou falsa.
Notícia: {search_text}
Resultado da pesquisa na web: {context_str}
Responda "1" se a notícia for verdadeira e "0" se a notícia for falsa.
Se você achar que precisa de mais informações, responda "2".
Você deve fornecer sua avaliação da notícia no seguinte formato:
Explicação: [Explique por que você fez esse julgamento.]
Resposta: [0/1/2]
Consulta: [Parafrasear a notícia para que ela seja mais adequada para os mecanismos de busca na web procurarem por evidências.]
Informação comprimida: [Informação resumida]
Confiança: [0~100%]
Não esqueça de fornecer a Confiança.
'''}

# Paraphrase prompt templates keyed by language code. Each template is filled
# with {search_text} (the claim) and asks for a single "Query:"/"Consulta:" line,
# which is parsed with REGEX_PARAPHRASED.
HyDE_PROMPT = {
"en_US": '''Instructions: Given a claim, I want to verify the truth or falsehood of the
claim, help me paraphrase the statement so that it is more suitable for websearch engines to search for evidence.
Claim: {search_text}
Please provide your output in following format:
Query: [paraphrase the statement so that it is more suitable for websearch engines to search for evidence.]''',

"pt_BR": '''Instruções: Dada uma notícia, eu quero verificar a verdade ou falsidade da notícia,
ajude-me a parafrasear a declaração para que ela seja mais adequada para os mecanismos de busca na web procurarem por evidências.
Notícia: {search_text}
Por favor, forneça sua resposta seguinte formato:
Consulta: [Parafrasear a declaração para que ela seja mais adequada para os mecanismos de busca na web procurarem por evidências.]'''
}

# Last raw model answer. STEEL_HyDE.reasoning_module overwrites it on every call
# and run_experiments stores it in the result row when a claim fails.
OUTPUT = ""

class Model:
    """Wrapper around a chat language model and its tokenizer.

    Every request is sent as a two-message chat: a fixed system prompt followed
    by the caller's message.
    """

    def __init__(self, model, tokenizer, system_prompt="You are a helpful assistant"):
        """Store the model/tokenizer pair and prepare the fixed chat prefix.

        Parameters:
        - model: language model exposing `generate` and `device`.
        - tokenizer: matching tokenizer exposing `apply_chat_template`,
          `decode`, `eos_token_id` and `convert_tokens_to_ids`.
        - system_prompt (str): content of the system message sent with every request.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.max_new_tokens = 2048
        self.system_prompt = system_prompt
        self.messages_base = {"role": "system", "content": system_prompt}
        # Generation stops on either the tokenizer's EOS token or "<|endoftext|>".
        self.terminators = [
            self.tokenizer.eos_token_id,
            self.tokenizer.convert_tokens_to_ids("<|endoftext|>"),
        ]

    def generate_message(self, messages):
        """Generate the assistant reply for one chat message.

        Parameters:
        - messages (dict): a single chat message, e.g. {"role": "user", "content": ...}.

        Returns:
        - str: the decoded newly generated tokens (prompt tokens removed,
          special tokens skipped).
        """
        chat = [self.messages_base, messages]

        input_ids = self.tokenizer.apply_chat_template(
            chat,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)

        # Greedy decoding (do_sample=False). The sampling knobs temperature/top_p
        # were dropped because they have no effect without sampling, and the
        # TextIteratorStreamer was dropped because nothing ever read its queue.
        outputs = self.model.generate(
            input_ids,
            max_new_tokens=self.max_new_tokens,
            eos_token_id=self.terminators,
            pad_token_id=self.tokenizer.eos_token_id,
            do_sample=False,
            use_cache=True,
        )

        # Keep only the tokens produced after the prompt.
        response = outputs[0][input_ids.shape[-1]:]
        return self.tokenizer.decode(response, skip_special_tokens=True)

    def set_for_inference(self):
        """Switch the wrapped model to unsloth's inference mode."""
        FastLanguageModel.for_inference(self.model)


class STEEL_HyDE:
    """Iterative retrieve-and-reason fact checker with HyDE-style re-ranking.

    For each claim the agent paraphrases the claim once, then for up to
    MAX_ITERATIONS rounds: searches the web, ranks the hits by embedding
    similarity to the paraphrase, and asks the LLM for a verdict.
    """

    EMBEDDING_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
    MAX_ITERATIONS = 3          # maximum retrieve/reason rounds per claim
    TOP_K_DOCUMENTS = 3         # number of best-ranked search hits kept as evidence
    CONFIDENCE_THRESHOLD = 50   # scaled confidence above which the loop stops

    def __init__(self, agent: "Model", language):
        """
        Parameters:
        - agent: LLM wrapper exposing `generate_message`, `tokenizer` and `model`.
        - language (str): key into the prompt dictionaries ("en_US" or "pt_BR").
        """
        self.agent = agent
        self.language = language
        self._embedding_model = None  # loaded lazily on first use

    def calculate_confidence(self, Conf):
        """Scale the model-reported confidence by the fixed factor 0.7."""
        B = 0.7
        alpha = B * Conf

        return alpha

    def _get_embedding_model(self):
        """Return the sentence-embedding model, loading it only once per instance."""
        if getattr(self, "_embedding_model", None) is None:
            self._embedding_model = SentenceTransformer(self.EMBEDDING_MODEL_NAME)
        return self._embedding_model

    @staticmethod
    def _extract_field(pattern, text, field_name):
        """Return the stripped second capture group of `pattern` in `text`.

        Raises:
        - ValueError: when the pattern does not match, naming the missing field.
        """
        match = re.search(pattern, text)
        if match is None:
            raise ValueError(f"Model output has no parsable '{field_name}' field")
        return match.group(2).strip()

    @staticmethod
    def _normalize_query(query):
        """Turn a follow-up query (string or list of strings) into one search string."""
        if isinstance(query, str):
            return query
        return ", ".join(str(part) for part in query)

    def paraphrase(self, claim):
        """Ask the LLM to rewrite the claim as a search-engine-friendly query.

        Falls back to the claim itself when the answer contains no
        "Query:"/"Consulta:" line or that line is empty.
        """
        hyde_prompt = HyDE_PROMPT[self.language]
        message = {"role": "user", "content": hyde_prompt.format(search_text=claim)}
        generated = self.agent.generate_message(message)
        match = re.search(REGEX_PARAPHRASED, generated)
        paraphrased = match.group(2).strip() if match else ""
        return paraphrased or claim

    def HyDE(self, claim, web_search_results, paraphrase_claim=None):
        """Score each search result by cosine similarity to the paraphrased claim.

        Parameters:
        - claim (str): the original claim; paraphrased here when no paraphrase is given.
        - web_search_results (list of str): documents to score.
        - paraphrase_claim (str, optional): precomputed paraphrase, to avoid
          another LLM call.

        Returns:
        - list of float: one similarity per document, in input order.
        """
        if paraphrase_claim is None:
            paraphrase_claim = self.paraphrase(claim)
        if not web_search_results:
            return []

        embedding_model = self._get_embedding_model()
        web_results_embedding = embedding_model.encode(web_search_results)
        paraphrase_claim_embedding = embedding_model.encode(paraphrase_claim)

        # cosine_similarity normalizes its inputs, so no explicit L2 step is needed.
        similarities = cosine_similarity(
            web_results_embedding,
            paraphrase_claim_embedding.reshape(1, -1),
        )
        return [float(value) for value in similarities[:, 0]]

    def retrieval_module(self, claim, query, paraphrase_claim):
        """Search the web for `query` and return the best evidence that fits the context.

        Parameters:
        - claim (str): the claim under verification.
        - query (str or list of str): search query for this round.
        - paraphrase_claim (str or None): paraphrase used to rank the hits.

        Returns:
        - list of str: evidence snippets, or [] when the search returned nothing.
        """
        web_search_results = web_search(self._normalize_query(query))
        if not web_search_results:
            return []

        list_of_similarities = self.HyDE(claim, web_search_results, paraphrase_claim)
        ranked = sorted(zip(web_search_results, list_of_similarities), key=lambda pair: pair[1], reverse=True)
        top_k_results = [document for document, _ in ranked[:self.TOP_K_DOCUMENTS]]

        # Token budget left for evidence once the claim itself is accounted for.
        claim_tokens_length = len(self.agent.tokenizer.encode(claim, truncation=False))
        max_context_length = self.agent.model.max_seq_length - claim_tokens_length
        return filter_retrieved_documents(top_k_results, self.agent.tokenizer, max_context_length)

    def reasoning_module(self, claim, evidences, prompt):
        """Ask the LLM for a verdict on the claim and parse its structured answer.

        Parameters:
        - claim (str): the claim under verification.
        - evidences (list of str): evidence snippets, joined with newlines.
        - prompt (str): template with {search_text} and {context_str} placeholders.

        Returns:
        - output (str): raw model answer.
        - output_map (dict): explanation, answer, query (list of str),
          compressed_info and scaled confidence.

        Raises:
        - ValueError: when one of the expected fields is missing from the answer.
        """
        global OUTPUT
        evidences = "\n".join(evidences) if evidences else ""
        message = {"role": "user", "content": prompt.format(search_text=claim, context_str=evidences)}
        output = self.agent.generate_message(message)
        OUTPUT = output  # kept so run_experiments can log the raw answer on failure
        print(output)

        explanation = self._extract_field(REGEX_EXPLANATION, output, "explanation")
        answer = self._extract_field(REGEX_ANSWER, output, "answer")
        query = self._extract_field(REGEX_QUERY, output, "query")
        compressed_info = self._extract_field(REGEX_COMPRESSED_INFO, output, "compressed information")
        confidence = self._extract_field(REGEX_CONFIDENCE, output, "confidence")

        real_confidence = self.calculate_confidence(float(confidence))
        research_query = [string.strip('"') for string in query.strip('[]').split(', ')]

        output_map = {
            "explanation": explanation,
            "answer": answer,
            "query": research_query,
            "compressed_info": compressed_info,
            "confidence": real_confidence
        }

        return output, output_map

    def answer(self, claim):
        """Classify one claim with up to MAX_ITERATIONS retrieve/reason rounds.

        Returns:
        - tuple: (answer, confidence, explanation, raw_output) from the last round.
        """
        step = 1
        query = claim  # In the first iteration the claim itself is the web search query
        compressed_info = ""
        # Paraphrase once per claim; HyDE() needs search results, so it cannot be
        # called with the claim alone.
        paraphrase_claim = self.paraphrase(claim)
        prompt = STEP_CHECK_PROMPT[self.language]
        while step <= self.MAX_ITERATIONS:
            print(f"Iteration {step}")
            evidences = self.retrieval_module(claim=claim, query=query, paraphrase_claim=paraphrase_claim)
            if compressed_info:
                # Carry the previous round's summary forward as extra evidence.
                evidences = evidences + [compressed_info]
            output, output_map = self.reasoning_module(claim, evidences, prompt)

            confidence, query, compressed_info = output_map["confidence"], output_map["query"], output_map["compressed_info"]
            step += 1

            # "2" means the model asked for more information: search again if rounds remain.
            if output_map["answer"] == '2' and step <= self.MAX_ITERATIONS:
                continue
            if confidence > self.CONFIDENCE_THRESHOLD:
                break

        return output_map["answer"], confidence, output_map["explanation"], output



def extract_highly_similar_fragments(doc, tokenizer, max_fragment_length):
    """
    Break a document into ". "-separated pieces, each capped at a token limit.

    Parameters:
    - doc (str): The document to process.
    - tokenizer: The tokenizer used for the LLM.
    - max_fragment_length (int): The maximum token length for a fragment.

    Returns:
    - fragments (list of str): One fragment per piece, in document order. Pieces
      longer than the limit are cut to their first max_fragment_length tokens
      and decoded back to text.
    """
    def _cap(piece):
        token_ids = tokenizer.encode(piece, truncation=False)
        if len(token_ids) > max_fragment_length:
            # Over-long piece: keep only the leading tokens.
            return tokenizer.decode(token_ids[:max_fragment_length])
        return piece

    return [_cap(piece) for piece in doc.split(". ")]


def filter_retrieved_documents(retrieved_docs, tokenizer, max_context_length):
    """
    Select documents (or fragments of them) that fit a total token budget.

    Parameters:
    - retrieved_docs (list of str): The documents retrieved by the retriever.
    - tokenizer: The tokenizer used for the LLM.
    - max_context_length (int): The maximum context length allowed by the LLM.

    Returns:
    - adjusted_docs (list of str): Documents and fragments, in input order,
      whose combined token count stays within max_context_length.
    """
    kept = []
    used_tokens = 0

    for document in retrieved_docs:
        document_size = len(tokenizer.encode(document, truncation=False))

        # Whole document still fits in the remaining budget.
        if used_tokens + document_size <= max_context_length:
            kept.append(document)
            used_tokens += document_size
            continue

        # It would fit on its own but not on top of what was already kept: stop.
        if document_size <= max_context_length:
            break

        # Larger than the entire budget: add its fragments until the budget is hit.
        for fragment in extract_highly_similar_fragments(document, tokenizer, max_context_length):
            fragment_size = len(tokenizer.encode(fragment, truncation=False))
            if used_tokens + fragment_size > max_context_length:
                break
            kept.append(fragment)
            used_tokens += fragment_size

    return kept


def web_search(query, timeout=30):
    """Query the Google Custom Search API and keep hits from whitelisted sites.

    Parameters:
    - query (str or list of str): search text. A list (as produced for follow-up
      queries) is joined into one string. Only the first 100 characters are sent.
    - timeout (float): seconds to wait for the API before giving up, so a
      stalled request cannot hang the whole experiment.

    Returns:
    - list of str: one "Source [i] <host> \\n<snippet>" entry per result whose
      link contains an entry of LEGITM_SITES. Empty when the response has no
      "items".
    """
    if not isinstance(query, str):
        # Slicing a list would send its first 100 elements, not a 100-character query.
        query = ", ".join(str(part) for part in query)

    search_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": os.environ.get("GOOGLE_API_KEY"),
        "cx": os.environ.get("CSE_ID"),
        "q": query[:100],
        "num": 10  # Maximum number of results
    }

    response = requests.get(search_url, params=params, timeout=timeout)
    results = response.json()

    filtered_results = []
    for i, item in enumerate(results.get("items", [])):
      link = item.get("link", "")
      if any(site in link for site in LEGITM_SITES):
        snippet = item.get("snippet", "")
        # The third "/"-separated part of "scheme://host/path" is the host.
        source_str = f"Source [{i+1}] {link.split('/')[2]}"
        document = source_str + f" \n{snippet}"
        filtered_results.append(document)

    return filtered_results

def run_experiments(agent, dataset):
    """Classify every claim in `dataset` and collect the results in a DataFrame.

    Parameters:
    - agent: detector exposing `language` and `answer(claim)`; `answer` returns
      (answer, confidence, explanation, output).
    - dataset: object whose "text" entry holds the claims.

    Returns:
    - pandas.DataFrame with columns label, confidence, explanation, output and
      claim, one row per claim in input order. A claim whose processing raises
      is recorded as label "0", confidence "0", explanation "CHINESE OUTPUT",
      together with the last raw model output produced for that claim (empty if
      none).
    """
    global OUTPUT
    columns = ["label", "confidence", "explanation", "output", "claim"]
    claims = list(dataset["text"])
    print(f"Total of claims to classify: {len(claims)}")
    print(f"{STEP_CHECK_PROMPT[agent.language]}\n")

    rows = []
    for i, claim in enumerate(claims):
      # Reset so a failure before any generation does not log the previous claim's output.
      OUTPUT = ""
      try:
          print(f"Running claim number [{i+1}]: {claim}")
          answer, confidence, explanation, output = agent.answer(claim)
          new_row = [answer, confidence, explanation, output, claim]
          print('\n')
      except Exception as error:
          # `Exception` (not a bare except) lets KeyboardInterrupt stop the run;
          # the cause is printed so systematic failures are not silently hidden.
          print(f"Claim number [{i+1}] failed with {type(error).__name__}: {error}")
          new_row = ["0", "0", "CHINESE OUTPUT", OUTPUT, claim]

      rows.append(new_row)

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns, dtype=object)

# Model load

In [None]:
from unsloth import FastLanguageModel

# Load configuration for the base model; every value below is passed straight
# to FastLanguageModel.from_pretrained.
base_model_id = "Qwen/Qwen2.5-7B-Instruct"
max_seq_length = 4096
dtype = None
load_in_4bit = True
device_map = "auto"

# Load the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map=device_map,
)

In [None]:
model.device

# Experiments

## enUS

In [None]:
# Build the English detector: wrap the loaded model with an English system
# prompt, switch it to inference mode and select the "en_US" prompt templates.
SYSTEM_PROMPT = "You are an assistant specialized in analyzing and identifying fake or true news."
agent = Model(model, tokenizer, SYSTEM_PROMPT)
agent.set_for_inference()
fake_detector = STEEL_HyDE(agent, "en_US")

In [None]:
# Classify LIAR slice by slice, writing one CSV per slice so an interrupted
# session can resume. Slice boundaries and file names are the ones the
# "full_results" cell reads back.
ENUS_CHUNK_BOUNDS = [0, 100, 284, 300, 398, 500] + list(range(700, 7101, 200))
ENUS_CHUNKS = [
    (start, stop, f"{start}{stop}")
    for start, stop in zip(ENUS_CHUNK_BOUNDS, ENUS_CHUNK_BOUNDS[1:])
]
# The final slice runs to the end of the dataset; its file name is fixed.
ENUS_CHUNKS.append((7100, None, "71007342"))

# Set to True to recompute slices whose CSV already exists.
ENUS_OVERWRITE_EXISTING = False

for start, stop, suffix in ENUS_CHUNKS:
    chunk_path = os.path.join(RESULTS_DIR, f"raw_enUS_STEEL_HyDE_{suffix}.csv")
    if os.path.exists(chunk_path) and not ENUS_OVERWRITE_EXISTING:
        print(f"Skipping rows [{start}:{stop}]: {chunk_path} already exists")
        continue
    outputs = run_experiments(fake_detector, liar_full[start:stop])
    outputs.to_csv(chunk_path)

### full_results

In [None]:
# Show which enUS STEEL_HyDE result files are present in the results folder.
files = os.listdir(RESULTS_DIR)
enUS_STEEL_files = [file for file in files if "enUS_STEEL_HyDE" in file]
enUS_STEEL_files

In [None]:
# Slice boundaries of the per-chunk result files, in dataset order. Consecutive
# pairs give the file suffixes ("0100", "100284", ..., "71007342"). The order
# matters because the merged rows are later compared positionally with the
# dataset labels.
ENUS_RESULT_BOUNDS = [0, 100, 284, 300, 398, 500] + list(range(700, 7101, 200)) + [7342]
ENUS_RESULT_SUFFIXES = [
    f"{start}{stop}"
    for start, stop in zip(ENUS_RESULT_BOUNDS, ENUS_RESULT_BOUNDS[1:])
]

# Read every chunk and stack them in order.
enUS_STEEL_chunks = [
    pd.read_csv(os.path.join(RESULTS_DIR, f"raw_enUS_STEEL_HyDE_{suffix}.csv"))
    for suffix in ENUS_RESULT_SUFFIXES
]
enUS_STEEL_results = pd.concat(enUS_STEEL_chunks)

enUS_STEEL_results.to_csv(os.path.join(RESULTS_DIR, "raw_enUS_STEEL_HyDE_full.csv"), index=False)

In [None]:
len(enUS_STEEL_results)

In [None]:
# Reload the merged enUS results from disk.
df = pd.read_csv(os.path.join(RESULTS_DIR, "raw_enUS_STEEL_HyDE_full.csv"), index_col=False)
# Label 2 is the "need more information" answer from the prompt; it is mapped to
# 0 so only the classes 0 and 1 remain for scoring.
df['label'] = df['label'].replace({2:0})
results_enUS_df = df

In [None]:
cm, metrics = evaluate_assistant_answer(liar_full, results_enUS_df)
cm, metrics

In [None]:
plot(cm, "STEEL_HyDE")

## ptBR

In [None]:
# Build the Portuguese detector: same loaded model, Portuguese system prompt and
# the "pt_BR" prompt templates. This rebinds `agent` and `fake_detector`.
SYSTEM_PROMPT = "Você é um assistente especializado em analisar e identificar notícias falsas ou verdadeiras."
agent = Model(model, tokenizer, SYSTEM_PROMPT)
agent.set_for_inference()
fake_detector = STEEL_HyDE(agent, "pt_BR")

### 1100

In [None]:
# NOTE(review): the slice covers rows 0-99 but the file name ends in "064", and
# a separate cell re-runs rows 63-99 — confirm which row range this file is
# meant to hold before merging the ptBR results.
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[0:100])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_064.csv"))

In [None]:
# Rows 63-99 of Fake.br.
# NOTE(review): if the "064" file holds rows 0-63, row 63 appears in both files — verify.
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[63:100])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_63100.csv"))

In [None]:
# Single-row run on row 63 only; the result is displayed, not saved.
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[63:64])
outputs

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[100:300])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_100300.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[300:500])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_300500.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[500:700])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_500700.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[700:900])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_700900.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[900:1100])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_9001100.csv"))

2100

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[1100:1300])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_11001300.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[1300:1500])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_13001500.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[1500:1700])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_15001700.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[1700:1900])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_17001900.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[1900:2100])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_19002100.csv"))

### 3100

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[2100:2300])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_21002300.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[2300:2500])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_23002500.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[2500:2700])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_25002700.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[2700:2900])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_27002900.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[2900:3100])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_29003100.csv"))

### 4100

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[3100:3300])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_31003300.csv"))

In [None]:
outputs = run_experiments(fake_detector, full_text_fakebrcorpus[3300:3500])
outputs.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_33003500.csv"))

In [None]:
# Fake.br chunk [3500, 3700): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[3500:3700]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_35003700.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [3700, 3900): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[3700:3900]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_37003900.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [3900, 4100): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[3900:4100]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_39004100.csv")
outputs.to_csv(chunk_csv)

### Rows 4100-5100

In [None]:
# Fake.br chunk [4100, 4300): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[4100:4300]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_41004300.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [4300, 4500): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[4300:4500]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_43004500.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [4500, 4700): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[4500:4700]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_45004700.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [4700, 4900): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[4700:4900]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_47004900.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [4900, 5100): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[4900:5100]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_49005100.csv")
outputs.to_csv(chunk_csv)

### Rows 5100-6100

In [None]:
# Fake.br chunk [5100, 5300): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[5100:5300]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_51005300.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [5300, 5500): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[5300:5500]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_53005500.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [5500, 5700): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[5500:5700]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_55005700.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [5700, 5900): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[5700:5900]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_57005900.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [5900, 6100): run the detector on this slice, then save its raw outputs.
chunk_rows = full_text_fakebrcorpus[5900:6100]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_59006100.csv")
outputs.to_csv(chunk_csv)

### Rows 6100-end

In [None]:
# Fake.br chunk [6100, 6300): run the detector on this slice, then save its raw outputs.
# Status: DONE (author's marker — presumably this chunk has already been executed).
chunk_rows = full_text_fakebrcorpus[6100:6300]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_61006300.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [6300, 6500): run the detector on this slice, then save its raw outputs.
# Status: DONE (author's marker — presumably this chunk has already been executed).
chunk_rows = full_text_fakebrcorpus[6300:6500]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_63006500.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [6500, 6700): run the detector on this slice, then save its raw outputs.
# Status: DONE (author's marker — presumably this chunk has already been executed).
chunk_rows = full_text_fakebrcorpus[6500:6700]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_65006700.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [6700, 6900): run the detector on this slice, then save its raw outputs.
# Status: DONE (author's marker — presumably this chunk has already been executed).
chunk_rows = full_text_fakebrcorpus[6700:6900]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_67006900.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Fake.br chunk [6900, 7100): run the detector on this slice, then save its raw outputs.
# Status: DONE (author's marker — presumably this chunk has already been executed).
chunk_rows = full_text_fakebrcorpus[6900:7100]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_69007100.csv")
outputs.to_csv(chunk_csv)

In [None]:
# Final Fake.br chunk: every row from position 7100 to the end of the corpus.
# NOTE(review): the file name says 7100-7200, which assumes the corpus has 7200 rows — confirm.
# Status: DONE (author's marker — presumably this chunk has already been executed).
chunk_rows = full_text_fakebrcorpus[7100:]
outputs = run_experiments(fake_detector, chunk_rows)
chunk_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_71007200.csv")
outputs.to_csv(chunk_csv)

### Full results

In [None]:
# Fold label 2 into class 0 so only the two classes scored by the evaluation remain.
# NOTE(review): what label 2 stands for is not visible here — confirm it should count as class 0.
label_remap = {2: 0}
df['label'] = df['label'].replace(label_remap)

In [None]:
# Inspect the rows currently labelled 0.
df.loc[df['label'].eq(0)]

In [None]:
# Give `df` a language-specific name for the evaluation below. Plain assignment:
# both names refer to the same DataFrame object (no copy is made).
results_enUS_df = df

In [None]:
# Score the en-US predictions against the `label` column of the LIAR frame.
# The two `label` columns are paired by position, so both frames must be in the same row order.
cm, metrics = evaluate_assistant_answer(
    true_labels=liar_full,
    predicted_labels=results_enUS_df,
)
(cm, metrics)

In [None]:
# Render the confusion matrix computed in the previous cell.
plot(confusion_matrix=cm, title="STEEL_HyDE")

In [None]:
# Merge every per-chunk pt-BR result file in RESULTS_DIR into a single CSV.
#
# Fixes over the plain `os.listdir` + substring filter:
#   * only files ending in `ptBR_STEEL_HyDE_<digits>.csv` are picked up, so the merged
#     `raw_ptBR_STEEL_HyDE_full.csv` written below is not read back in on a re-run
#     (which would duplicate every row);
#   * files are merged in corpus order rather than in `os.listdir`'s arbitrary order,
#     because the evaluation pairs predicted and true labels by position;
#   * the loop no longer rebinds the global `df` used by the en-US cells.
_CHUNK_FILE_RE = re.compile(r".*ptBR_STEEL_HyDE_(\d+)\.csv")


def _chunk_order_key(filename):
    """Return a sort key that orders chunk files by their starting row.

    Chunk files carry their row bounds written back to back with no separator
    (e.g. ``063``, ``63100``, ``9001100``, ``11001300``). For non-overlapping
    chunks, sorting by (number of digits, numeric value) of that suffix gives
    the same order as sorting by the starting row.

    Args:
        filename: A file name that matches ``_CHUNK_FILE_RE``.

    Returns:
        A ``(digit_count, numeric_value)`` tuple.
    """
    digits = _CHUNK_FILE_RE.fullmatch(filename).group(1)
    return (len(digits), int(digits))


chunk_files = sorted(
    (name for name in os.listdir(RESULTS_DIR) if _CHUNK_FILE_RE.fullmatch(name)),
    key=_chunk_order_key,
)

ptBR_STEEL_dfs = []
for chunk_file in chunk_files:
    chunk_df = pd.read_csv(os.path.join(RESULTS_DIR, chunk_file), index_col=False)
    # Printed per file so a missing or short chunk is easy to spot.
    print(chunk_file, len(chunk_df))
    ptBR_STEEL_dfs.append(chunk_df)

ptBR_STEEL_results = pd.concat(ptBR_STEEL_dfs)
ptBR_STEEL_results.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_full.csv"))

In [None]:
# Load every per-chunk pt-BR result file in corpus order and merge them into one CSV.
# Each suffix below names one `raw_ptBR_STEEL_HyDE_<suffix>.csv` file in RESULTS_DIR.
_chunk_suffixes = [
    "063", "63100", "100300", "300500", "500700", "700900", "9001100",
    "11001300", "13001500", "15001700", "17001900", "19002100",
    "21002300", "23002500", "25002700", "27002900", "29003100",
    "31003300", "33003500", "35003700", "37003900", "39004100",
    "41004300", "43004500", "45004700", "47004900", "49005100",
    "51005300", "53005500", "55005700", "57005900", "59006100",
    "61006300", "63006500", "65006700", "67006900", "69007100",
    "71007200",
]

_chunk_frames = []
for _suffix in _chunk_suffixes:
    _chunk_name = f"raw_ptBR_STEEL_HyDE_{_suffix}"
    _chunk_frame = pd.read_csv(os.path.join(RESULTS_DIR, f"{_chunk_name}.csv"))
    # Keep each chunk reachable under its own global name (raw_ptBR_STEEL_HyDE_<suffix>)
    # so any other cell that refers to a single chunk by name keeps working.
    globals()[_chunk_name] = _chunk_frame
    _chunk_frames.append(_chunk_frame)

# Concatenate in list order, then persist the merged frame.
ptBR_STEEL_results = pd.concat(_chunk_frames)
ptBR_STEEL_results.to_csv(os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_full.csv"))

In [None]:
# Reload the merged pt-BR results from disk; `index_col=False` stops pandas from
# using the first column as the index.
full_results_csv = os.path.join(RESULTS_DIR, "raw_ptBR_STEEL_HyDE_full.csv")
ptBR_STEEL_results = pd.read_csv(full_results_csv, index_col=False)

# Fold label 2 into class 0.
# NOTE(review): what label 2 stands for is not visible here — confirm it should count as class 0.
label_remap = {2: 0}
ptBR_STEEL_results['label'] = ptBR_STEEL_results['label'].replace(label_remap)

In [None]:
# Inspect the pt-BR rows currently labelled 0.
ptBR_STEEL_results.loc[ptBR_STEEL_results['label'].eq(0)]

In [None]:
# Score the merged pt-BR predictions against the `label` column of the Fake.br corpus.
# The two `label` columns are paired by position, so both frames must be in the same row order.
cm, metrics = evaluate_assistant_answer(
    true_labels=full_text_fakebrcorpus,
    predicted_labels=ptBR_STEEL_results,
)
(cm, metrics)

In [None]:
# Render the confusion matrix computed in the previous cell.
plot(confusion_matrix=cm, title="STEEL_HyDE")