### **Installing some required modules**

In [None]:
!python -m spacy download xx_ent_wiki_sm -q
! python -m spacy download en_core_web_sm -q
! python -m spacy download fr_core_news_sm -q
! python -m spacy download de_core_news_sm -q
! python -m spacy download es_core_news_sm -q
! python -m spacy download it_core_news_sm -q
! python -m spacy download sv_core_news_sm -q
! python -m spacy download fi_core_news_sm -q
! python -m spacy download zh_core_web_sm -q

### **Import Packages**

In [3]:
import os
import re
import requests
from bs4 import BeautifulSoup
import numpy as np
from scipy.stats import pearsonr
import spacy
import openai
from openai import OpenAI
import json

import pandas as pd
from scipy.stats import spearmanr

import argparse
from io import StringIO

#### Retrieving API keys and secrets

In [4]:
# Credentials are read from the environment so that no secret is ever stored in the notebook.
# Export OPENAI_API_KEY (and DEEPSEEK_API_KEY when using DeepSeek) before running this cell.
openai.api_key = os.getenv('OPENAI_API_KEY')
openai_api = openai.api_key
deepseek_api = os.getenv('DEEPSEEK_API_KEY')
# To use the DeepSeek API instead, pass api_key=deepseek_api and
# base_url='https://api.deepseek.com' to OpenAI(...) in the following line.
client = OpenAI(api_key = openai_api )
google_search_api_key = os.getenv('GOOGLE_SEARCH_API_KEY')
google_search_engine = os.getenv('GOOGLE_SEARCH_ENGINE')

# The Google Custom Search API has a limited daily quota per project, so the pipeline can
# rotate through several keys that belong to different projects. Provide them as a
# comma-separated list in GOOGLE_SEARCH_API_KEYS; when that variable is unset the single
# GOOGLE_SEARCH_API_KEY is used, so the list is never empty as long as one key is configured.
apis_lst = [k.strip() for k in os.getenv('GOOGLE_SEARCH_API_KEYS', '').split(',') if k.strip()]
if not apis_lst and google_search_api_key:
    apis_lst = [google_search_api_key]


### **1. Splitting the Response into factual statements**

In [5]:
def split_response_into_facts(client, data_point):
    """
    Ask the LLM to split a model answer into atomic factual statements.

    Args:
        client: OpenAI-compatible client exposing `chat.completions.create`.
        data_point: dict with the key "model_output_text" (the answer to split) and,
            optionally, "lang" (language code inserted into the prompt; defaults to "EN").

    Returns:
        A list of dicts shaped like
        {"factual_statement_i": ..., "original_substring_i": ...}.
        An empty list is returned when the API call fails or when the reply cannot be
        parsed as a JSON array, so callers can always iterate over the result.
    """
    text = data_point["model_output_text"].strip()
    # Callers elsewhere in this notebook default a missing language to "EN"; do the same here
    # instead of raising KeyError.
    language = data_point.get("lang", "EN")

    # Prompt includes emphasis to separate short words or phrases that imply a claim
    # and clarifies the text could be in any language
    split_prompt = f'''
You are a multi-lingual text analysis assistant. The provided text is in the {language} language.

Text to analyze:
"""{text}"""

**Task**:
1. Split this text into atomic, independent statements or claims.
   - This includes short words or phrases that might carry a claim (e.g., "Yes", "No", "Certainly", "Indeed", in English
     "نعم", "لا" in Arabic, "Bai", "Ez" in Basque "Sí", "No" in Spanish or Catalan
     "Ano", "Ne" in Czech, "Ja", "Nein" in German, "是", "不是" in Chinese
     "हाँ", "नहीं" in Hindi,  "بله", "خیر" in Farsi, "Kyllä", "Ei" in Finnish
     "Oui", "Non" in French or their equivalents in other languages.)

   - If a short phrase is attached to a longer statement but clearly provides its own  claim, separate it.

2. Output a JSON array of objects. Each object should have:
   - "factual_statement_i": the i-th extracted statement.
   - "original_substring_i": the exact substring from the text with punctuation as in the original provided text .

3. Return only valid JSON. No extra commentary or markdown or backticks.

IMPORTANT:
- Retrieve the "original_substring_i" axactly as is in the original text, including purely leading/trailing punctuation,
  spacing, and capitalization. DO NOT modify, rephrase or alter the original text in anyway.
  Pay attention to skip characters always consider skip characters.
- Do not merge short claim-words with other sentences unless they are obviously part of the same statement.
- Number statements sequentially using "i", i is numeric in "factual_statement_i" and "original_substring_i" must be integer indices (1,2,3,...) such as original_substring_1, factual_statement_2, original_substring_3, ...

'''

    try:
        chat_completion = client.chat.completions.create(
            model="gpt-4o-2024-11-20",
            messages=[{"role": "user", "content": split_prompt}],
            temperature=0,
            max_tokens=1500
        )
    except Exception as e:
        # Same best-effort policy as the verification step: report and continue.
        print(f"Error during OpenAI API call: {e}")
        return []

    # `content` can be None (e.g. on a refusal); treat that as an empty reply.
    response_text = (chat_completion.choices[0].message.content or "").strip()

    # Despite the instructions, models sometimes wrap the JSON in a markdown code fence;
    # unwrap it so a valid answer is not discarded.
    fenced = re.match(r"^```[A-Za-z]*\s*(.*?)\s*```$", response_text, re.DOTALL)
    if fenced:
        response_text = fenced.group(1)

    try:
        statements_json = json.loads(response_text)
    except json.JSONDecodeError as e:
        # If for some reason parsing fails, fallback to empty (but say so).
        print(f"Could not parse the split response as JSON: {e}")
        return []

    # A single object is accepted as a one-element array; anything else is unusable.
    if isinstance(statements_json, dict):
        statements_json = [statements_json]
    if not isinstance(statements_json, list):
        print("Unexpected split response format: Expected a list.")
        return []

    return statements_json


### 2. **RAG with the retrieved context from searching Google**

In [8]:
class GoogleSearchToolSpec:
    """
    Small client for the Google Custom Search JSON API with API-key rotation.

    Attributes:
        key: single API key, used only when `apis_lst` contains no usable key.
        engine: Custom Search engine id (sent as the `cx` parameter).
        apis_lst: list of API keys to rotate through when a quota is exhausted.
        apis_idx: index of the key currently in use.
    """

    BASE_URL = "https://customsearch.googleapis.com/customsearch/v1"
    # Seconds to wait for the search endpoint before giving up.
    REQUEST_TIMEOUT = 10

    # Maps from the `lr` value to hl (UI language) and gl (country code)
    GOOGLE_HL_MAP = {
        'lang_fr': 'fr',
        'lang_en': 'en',
        'lang_de': 'de',
        'lang_es': 'es',
        'lang_it': 'it',
        'lang_fi': 'fi',
        'lang_sv': 'sv',
        'lang_ar': 'ar',
        'lang_hi': 'hi',
        'lang_ZH-CN': 'zh',
        'lang_eu': 'eu',   # Basque
        'lang_ca': 'ca',   # Catalan
        'lang_cs': 'cs',   # Czech
        'lang_fa': 'fa'    # Farsi
    }

    GOOGLE_GL_MAP = {
        'lang_fr': 'fr',
        'lang_en': 'us',
        'lang_de': 'de',
        'lang_es': 'es',
        'lang_it': 'it',
        'lang_fi': 'fi',
        'lang_sv': 'se',
        'lang_ar': 'sa',
        'lang_hi': 'in',
        'lang_ZH-CN': 'cn',
        'lang_eu': 'es',   # Basque
        'lang_ca': 'es',   # Catalan
        'lang_cs': 'cz',   # Czech
        'lang_fa': 'ir'    # Farsi
    }

    def __init__(self, key, engine, apis_lst):
        self.key = key
        self.engine = engine
        self.apis_lst =  apis_lst
        self.apis_idx = 0

    def _available_keys(self):
        """Return the usable API keys: the rotation list, or the single `key` if that list is empty."""
        keys = [k for k in (self.apis_lst or []) if k]
        if not keys and self.key:
            keys = [self.key]
        return keys

    def google_search(self, query, language=None):
        """
        Run one Custom Search query.

        Args:
            query: search string (sent as `q`).
            language: optional `lr` value such as "lang_fr"; when given, `hl` and `gl`
                are also set to reinforce the language signal.

        Returns:
            `[response]` (a one-element list holding the HTTP response) on success, or
            `[]` when no key is configured, every key has hit its quota, or the request
            fails for another reason.
        """
        keys = self._available_keys()
        if not keys:
            print("No Google Custom Search API key configured.")
            return []

        params = {
            "cx": self.engine,
            "q": query
        }
        if language:
            params["lr"] = language  # e.g. "lang_fr"
            params["hl"] = self.GOOGLE_HL_MAP.get(language, "en")
            params["gl"] = self.GOOGLE_GL_MAP.get(language, "us")

        self.apis_idx %= len(keys)
        # Try each key at most once per call so exhausted quotas cannot loop forever.
        for _ in range(len(keys)):
            params["key"] = keys[self.apis_idx]
            try:
                response = requests.get(self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()  # Raises HTTPError for bad responses
                return [response]
            except requests.exceptions.HTTPError as http_err:
                # `response` is bound here: HTTPError is raised by raise_for_status().
                quota_hit = (
                    response.status_code == 429
                    or "Quota exceeded" in response.text
                    or "Too Many Requests" in response.text
                )
                if not quota_hit:
                    print(f"HTTP error occurred: {http_err} - {response.text}")
                    return []
                self.apis_idx = (self.apis_idx + 1) % len(keys)
                print("Switching to next key...")
            except requests.exceptions.RequestException as e:
                print(f"HTTP Request failed: {e}")
                return []

        print("All Google Custom Search API keys have exhausted their quota.")
        return []

# Module-level search client; perform_search() below calls google_search_tool.google_search().
google_search_tool = GoogleSearchToolSpec(key=google_search_api_key, engine=google_search_engine,apis_lst=apis_lst)

##### The first fallback, used when the search returns no context, is to repeat the search after extracting keywords from the user query

In [9]:
def get_spacy_model(language_code):
    """
    Map a language code to the name of the spaCy model used for it.

    The lookup is case-insensitive. Languages without a dedicated model here
    (Arabic, Hindi, Basque, Catalan, Czech, Farsi) and any unknown code resolve
    to the multilingual 'xx_ent_wiki_sm' model.
    """
    multilingual_fallback = 'xx_ent_wiki_sm'

    dedicated_models = {
        'ZH': 'zh_core_web_sm',
        'EN': 'en_core_web_sm',
        'FI': 'fi_core_news_sm',
        'FR': 'fr_core_news_sm',
        'DE': 'de_core_news_sm',
        'IT': 'it_core_news_sm',
        'ES': 'es_core_news_sm',
        'SV': 'sv_core_news_sm',
    }

    normalized_code = language_code.upper()  # Ensuring uppercase
    if normalized_code in dedicated_models:
        return dedicated_models[normalized_code]
    return multilingual_fallback



def extract_keywords(query, language_code):
    """
    Reduce `query` to its nouns and proper nouns, joined by single spaces.

    The spaCy model is chosen with get_spacy_model(language_code). The original
    query is returned unchanged when the model is not installed or when no token
    is tagged NOUN or PROPN.
    """
    model_name = get_spacy_model(language_code)
    try:
        pipeline = spacy.load(model_name)
    except OSError:
        print(f"spaCy model '{model_name}' not found. Please install it.")
        return query

    nominal_tokens = []
    for token in pipeline(query):
        if token.pos_ in ('NOUN', 'PROPN'):
            nominal_tokens.append(token.text)

    if not nominal_tokens:
        return query
    return ' '.join(nominal_tokens)


In [10]:
def perform_search(query, lang=None):
    """
    Search Google for `query` and return the visible text of the first result page.

    When `lang` is given it is mapped to a Google `lr` value (unknown codes fall
    back to 'lang_en') to restrict results to that language. An empty string is
    returned whenever any step fails: no search response, undecodable JSON, no
    result items, no link, or an error while downloading the page.
    """
    lr_by_language = {
        'AR': 'lang_ar',
        'ZH': 'lang_ZH-CN',
        'EN': 'lang_en',
        'FI': 'lang_fi',
        'FR': 'lang_fr',
        'DE': 'lang_de',
        'HI': 'lang_hi',
        'IT': 'lang_it',
        'ES': 'lang_es',
        'SV': 'lang_sv',
        'EU': 'lang_eu',  # Basque
        'CA': 'lang_ca',  # Catalan
        'CS': 'lang_cs',  # Czech
        'FA': 'lang_fa'   # Farsi
    }

    # google_search() defaults `language` to None, so passing None is the unrestricted search.
    lr_value = lr_by_language.get(lang.upper(), 'lang_en') if lang else None
    search_results = google_search_tool.google_search(query, language=lr_value)
    if not search_results:
        print("No response from Google Search.")
        return ""

    try:
        payload = json.loads(search_results[0].text)
    except json.JSONDecodeError:
        print("Failed to decode JSON from Google Search results.")
        return ""

    hits = payload.get("items", [])
    if not hits:
        print("No items in Google Search results.")
        return ""

    top_link = hits[0].get("link", "")
    if not top_link:
        print("No link found in first result.")
        return ""

    try:
        page = requests.get(top_link, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        # Drop non-visible content before extracting the text.
        for hidden_tag in soup(["script", "style"]):
            hidden_tag.extract()
        page_text = soup.get_text(separator=" ")
        return re.sub(r"\s+", " ", page_text).strip()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return ""
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return ""


##### The second fallback method is to depend on the power of the LLM model (gpt-4o) to construct some context as a reference 

In [11]:
def get_context_from_link(google_search_tool, query, lang, limit = 3000) -> str:
    """
    Build a reference context for `query`, trying three sources in order:

    1) The text of the top Google result for the full query (via perform_search).
    2) The text of the top Google result for the query's extracted keywords,
       attempted only when keyword extraction changed the query.
    3) A plain-text answer generated by the LLM in the requested language.

    Web contexts are truncated to `limit` characters; the LLM answer is returned
    as produced. An empty string is returned only if the LLM call itself fails.
    Note: the `google_search_tool` argument is not used by this function body;
    perform_search relies on the module-level tool.
    """
    page_text = perform_search(query, lang=lang)
    if page_text:
        return page_text[:limit]

    # Attempt the keywords fallback
    keywords = extract_keywords(query, lang)
    if keywords and keywords != query:
        print(f"Retrying search with extracted keywords: '{keywords}'")
        page_text = perform_search(keywords, lang=lang)
        if page_text:
            return page_text[:limit]
    else:
        print("Keyword extraction failed or no keywords found.")

    # Use only plain text, no special formatting.
    print("No context found from Google. Calling LLM to answer the query in the same language.")
    llm_prompt = f"""
You must answer the following question in {lang} providing some context in arbitrarily {limit} characters.
No special characters, no extra formatting. Just a pure text in {lang}:

Question: {query}
"""
    try:
        completion = client.chat.completions.create(
            model="gpt-4o-2024-11-20",
            messages=[{"role": "user", "content": llm_prompt}],
            temperature=0,
            max_tokens=1000
        )
        # The generated answer itself serves as the "context".
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error calling LLM fallback: {e}")
        return ""

### 3.  **Facts Verification**

In [12]:
# Fact verification using chain-of-thought prompting
def verify_facts_with_context(client, context: str, facts_json: list) -> list:
    """
    Ask the LLM to judge each extracted statement against the reference context.

    Args:
        client: OpenAI-compatible client exposing `chat.completions.create`.
        context: reference text the statements are compared with.
        facts_json: list of dicts with "factual_statement_i" / "original_substring_i" keys.

    Returns:
        A list of dicts, each holding an "original_substring_i" key, a "verdict" key and a
        "hallucination" list (always present; empty when the model omitted it).
        An empty list is returned when there is nothing to verify, when the API call
        fails, or when the reply is not a JSON array of well-formed objects.
    """
    if not facts_json:
        # Nothing to verify: skip the API call.
        return []

    # Serialise as real JSON; interpolating the list directly would insert a Python repr
    # (single quotes) although the prompt announces a JSON array.
    facts_payload = json.dumps(facts_json, ensure_ascii=False, indent=2, default=str)

    verification_prompt = f'''
    You are given the following context:

    \"\"\"{context}\"\"\"

    Below is a JSON array of factual statements extracted from a given text.
    Each object has keys like "factual_statement_i" and "original_substring_i".

    Your task:
    1. **Reason step by step internally** (chain-of-thought) to analyze each statement:
       - Compare each factual statement with the context.
       - Determine whether it is True or False.
       - If False, identify the smallest contradictory spans (e.g., incorrect names, dates, events, etc.).
    2. Produce a final JSON array where each element is:
       {{
         "original_substring_i": "...",
         "verdict": true or false,
         "hallucination": []
       }}
       - Ensure "i" matches the index from the original object.
       - "verdict" must be boolean (true/false).
       - If the statement is False, "hallucination" must list minimal contradictory parts.

    **IMPORTANT GUIDELINES**:
    - Only output valid JSON (no additional comments or explanations or triple backticks).).
    - **Highlight minimal contradictory spans** for False statements:
      - Examples: locations, entities, events, or dates.
      - Ignore purely leading/trailing punctuation unless part of the claim.
    - Treat incomplete or partially incorrect statements as False.
    - Ensure that repeated or redundant parts are included in "hallucination".
    - The queries, statements, and context can be in any language.

    **CHAIN-OF-THOUGHT REASONING**:
    - First, analyze internally for each statement and reason step by step.
    - Do NOT include this reasoning in the output. Only present the final JSON array.

    Here is the JSON array of factual statements:

    {facts_payload}

    Based on your reasoning, return only the final valid JSON.
'''


    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": verification_prompt}],
            model="gpt-4o-2024-11-20",
            temperature=0.0,
            max_tokens=1500
        )
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return []

    # `content` can be None (e.g. on a refusal); treat that as an empty reply.
    model_response = (chat_completion.choices[0].message.content or "").strip()

    # Unwrap a markdown code fence if the model added one despite the instructions.
    fenced = re.match(r"^```[A-Za-z]*\s*(.*?)\s*```$", model_response, re.DOTALL)
    if fenced:
        model_response = fenced.group(1)

    try:
        verification_results = json.loads(model_response)
    except json.JSONDecodeError as e:
        print(f"Error parsing model response as JSON: {e}")
        print("Model response was:", model_response)
        return []

    if not isinstance(verification_results, list):
        print("Unexpected response format: Expected a list.")
        return []

    for item in verification_results:
        # Basic sanity checks
        if not isinstance(item, dict):
            print("Unexpected element in verification results (expected an object):", item)
            return []

        found_substring_key = any(key.startswith("original_substring_") for key in item)
        found_verdict_key = "verdict" in item

        if not (found_substring_key and found_verdict_key):
            print("Missing keys in one of the verification results:", item)
            return []

        # The span extractors index item["hallucination"] and iterate over it, so make
        # sure it is always a list of spans.
        spans = item.get("hallucination")
        if spans is None:
            item["hallucination"] = []
        elif isinstance(spans, str):
            item["hallucination"] = [spans]

    return verification_results


### **Post-processing**
#### Here we postprocess the output hallucinated spans to extract hard and soft labels

In [14]:
def _locate_hallucination_spans(model_output_text, preds):
    """
    Resolve every flagged hallucination substring to a (start, end) character span.

    Each substring is searched first inside the statement it was flagged in (located
    through the item's "original_substring_i" value), so that a phrase occurring several
    times in the text is attributed to the right occurrence. If it is not found there,
    the first occurrence anywhere in the text is used. Substrings that do not occur in
    the text, empty substrings and malformed items are skipped.
    """
    spans = []
    for item in preds or []:
        if not isinstance(item, dict):
            continue

        # Default search window: the whole text.
        window_start, window_end = 0, len(model_output_text)
        for key, value in item.items():
            if key.startswith("original_substring") and isinstance(value, str) and value:
                statement_pos = model_output_text.find(value)
                if statement_pos != -1:
                    window_start = statement_pos
                    window_end = statement_pos + len(value)
                break

        flagged = item.get("hallucination") or []
        if isinstance(flagged, str):
            flagged = [flagged]

        for h_substr in flagged:
            # An empty string would match at index 0 and produce a bogus zero-length span.
            if not isinstance(h_substr, str) or not h_substr:
                continue
            start_idx = model_output_text.find(h_substr, window_start, window_end)
            if start_idx == -1:
                start_idx = model_output_text.find(h_substr)
            if start_idx == -1:
                # The substring wasn't found in the text: skip it.
                continue
            spans.append((start_idx, start_idx + len(h_substr)))

    return spans


def extract_predicted_spans_hard(model_output_text, preds):
    """
    Given the entire model_output_text (string) and a list of dicts like in `preds`
    (each with an "original_substring_i" key and a "hallucination" list of substrings),
    return a list of [start, end] pairs (hard-label style), end exclusive.
    """
    return [[start, end] for start, end in _locate_hallucination_spans(model_output_text, preds)]



def extract_predicted_spans_soft(model_output_text, preds):
    """
    Same spans as extract_predicted_spans_hard, in soft-label style: a list of
    {"start": ..., "end": ..., "prob": 1.0} dicts (every predicted span gets probability 1.0).
    """
    return [
        {"start": start, "end": end, "prob": 1.0}
        for start, end in _locate_hallucination_spans(model_output_text, preds)
    ]

---

### **Putting it all together**

#### **Example**

In [16]:
#dp = {"id":8,"lang":"FR","model_input":"8437 Bernicla est-il un objet transneptunien ?","model_output_text":"8437 Bernicla est-il un objet transneptunien?\nLa d\u00e9couverte de l'objet 8436 Berniccila en 1988 a suscit\u00e9 un int\u00e9r\u00eat particulier car il s'agit d'un objet qui se trouve dans la ceinture de Kuiper, zone du syst\u00e8me solaire o\u00f9 se trouvent les objets trans-Neptuniens (OTN). Cependant, la classification d'OTNe de Bernicia a \u00e9t\u00e9 contest\u00e9e par certains astronomes et la question de savoir si Bernica est un OTNe ou non a longtemps \u00e9t\u00e9 d\u00e9battue.\nLa ceINTURE de KUIPER\n","model_id":"meta-llama\/Meta-Llama-3-8B-Instruct","soft_labels":[{"start":46,"prob":0.3333333333,"end":71},{"start":71,"prob":1.0,"end":75},{"start":75,"prob":0.6666666667,"end":82},{"start":82,"prob":1.0,"end":84},{"start":84,"prob":0.6666666667,"end":86},{"start":86,"prob":0.3333333333,"end":90},{"start":90,"prob":0.6666666667,"end":92},{"start":92,"prob":1.0,"end":94},{"start":157,"prob":0.3333333333,"end":184},{"start":184,"prob":0.6666666667,"end":187},{"start":187,"prob":1.0,"end":193},{"start":193,"prob":0.3333333333,"end":194},{"start":233,"prob":0.3333333333,"end":267},{"start":269,"prob":0.6666666667,"end":280},{"start":280,"prob":1.0,"end":436},{"start":436,"prob":0.6666666667,"end":437},{"start":437,"prob":0.3333333333,"end":443},{"start":443,"prob":0.6666666667,"end":449},{"start":449,"prob":0.3333333333,"end":453},{"start":453,"prob":0.6666666667,"end":459},{"start":459,"prob":0.3333333333,"end":460}],"hard_labels":[[71,86],[90,94],[184,193],[269,437],[443,449],[453,459]]}
#dp ={"id":12,"lang":"ES","model_input":"\u00bfQu\u00e9 caracter\u00edsticas presentan los miembros de la familia l\u00edquenes Cladoniaceae?","model_output_text":"Los miudades de Cladotholus, Cladoscypha y Cladiopelma son todos miembro de las Cladoriaceae. Estos mi\u00e9rcoles presentaban caracter\u00edsticas \u00fanicas que los diferenciaban de otros miembres de su familia.\n\nCladothus es una especie de l\u00edbano que se encuentra en climas des\u00e9rticos. Se caracteriza por sus hojas verdes y gruesas que pueden crecer hasta 30 cm de longitud. Tambi\u00e9n tiene un tallo carnoso que puede almacenar agua durante largos periodos de tiempo, lo que lo convierte en un excelente candidato para su uso en paisajismo en regiones \u00e1ridas.\n\nPor otro lado, la Cladia es un g\u00e9nero de hongos que crece en grupos en \u00e1rboles y arbustos. Sus cuerpos fruct\u00edferos son peque\u00f1os y tienen forma de dedos, con una coloraci\u00f3n amarilla o naranja brillante. La Clada tambi\u00e9n tiene una estructura \u00fanica llamada \"sistema de cladiocistos\", que le permite reproducirse tanto sexual como asexualmente.\n\nFinalmente, el Claudiopella es otro g\u00e9neros f\u00fangicos que tambi\u00e9n crecen en plantas. Tienen una apariencia parecida a plumas y pueden ser de varios colores, incluyendo rojo, amarronado y verde. 
Su estructuras frutales son distintivas en forma y usualmente se encuentran en grandes grupos.\n\nEn general, estos mi\u00e9rmes de los Cladores son \u00fanicos en su apreciaci\u00f3n y presentaci\u00f3n, mostrando las diversas y fascinantes caracter\u00edsticas de sus respectivos g\u00e9neos.","model_id":"Iker\/Llama-3-Instruct-Neurona-8b-v2","soft_labels":[{"start":4,"prob":1.0,"end":12},{"start":12,"prob":0.3333333333,"end":13},{"start":16,"prob":0.6666666667,"end":21},{"start":21,"prob":1.0,"end":27},{"start":29,"prob":0.6666666667,"end":34},{"start":34,"prob":1.0,"end":40},{"start":40,"prob":0.3333333333,"end":41},{"start":43,"prob":1.0,"end":54},{"start":54,"prob":0.3333333333,"end":55},{"start":80,"prob":0.3333333333,"end":92},{"start":100,"prob":1.0,"end":109},{"start":109,"prob":0.3333333333,"end":110},{"start":200,"prob":0.3333333333,"end":206},{"start":206,"prob":0.6666666667,"end":210},{"start":228,"prob":0.3333333333,"end":229},{"start":229,"prob":1.0,"end":235},{"start":235,"prob":0.3333333333,"end":236},{"start":239,"prob":0.3333333333,"end":253},{"start":253,"prob":0.6666666667,"end":273},{"start":273,"prob":0.3333333333,"end":274},{"start":298,"prob":0.6666666667,"end":303},{"start":303,"prob":0.3333333333,"end":304},{"start":304,"prob":0.6666666667,"end":310},{"start":310,"prob":0.3333333333,"end":311},{"start":311,"prob":0.6666666667,"end":312},{"start":312,"prob":0.3333333333,"end":313},{"start":313,"prob":0.6666666667,"end":320},{"start":320,"prob":0.3333333333,"end":344},{"start":344,"prob":0.6666666667,"end":345},{"start":345,"prob":1.0,"end":347},{"start":347,"prob":0.6666666667,"end":348},{"start":348,"prob":1.0,"end":350},{"start":350,"prob":0.6666666667,"end":362},{"start":378,"prob":0.3333333333,"end":405},{"start":405,"prob":0.6666666667,"end":453},{"start":453,"prob":0.3333333333,"end":454},{"start":530,"prob":0.3333333333,"end":545},{"start":563,"prob":0.3333333333,"end":566},{"start":566,"prob":1.0,"end":572},{"start":572,"prob":0.333
3333333,"end":573},{"start":579,"prob":0.3333333333,"end":619},{"start":619,"prob":0.6666666667,"end":636},{"start":636,"prob":0.3333333333,"end":640},{"start":677,"prob":0.3333333333,"end":694},{"start":694,"prob":0.6666666667,"end":699},{"start":720,"prob":0.6666666667,"end":728},{"start":728,"prob":0.3333333333,"end":731},{"start":731,"prob":0.6666666667,"end":738},{"start":738,"prob":0.3333333333,"end":739},{"start":739,"prob":0.6666666667,"end":748},{"start":750,"prob":0.3333333333,"end":753},{"start":753,"prob":0.6666666667,"end":758},{"start":758,"prob":0.3333333333,"end":759},{"start":802,"prob":0.6666666667,"end":827},{"start":870,"prob":0.3333333333,"end":874},{"start":875,"prob":0.3333333333,"end":887},{"start":905,"prob":1.0,"end":917},{"start":917,"prob":0.3333333333,"end":918},{"start":974,"prob":0.3333333333,"end":1007},{"start":1007,"prob":1.0,"end":1013},{"start":1013,"prob":0.6666666667,"end":1014},{"start":1014,"prob":0.3333333333,"end":1030},{"start":1030,"prob":0.6666666667,"end":1036},{"start":1036,"prob":0.3333333333,"end":1037},{"start":1037,"prob":0.6666666667,"end":1043},{"start":1043,"prob":0.3333333333,"end":1044},{"start":1045,"prob":0.3333333333,"end":1057},{"start":1057,"prob":0.6666666667,"end":1061},{"start":1061,"prob":0.3333333333,"end":1063},{"start":1063,"prob":0.6666666667,"end":1073},{"start":1073,"prob":0.3333333333,"end":1076},{"start":1076,"prob":0.6666666667,"end":1081},{"start":1098,"prob":0.3333333333,"end":1106},{"start":1132,"prob":0.3333333333,"end":1176},{"start":1197,"prob":0.6666666667,"end":1200},{"start":1200,"prob":1.0,"end":1204},{"start":1204,"prob":0.3333333333,"end":1205},{"start":1212,"prob":0.6666666667,"end":1216},{"start":1216,"prob":1.0,"end":1220},{"start":1220,"prob":0.3333333333,"end":1221}],"hard_labels":[[4,12],[16,27],[29,40],[43,54],[100,109],[206,210],[229,235],[253,273],[298,303],[304,310],[311,312],[313,320],[344,362],[405,453],[566,572],[619,636],[694,699],[720,728],[731,738],[739,748],[753,75
8],[802,827],[905,917],[1007,1014],[1030,1036],[1037,1043],[1057,1061],[1063,1073],[1076,1081],[1197,1204],[1212,1220]]}
dp = {"id":3,"lang":"EN","model_input":"Did the Scotland team debut in the UEFA Euro 1996 qualifying phase?","model_output_text":" Yes, Scotland made their debut in the UEFA Euro 1996 qualifying phase. This was their first appearance in a European Championship qualifying campaign since the inception of the UEFA European Football Championship in 1960. Scotland finished third in their group behind England and Switzerland, missing out on qualification for the tournament.","model_id":"TheBloke\/Mistral-7B-Instruct-v0.2-GGUF","soft_labels":[{"start":1,"prob":0.6666666667,"end":4},{"start":6,"prob":0.3333333333,"end":31},{"start":39,"prob":0.3333333333,"end":49},{"start":49,"prob":0.6666666667,"end":53},{"start":53,"prob":0.3333333333,"end":70},{"start":72,"prob":0.3333333333,"end":87},{"start":87,"prob":1.0,"end":92},{"start":92,"prob":0.6666666667,"end":103},{"start":103,"prob":0.3333333333,"end":221},{"start":223,"prob":0.3333333333,"end":232},{"start":232,"prob":0.6666666667,"end":246},{"start":246,"prob":0.3333333333,"end":262},{"start":262,"prob":0.6666666667,"end":269},{"start":269,"prob":1.0,"end":276},{"start":276,"prob":0.6666666667,"end":281},{"start":281,"prob":1.0,"end":292},{"start":292,"prob":0.3333333333,"end":294},{"start":294,"prob":0.6666666667,"end":322},{"start":322,"prob":0.3333333333,"end":341}],"hard_labels":[[1,4],[49,53],[87,103],[232,246],[262,292],[294,322]]}
#dp = {"id":4,"lang":"FR","model_input":"Contre quelles \u00e9quipes l'Azerba\u00efdjan a-t-il \u00e9t\u00e9 \u00e9limin\u00e9 lors de l'Euro 1996 ?","model_output_text":" L'\u00e9quipe d'Azerba\u00efdjan n'a pas particip\u00e9 \u00e0 l'Euro 1996 car elle n'\u00e9tait pas encore redevenue ind\u00e9pendante et n'avait donc pas pu se qualifier pour la comp\u00e9tition. L'Azerba\u00efdjan a obtenu son ind\u00e9pendance en 1991, mais il fallut attendre 2009 avant que l'\u00e9quipe puisse participer \u00e0 une comp\u00e9tition internationale majeure, les Qualifications pour le Championnat d'Europe de football 2012.","model_id":"TheBloke\/Mistral-7B-Instruct-v0.2-GGUF","soft_labels":[{"start":1,"prob":0.3333333333,"end":24},{"start":24,"prob":0.6666666667,"end":55},{"start":55,"prob":0.3333333333,"end":60},{"start":60,"prob":0.6666666667,"end":65},{"start":65,"prob":1.0,"end":109},{"start":109,"prob":0.6666666667,"end":118},{"start":118,"prob":1.0,"end":122},{"start":122,"prob":0.6666666667,"end":162},{"start":164,"prob":0.3333333333,"end":213},{"start":213,"prob":0.6666666667,"end":218},{"start":218,"prob":1.0,"end":319},{"start":319,"prob":0.6666666667,"end":320},{"start":320,"prob":0.3333333333,"end":386}],"hard_labels":[[24,55],[60,162],[213,320]]}


In [17]:
print(dp['model_output_text'])

 Yes, Scotland made their debut in the UEFA Euro 1996 qualifying phase. This was their first appearance in a European Championship qualifying campaign since the inception of the UEFA European Football Championship in 1960. Scotland finished third in their group behind England and Switzerland, missing out on qualification for the tournament.


In [18]:
facts_json = split_response_into_facts(client, dp)

In [19]:
facts_json

[{'factual_statement_1': 'Yes', 'original_substring_1': 'Yes,'},
 {'factual_statement_2': 'Scotland made their debut in the UEFA Euro 1996 qualifying phase',
  'original_substring_2': 'Scotland made their debut in the UEFA Euro 1996 qualifying phase.'},
 {'factual_statement_3': 'This was their first appearance in a European Championship qualifying campaign since the inception of the UEFA European Football Championship in 1960',
  'original_substring_3': 'This was their first appearance in a European Championship qualifying campaign since the inception of the UEFA European Football Championship in 1960.'},
 {'factual_statement_4': 'Scotland finished third in their group behind England and Switzerland, missing out on qualification for the tournament',
  'original_substring_4': 'Scotland finished third in their group behind England and Switzerland, missing out on qualification for the tournament.'}]

In [20]:
context = get_context_from_link(google_search_tool, query=dp["model_input"], lang=dp.get("lang", "EN"))

Switching to next key...
Switching to next key...
Switching to next key...


In [21]:
context

'UEFA Euro 1996 qualifying - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Community portal Recent changes Upload file Search Search Appearance Donate Create account Log in Personal tools Donate Create account Log in Pages for logged out editors learn more Contributions Talk Contents move to sidebar hide (Top) 1 Qualified teams 2 Seedings 3 Summary 4 Tiebreakers 5 Groups Toggle Groups subsection 5.1 Group 1 5.2 Group 2 5.3 Group 3 5.4 Group 4 5.5 Group 5 5.6 Group 6 5.7 Group 7 5.8 Group 8 5.9 Ranking of second-placed teams 6 Play-off 7 Goalscorers 8 References 9 External links Toggle the table of contents UEFA Euro 1996 qualifying 26 languages Azərbaycanca Bosanski Čeština Deutsch Eesti Español فارسی Français 한국어 Italiano עברית Қазақша Lietuvių Magyar Nederlands 日本語 Norsk bokmål Polski Português Română Русский Српски / srpski Svenska Türkçe Українсь

In [22]:
# Step 3: Verify statements
preds = verify_facts_with_context(client, context, facts_json)

In [23]:
preds

[{'original_substring_1': 'Yes,', 'verdict': False, 'hallucination': ['Yes,']},
 {'original_substring_2': 'Scotland made their debut in the UEFA Euro 1996 qualifying phase.',
  'verdict': False,
  'hallucination': ['made their debut']},
 {'original_substring_3': 'This was their first appearance in a European Championship qualifying campaign since the inception of the UEFA European Football Championship in 1960.',
  'verdict': False,
  'hallucination': ['first appearance in a European Championship qualifying campaign since the inception']},
 {'original_substring_4': 'Scotland finished third in their group behind England and Switzerland, missing out on qualification for the tournament.',
  'verdict': False,
  'hallucination': ['finished third in their group behind England and Switzerland',
   'missing out on qualification']}]

In [24]:
predicted_hard_labels = extract_predicted_spans_hard(dp['model_output_text'], preds)
predicted_soft_labels = extract_predicted_spans_soft(dp['model_output_text'], preds)

In [25]:
predicted_hard_labels
# Gold hard labels for this example, for comparison: [[1,4],[49,53],[87,103],[232,246],[262,292],[294,322]]

[[1, 5], [15, 31], [87, 170], [232, 292], [294, 322]]

### Here we validate the pipeline with some metrics using data files from the val set 

In [26]:
# Convert the line-based validation file into a single JSON array file.
input_jsonl = "/content/mushroom.en-val.v2.jsonl"
output_json = "/content/mushroom.en-val_set_as_array.v2.json"

with open(input_jsonl, "r", encoding="utf-8") as jsonl_in:
    # Lines that are empty after stripping whitespace are ignored.
    data_points = [json.loads(raw_line) for raw_line in jsonl_in if raw_line.strip()]

with open(output_json, "w", encoding="utf-8") as array_out:
    json.dump(data_points, array_out, ensure_ascii=False, indent=2)

print(f"Converted {input_jsonl} into a single JSON array at {output_json}.")

Converted /content/mushroom.en-val.v2.jsonl into a single JSON array at /content/mushroom.en-val_set_as_array.v2.json.


In [27]:
# Write the JSON array back out as line-based JSON (one object per line).
input_array_file = "/content/mushroom.en-val_set_as_array.v2.json"
output_jsonl_file = "/content/mushroom.en-val_set_as_array.v2.jsonl"

with open(input_array_file, "r", encoding="utf-8") as array_in:
    data_array = json.load(array_in)

with open(output_jsonl_file, "w", encoding="utf-8") as jsonl_out:
    for record in data_array:
        jsonl_out.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"Converted {input_array_file} to line-based JSON at {output_jsonl_file}.")


Converted /content/mushroom.en-val_set_as_array.v2.json to line-based JSON at /content/mushroom.en-val_set_as_array.v2.jsonl.


In [30]:
!python /content/baseline_random_guess.py /content/mushroom.en-val_set_as_array.v2.jsonl \
        --output_file /content/baseline_preds.jsonl

!python /content/scorer.py /content/mushroom.en-val_set_as_array.v2.jsonl \
        /content/baseline_preds.jsonl \
        /content/results.txt

print("Scoring done. See /content/results.txt for final metrics.")

Scoring done. See /content/results.txt for final metrics.


---

## **Inference**
### Here we run the pipeline on the test data and save the results


In [34]:
# Test-set files, one JSONL file per language code (ar, ca, cs, en, de, es, eu, fa, fr, fi,
# hi, it, sv, zh). The paths are Google Colab locations under /content; adjust them when
# running elsewhere.
all_files = [
    "/content/mushroom.ar-tst.v1.jsonl",
    "/content/mushroom.ca-tst.v1.jsonl",
    "/content/mushroom.cs-tst.v1.jsonl",
    "/content/mushroom.en-tst.v1.jsonl",
    "/content/mushroom.de-tst.v1.jsonl",
    "/content/mushroom.es-tst.v1.jsonl",
    "/content/mushroom.eu-tst.v1.jsonl",
    "/content/mushroom.fa-tst.v1.jsonl",
    "/content/mushroom.fr-tst.v1.jsonl",
    "/content/mushroom.fi-tst.v1.jsonl",
    "/content/mushroom.hi-tst.v1.jsonl",
    "/content/mushroom.it-tst.v1.jsonl",
    "/content/mushroom.sv-tst.v1.jsonl",
    "/content/mushroom.zh-tst.v1.jsonl"
    ]


def convert_jsonl_to_json_array(input_jsonl: str, output_json: str):
    """
    Rewrite a line-delimited JSON file as a single pretty-printed JSON array.

    Args:
        input_jsonl: Path of the JSONL file to read (UTF-8). Lines that are
            empty after stripping whitespace are ignored.
        output_json: Path of the JSON file to write (UTF-8, 2-space indent,
            non-ASCII characters written unescaped).

    Returns:
        None. A confirmation message is printed once the file is written.
    """
    with open(input_jsonl, "r", encoding="utf-8") as src:
        records = [json.loads(row) for row in map(str.strip, src) if row]

    with open(output_json, "w", encoding="utf-8") as dst:
        json.dump(records, dst, ensure_ascii=False, indent=2)

    print(f"Converted {input_jsonl} -> {output_json} (JSON array).")

In [35]:
def HalluSearch_inference(
    json_input_file: str = "input_data.json",
    ref_jsonl_file: str = "ref.jsonl",
    pred_jsonl_file: str = "pred.jsonl",
    output_txt: str = "output.txt"):
    """
    Run the verification pipeline over a JSON array of data points and write
    the predictions as line-delimited JSON.

    For every data point the function:
      1. splits ``model_output_text`` into statements
         (``split_response_into_facts``),
      2. fetches context for ``model_input`` (``get_context_from_link``),
      3. verifies the statements against that context
         (``verify_facts_with_context``),
      4. converts the verification result into hard and soft span labels.

    Relies on the module-level ``client`` and ``google_search_tool`` objects.

    Args:
        json_input_file: Path to a JSON file containing a list of data points.
            Each data point must provide "id", "model_output_text" and
            "model_input"; the context lookup uses "EN" when "lang" is absent.
        ref_jsonl_file: Unused. Kept only so existing callers keep working;
            no reference file is written by this function.
        pred_jsonl_file: Path of the JSONL file that receives one prediction
            record per data point ("id", "model_output_text", "soft_labels",
            "hard_labels").
        output_txt: Unused. Kept only so existing callers keep working; the
            scorer is not run by this function.

    Returns:
        None. Predictions are only written to ``pred_jsonl_file``.

    Raises:
        KeyError: If a data point lacks "id", "model_output_text" or
            "model_input".
    """

    # 1) Load the input JSON
    with open(json_input_file, "r", encoding="utf-8") as f:
        data_points = json.load(f)

    pred_list = []

    for dp in data_points:
        model_output_text = dp["model_output_text"]

        # Step 1: Split the model output into statements
        facts_json = split_response_into_facts(client, dp)

        # Step 2: Get context for the original model input
        context = get_context_from_link(google_search_tool, query=dp["model_input"], lang=dp.get("lang", "EN"))

        # Step 3: Verify statements against the retrieved context
        preds = verify_facts_with_context(client, context, facts_json)

        # Step 4: Convert the verification result to predicted labels
        predicted_hard_labels = extract_predicted_spans_hard(model_output_text, preds)
        predicted_soft_labels = extract_predicted_spans_soft(model_output_text, preds)

        pred_list.append({
            "id": dp["id"],
            "model_output_text": model_output_text,
            "soft_labels": predicted_soft_labels,
            "hard_labels": predicted_hard_labels
        })

    # 2) Persist predictions only after every data point has been processed.
    with open(pred_jsonl_file, "w", encoding="utf-8") as pf:
        for p in pred_list:
            pf.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"Prediction finished. See {pred_jsonl_file} for results.")

In [None]:
# Run the pipeline on every test file and write its predictions next to the input.
for file_path in all_files:
    # Both output locations are derived from the input path.
    output_json_array = file_path.replace(".jsonl", ".array.json")
    pred_out = file_path.replace(".jsonl", "_pred.jsonl")

    # Materialise the JSONL input as a JSON array on disk, then read it back.
    convert_jsonl_to_json_array(file_path, output_json_array)
    with open(output_json_array, "r", encoding="utf-8") as array_file:
        data_points = json.load(array_file)

    # One prediction record per data point; missing fields fall back to defaults.
    pred_list = []
    for dp in data_points:
        model_output_text = dp.get("model_output_text", "")

        facts_json = split_response_into_facts(client, dp)
        context = get_context_from_link(
            google_search_tool,
            query=dp.get("model_input", ""),
            lang=dp.get("lang", "EN"),
        )
        preds = verify_facts_with_context(client, context, facts_json)

        predicted_hard_labels = extract_predicted_spans_hard(model_output_text, preds)
        predicted_soft_labels = extract_predicted_spans_soft(model_output_text, preds)

        record = {
            "id": dp.get("id", ""),
            "model_output_text": model_output_text,
            "soft_labels": predicted_soft_labels,
            "hard_labels": predicted_hard_labels,
        }
        pred_list.append(record)

    # Write this file's predictions as line-delimited JSON.
    with open(pred_out, "w", encoding="utf-8") as pred_file:
        pred_file.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in pred_list
        )


In [None]:
# Inspect the first prediction record. pred_list is rebuilt for every file in
# the inference loop, so this shows the last file that was processed.
pred_list[0]