<a href="https://colab.research.google.com/github/manya9155/Hallucination-Detector/blob/main/split_claims.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy

# Load the small English spaCy model (start with sm, upgrade to trf if you want better accuracy).
# extract_triples reads this notebook-level `nlp` object rather than taking it as an argument.
nlp = spacy.load("en_core_web_sm")

def _subtree_text(token):
    """Return the source text covered by ``token``'s dependency subtree."""
    # text_with_ws carries each token's own trailing whitespace, so the phrase
    # reads as it was written ("DiCaprio's film", not "DiCaprio 's film").
    return "".join(w.text_with_ws for w in token.subtree).strip()


def extract_triples(text):
    """Extract one (subject, verb, object) triple per ROOT token in ``text``.

    The text is parsed with the notebook-level ``nlp`` object. For every token
    whose dependency label is ``ROOT``:

    * the subject is the first left child labelled ``nsubj`` or ``nsubjpass``;
    * the object is the first right child labelled ``dobj``, ``attr`` or
      ``prep``;
    * both are expanded to the full text of their dependency subtree.

    Args:
        text: Raw text to parse.

    Returns:
        A list of ``(subject_text, root_text, object_text)`` tuples, in document
        order. ``subject_text`` / ``object_text`` are ``None`` when the root has
        no matching child.
    """
    doc = nlp(text)
    triples = []

    for token in doc:
        if token.dep_ != "ROOT":  # only the head of each parsed sentence
            continue

        subjects = [w for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
        objects = [w for w in token.rights if w.dep_ in ("dobj", "attr", "prep")]

        # Expand subject/object to full spans (not just the head token)
        subj_text = _subtree_text(subjects[0]) if subjects else None
        obj_text = _subtree_text(objects[0]) if objects else None

        triples.append((subj_text, token.text, obj_text))

    return triples

# -------------------------------
# Smoke test: run extract_triples on one sentence and print each triple on its own line.
# NOTE(review): the names in the sentence are misspelled and lower-cased
# ("Leonada di carpo", "nolan") — confirm this is deliberate noisy input and not a typo.
sentence = "Leonada di carpo won an Oscar for Inception which was directed by nolan"
triples = extract_triples(sentence)

print("Extracted Triples:")
for t in triples:
    print(t)


Extracted Triples:
('Leonada di carpo', 'won', 'an Oscar for Inception which was directed by nolan')


In [None]:
!pip install stanza
import stanza

# Download English models
stanza.download('en')


Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.10.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m31.7 MB/s[0m  [33m0:00:00[0m
[?25hDownloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m21.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [stanza]
[1A[2KSuccessfully installed emoji-2.14.1 stanza-1.10.1


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


In [None]:
import stanza

# Download the English models if they are not already present
stanza.download('en')

# Build the English stanza pipeline that extract_triples_atomic parses with.
# NOTE(review): this rebinds the notebook-level name `nlp`, which an earlier
# cell bound to the spaCy model read by extract_triples. Once this cell has run,
# extract_triples will be handed this stanza pipeline instead — confirm that is
# intended, or give the two pipelines distinct names.
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,pos,lemma,depparse,ner',  # pos and lemma are listed ahead of depparse
    use_gpu=False  # change to True if Colab GPU is enabled
)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| depparse  | combined_charlm           |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [None]:
def _phrase_from_ids(sent, ids):
    """Join the words of ``sent`` whose id is in ``ids``, in sentence order."""
    wanted = set(ids)
    return " ".join(w.text for w in sent.words if w.id in wanted)


def extract_triples_atomic(text):
    """Split ``text`` into small (subject, verb, phrase) triples.

    The text is parsed with the notebook-level ``nlp`` pipeline; each parsed
    sentence is expected to expose ``words`` carrying ``id``, ``head``,
    ``deprel`` and ``text``. For every root word and every subject
    (``nsubj`` / ``nsubj:pass``) attached to it:

    * each object (``obj`` / ``dobj`` / ``iobj``) yields one triple holding the
      object phrase *without* its prepositional modifiers, followed by one
      triple per prepositional modifier of that object;
    * when the root has no object at all, each oblique (``obl*``) attached
      directly to the root yields one triple.

    A prepositional modifier of an object is a direct child labelled ``nmod``
    (the Universal Dependencies label for a nominal attached to a noun) or
    ``obl``. Its whole subtree is moved out of the object phrase, so no
    preposition is left stranded there.

    Args:
        text: Raw text to parse.

    Returns:
        A list of ``(subject_phrase, verb_text, phrase)`` tuples. Every phrase
        is emitted in sentence word order. Roots without a subject contribute
        nothing.
    """
    doc = nlp(text)
    triples = []

    for sent in doc.sentences:
        roots = [w for w in sent.words if w.deprel in ('root', 'ROOT')]
        for verb in roots:
            dependents = [w for w in sent.words if w.head == verb.id]
            subj_tokens = [w for w in dependents if w.deprel in ('nsubj', 'nsubj:pass')]
            obj_tokens = [w for w in dependents if w.deprel in ('obj', 'dobj', 'iobj')]
            verb_obliques = [w for w in dependents if w.deprel.startswith('obl')]

            for subj in subj_tokens:
                # Computed once per subject instead of once per word in the sentence.
                subj_phrase = _phrase_from_ids(sent, get_subtree_ids(sent, subj))

                for obj in obj_tokens:
                    # Subtrees of the prepositional modifiers hanging off this object.
                    modifier_ids = [
                        set(get_subtree_ids(sent, child))
                        for child in sent.words
                        if child.head == obj.id and child.deprel in ('obl', 'nmod')
                    ]

                    # Main object: the object's subtree minus every modifier subtree.
                    core_ids = set(get_subtree_ids(sent, obj)).difference(*modifier_ids)
                    triples.append((subj_phrase, verb.text, _phrase_from_ids(sent, core_ids)))

                    # One extra triple per prepositional phrase attached to the object.
                    for ids in modifier_ids:
                        triples.append((subj_phrase, verb.text, _phrase_from_ids(sent, ids)))

                # If no direct object, add oblique/prep attached directly to verb
                if not obj_tokens:
                    for obl in verb_obliques:
                        obl_phrase = _phrase_from_ids(sent, get_subtree_ids(sent, obl))
                        triples.append((subj_phrase, verb.text, obl_phrase))
    return triples

def get_subtree_ids(sent, root):
    """Collect the ids of ``root`` and of every word that descends from it.

    ``sent.words`` is swept repeatedly; a sweep adds each word whose ``head`` is
    already collected. Sweeping stops after the first sweep that adds nothing.

    Returns:
        A list of ids starting with ``root.id``, followed by the descendants in
        the order the sweeps discovered them (which is not necessarily
        sentence order).
    """
    collected = [root.id]
    while True:
        size_before = len(collected)
        for word in sent.words:
            if word.id not in collected and word.head in collected:
                collected.append(word.id)
        if len(collected) == size_before:
            return collected

# -------------------------------
# Smoke test: run extract_triples_atomic on one sentence and print the raw list of triples.
sentence = "Leonardo DiCaprio won an Oscar for Inception in 2016"
triples = extract_triples_atomic(sentence)
print(triples)

[('Leonardo DiCaprio', 'won', 'an Oscar for Inception')]


In [None]:
!pip install openai




In [None]:
!pip install --upgrade openai




In [None]:
!pip install -q google-genai


In [None]:
import os
from getpass import getpass

# Never store the API key in the notebook: anything typed into a cell is saved
# with the file and published with the repository. Any key that was ever
# committed in this cell must be treated as leaked and revoked.
# The key is read from the environment when it is already set; otherwise it is
# requested interactively, without echoing and without being written to disk.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Gemini API key: ")


In [None]:
from google import genai

# Initialize the Gemini client used by extract_claims.
# No key is passed explicitly; presumably the client picks up GEMINI_API_KEY
# from the environment — verify against the google-genai documentation.
client = genai.Client()

def extract_claims(sentence):
    """Ask Gemini to split ``sentence`` into independent factual claims.

    The sentence is embedded in a fixed prompt and sent through the
    notebook-level ``client``. The reply is then reduced to bare claim strings:

    * blank lines are dropped and every line is stripped;
    * if any line starts with a list marker (``*``, ``-``, a bullet character,
      ``1.`` or ``1)``), only those lines are kept, with the marker removed, so
      an introduction such as "Here are the claims:" is not returned as a claim;
    * if no line carries a list marker, every non-blank line is a claim.

    Args:
        sentence: The sentence to decompose. It is interpolated into the prompt
            verbatim.

    Returns:
        A list of claim strings; empty when the model returned no text.
    """
    import re  # function-scope import: no cell in view imports re

    prompt = f"""
    You are a fact extraction assistant.
    Break the following sentence into independent factual claims:
    "{sentence}"
    Return one claim per line, with no introduction, numbering, or bullet points.
    """
    response = client.models.generate_content(
        model="gemini-2.5-flash",  # You can choose other models like "gemini-2.5-pro" if needed
        contents=prompt
    )

    # The reply may carry no text at all; treat that as "no claims" instead of
    # failing on None.strip().
    raw_text = response.text or ""
    lines = [line.strip() for line in raw_text.split("\n") if line.strip()]

    # The marker must be followed by whitespace, so "**bold**", "-5 degrees" or
    # "3.5 million" at the start of a claim are left untouched.
    list_marker = re.compile(r"^(?:[-*\u2022]+|\d+[.)])\s+")
    listed = [list_marker.sub("", line).strip() for line in lines if list_marker.match(line)]
    listed = [claim for claim in listed if claim]

    return listed if listed else lines

# Example usage: send one sentence through extract_claims (this calls the Gemini
# client) and print every returned item prefixed with "- ".
sentence = "Leonardo DiCaprio won an Oscar for Inception in 2016 directed by Nolan."
claims = extract_claims(sentence)
for claim in claims:
    print(f"- {claim}")


- Here are the independent factual claims from the sentence:
- *   Leonardo DiCaprio won an Oscar.
- *   Leonardo DiCaprio won the Oscar for Inception.
- *   Leonardo DiCaprio won the Oscar in 2016.
- *   Inception was directed by Nolan.
