<a href="https://colab.research.google.com/github/nolanwelch/ai-factcheck/blob/main/proof-of-concept/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install python-dotenv openai pydantic


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys

# The presence of `google.colab` in `sys.modules` is used as the signal that
# this notebook is running inside Colab.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
  # Colab: read the key from Colab's `userdata` store.
  from google.colab import userdata
  openai_token = userdata.get("OPENAI_API_KEY")
else:
  # Elsewhere: let python-dotenv populate the environment, then read the key
  # from the process environment.
  import os
  import dotenv
  dotenv.load_dotenv()
  openai_token = os.environ.get("OPENAI_API_KEY")

# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# and an empty value (e.g. `OPENAI_API_KEY=` in a .env file) is not None but is
# just as unusable as a missing one.
if not openai_token:
  raise RuntimeError("Must set the OPENAI_API_KEY environment variable")

In [None]:
from typing import Dict, List
import json
from pydantic import BaseModel, model_validator, RootModel

class KGMapping(BaseModel):
    """Lookup tables for the identifiers used by a knowledge graph.

    Both tables map a string ID to a string value; ``KnowledgeGraph`` checks
    the IDs used in the graph against the keys of these tables.
    """

    # Entity table, keyed by entity ID.
    entities: dict[str, str]
    # Relation table, keyed by relation ID.
    relations: dict[str, str]

class KG(RootModel[dict[str, dict[str, list[str]]]]):
    """
    Root model holding the graph's adjacency structure.

    The root value is a nested dictionary:
    entity ID -> relation ID -> list of target entity IDs.
    """
    def __getitem__(self, ent_id: str) -> dict[str, list[str]]:
        # Delegate subscripting to the wrapped dictionary so `kg_obj[ent_id]`
        # returns that entity's relation table.
        outgoing = self.root[ent_id]
        return outgoing

class KnowledgeGraph(BaseModel):
    """A graph together with the ID tables that describe it.

    Once the fields themselves have been validated, every entity ID and
    relation ID that appears in ``graph`` is cross-checked against ``mapping``.
    """

    mapping: KGMapping
    graph: KG

    @model_validator(mode="after")
    def _check_references(self) -> "KnowledgeGraph":
        """Reject graphs that mention IDs missing from ``mapping``.

        Walks the graph in iteration order and raises ``ValueError`` at the
        first unknown source entity, relation, or target entity encountered.
        Returns the model unchanged when every reference resolves.
        """
        known_entities = self.mapping.entities
        known_relations = self.mapping.relations

        for source_id, outgoing in self.graph.root.items():
            # The source entity must be declared.
            if source_id not in known_entities:
                raise ValueError(f"Unknown entity in graph: {source_id}")

            for relation_id, target_ids in outgoing.items():
                # The relation must be declared.
                if relation_id not in known_relations:
                    raise ValueError(f"Unknown relation in graph: {relation_id}")

                # First undeclared target, if any. Target IDs are strings, so
                # None is a safe "nothing found" sentinel.
                unknown_target = next(
                    (target for target in target_ids if target not in known_entities),
                    None,
                )
                if unknown_target is not None:
                    raise ValueError(f"Unknown target entity in graph: {unknown_target}")

        return self

def load_knowledge_graph(filepath: str) -> "KnowledgeGraph":
    """Read a JSON file and validate it as a ``KnowledgeGraph``.

    Args:
        filepath: Path to a JSON document containing the graph data.

    Returns:
        The result of ``KnowledgeGraph.model_validate`` on the parsed JSON.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        Any error raised by ``KnowledgeGraph.model_validate`` for data that
        does not satisfy the model.
    """
    # JSON is UTF-8 on disk; without an explicit encoding `open` falls back to
    # the platform locale, which mis-decodes non-ASCII names on some systems.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)
    # Validate after the file handle has been released.
    return KnowledgeGraph.model_validate(data)

# Load and validate the graph from `kg.json`; the relative path is resolved
# against the kernel's current working directory.
kg = load_knowledge_graph("kg.json")

mapping=KGMapping(entities={'ent_01': 'Barack Obama', 'ent_02': 'Michelle Obama', 'ent_03': 'Presidency of the United States'}, relations={'rel_01': 'spouseOf', 'rel_02': 'formerOfficeHolder'}) graph=KG(root={'ent_01': {'rel_01': ['ent_02'], 'rel_02': ['ent_03']}, 'ent_02': {'rel_01': ['ent_02']}})


In [3]:
from pydantic import BaseModel
from enum import Enum

class SemanticTriple(BaseModel):
  # One extracted triple. The field names are part of the structured-output
  # schema (via SemanticTripleList), so they are documented with comments
  # rather than renamed or given a docstring.
  entityA: str  # first entity of the triple (the subject in the extractor prompt's terms)
  relationship: str  # predicate linking entityA to entityB
  entityB: str  # second entity of the triple (the object)

class SemanticTripleList(BaseModel):
  # Wrapper holding every triple extracted from one input text; this is the
  # schema SemanticTripleExtractor passes as `response_format`.
  triples: list[SemanticTriple]

class EntailmentRelationship(Enum):
  # The three possible verdicts for a (sentence, assertion) pair.
  IMPLIES = "implies"  # sentence supports the assertion
  CONTRADICTS = "contradicts"  # sentence opposes the assertion
  NEUTRAL = "neutral"  # sentence neither supports nor opposes the assertion

class EntailmentResponse(BaseModel):
  # Schema EntailmentClassifier passes as `response_format`; it carries only
  # the verdict, with no free-text justification field.
  entailment_relationship: EntailmentRelationship

In [10]:
import openai
from dataclasses import dataclass
import time


@dataclass
class SemanticTripleExtractor:
    """Extracts semantic triples from text through an OpenAI structured-output call.

    Attributes:
        client: OpenAI client used to issue the request.

    Class-level settings (plain class attributes, not dataclass fields):
        GPT_MODEL: Model name sent with every request.
        SCHEMA: Pydantic model passed as ``response_format``.
        ERROR_RETRY_SLEEP: Base delay in seconds before the first retry; it is
            doubled on each subsequent retry.
        MAX_RETRIES: Number of retries allowed after the initial attempt
            before the last error is re-raised.
    """

    client: openai.OpenAI
    GPT_MODEL = "gpt-4o"
    SCHEMA = SemanticTripleList
    # Was 0.001 (1 ms), which is too short to outlast a rate-limit window.
    ERROR_RETRY_SLEEP = 1.0
    MAX_RETRIES = 5

    def get_semantic_triples(self, text: str):
        """Ask the model for the semantic triples contained in ``text``.

        Args:
            text: Natural-language input sent as the user message.

        Returns:
            The parsed response, an instance of ``SCHEMA``.

        Raises:
            ValueError: If the API returns no parsed payload.
            Exception: The last request error once ``MAX_RETRIES`` is exhausted.
        """
        # NOTE(review): the prompt asks for plain-text output and a justification,
        # but SCHEMA only has a `triples` field - confirm whether the prompt or the
        # schema should change. The prompt text is left untouched here.
        system_prompt = """
        You are a highly capable natural language processing model tasked with extracting **semantic triples** from text. A semantic triple represents a structured relationship in the form **(subject, predicate, object)**. Your primary goals are to ensure that the extracted triples are:

        1. **Parsable**: Use concise, standardized predicates that capture the semantic meaning of the relationship.
        2. **Semantic-Rich**: Avoid vague predicates like "is" or "was" and replace them with more specific, semantically meaningful terms.
        3. **Normalized**: Coerce relationships into forms that align with commonly used knowledge graphs and ontologies.

        ---

        ### Instructions:

        1. **Triple Structure**:
          - Each triple must be in the format: **(subject, predicate, object)**.
          - **Subject**: The entity or concept being described.
          - **Predicate**: A semantically meaningful relationship (use standardized terms wherever possible).
          - **Object**: The entity or concept associated with the subject via the predicate.

        2. **Predicate Coercion**:
          - Avoid using generic copular verbs like "is" or "was" as the predicate. Instead, infer a more meaningful relationship based on the context:
            - **Nationality**: Replace "is/was [nationality]" with predicates like **hasNationality**.
              - Example: "Alexander the Great was British."
              - Output: **(Alexander the Great, hasNationality, British)**
            - **Role or Occupation**: Replace "is/was [role]" with predicates like **hasRole** or **profession**.
              - Example: "Thomas Edison was the inventor of the lightbulb."
              - Output: **(Thomas Edison, inventorOf, lightbulb)**
            - **Properties**: Replace "is/was [property]" with predicates like **hasProperty** or **characteristicOf**.
              - Example: "The Eiffel Tower is tall."
              - Output: **(The Eiffel Tower, hasProperty, tall)**
            - **Location**: Replace "is/was in [location]" with predicates like **locatedIn**.
              - Example: "The Great Wall of China is in China."
              - Output: **(The Great Wall of China, locatedIn, China)**
            - **Other Actions**: Use action-oriented predicates like **invented**, **discovered**, or **authored**, when the sentence provides such explicit relationships.

        3. **Normalization**:
          - Use camelCase for predicates to ensure consistency and ease of parsing.
          - Avoid overly specific predicates that are difficult to generalize or parse.
            - Example: Instead of "was the first person to discover," simplify to **discovered**.

        4. **Negated Relationships**:
          - For negated relationships, transform the predicate to its positive form.
            - Example: "Alexander the Great was not British."
            - Output: **(Alexander the Great, hasNationality, British)**
          - Add a brief justification for why the negation was ignored.

        5. **Context Awareness**:
          - Handle implied or indirect relationships by inferring logical triples.
          - Example:
            - Sentence: "The capital of France is Paris."
            - Output: **(France, hasCapital, Paris)**

        6. **Output Format**:
          - Return triples as plain text, formatted as **(subject, predicate, object)**.
          - Each triple must be on a new line.

        ---

        ### Examples:

        #### Example 1:
        - **Input**: "Thomas Edison was the inventor of the lightbulb."
        - **Output**: **(Thomas Edison, inventorOf, lightbulb)**

        #### Example 2:
        - **Input**: "The Eiffel Tower is in Paris."
        - **Output**: **(The Eiffel Tower, locatedIn, Paris)**

        #### Example 3:
        - **Input**: "Marie Curie discovered radium."
        - **Output**: **(Marie Curie, discovered, radium)**

        #### Example 4:
        - **Input**: "Alexander the Great was not British."
        - **Output**: **(Alexander the Great, hasNationality, British)**

        #### Example 5:
        - **Input**: "The capital of France is Paris."
        - **Output**: **(France, hasCapital, Paris)**

        ---

        ### Constraints:
        - Focus on meaningful predicates that convey the underlying relationship clearly and concisely.
        - When in doubt about the semantic intent, prefer general predicates like **hasRelation** or **associatedWith** and provide an explanation.
        - Avoid duplicating triples or creating overly granular relationships.

        With these adjustments, extract triples that are both semantically precise and consistent with standard knowledge representation frameworks."""
        return self._request_with_retry(system_prompt, text)

    def _request_with_retry(self, system_prompt: str, text: str):
        """Send one structured-output request, retrying a bounded number of times.

        Args:
            system_prompt: Content of the system message.
            text: Content of the user message.

        Returns:
            The parsed message of the first choice (an instance of ``SCHEMA``).

        Raises:
            ValueError: If the request succeeds but the parsed payload is None.
            Exception: The last error raised by the request once ``MAX_RETRIES``
                retries have been used up.
        """
        n_retries = 0
        while True:
            try:
                completion = self.client.beta.chat.completions.parse(
                    model=self.GPT_MODEL,
                    response_format=self.SCHEMA,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": text},
                    ],
                )
                response = completion.choices[0].message.parsed
                break

            except Exception as err:
                if isinstance(err, openai.RateLimitError):
                    print(err)
                    print("Exceeded rate limit")
                else:
                    print(f"Unexpected error ({err})")

                # Give up instead of looping forever: errors such as a bad API
                # key or a malformed request never succeed on retry.
                if n_retries >= self.MAX_RETRIES:
                    raise

                n_retries += 1
                # Exponential backoff: base, 2x base, 4x base, ...
                delay = self.ERROR_RETRY_SLEEP * 2 ** (n_retries - 1)
                print(f"Sleeping {delay:g}s before retry (done {n_retries} time(s))")
                time.sleep(delay)

        if response is None:
            raise ValueError("Got null response")

        return response


@dataclass
class EntailmentClassifier:
    """Classifies how a sentence relates to an assertion through an OpenAI structured-output call.

    Attributes:
        client: OpenAI client used to issue the request.

    Class-level settings (plain class attributes, not dataclass fields):
        GPT_MODEL: Model name sent with every request.
        SCHEMA: Pydantic model passed as ``response_format``.
        ERROR_RETRY_SLEEP: Base delay in seconds before the first retry; it is
            doubled on each subsequent retry.
        MAX_RETRIES: Number of retries allowed after the initial attempt
            before the last error is re-raised.
    """

    client: openai.OpenAI
    GPT_MODEL = "gpt-4o"
    SCHEMA = EntailmentResponse
    # Was 0.001 (1 ms), which is too short to outlast a rate-limit window.
    ERROR_RETRY_SLEEP = 1.0
    MAX_RETRIES = 5

    def get_entailment_type(self, sentence: str, statement: str):
        """Ask the model whether ``sentence`` implies, contradicts, or is neutral towards ``statement``.

        Args:
            sentence: Source text treated as evidence.
            statement: Assertion to evaluate against the sentence.

        Returns:
            The parsed response, an instance of ``SCHEMA``.

        Raises:
            ValueError: If the API returns no parsed payload.
            Exception: The last request error once ``MAX_RETRIES`` is exhausted.
        """
        # NOTE(review): the prompt asks for a brief justification, but SCHEMA only
        # has the `entailment_relationship` field - confirm whether the prompt or
        # the schema should change. The prompt text is left untouched here.
        system_prompt = """
    You are a highly capable natural language processing model tasked with evaluating the relationship between a given **sentence** and an **assertion**. Your objective is to determine whether the sentence:

    1. **Agrees with (Implies)** the assertion: The sentence provides evidence that the assertion is true.
    2. **Contradicts** the assertion: The sentence provides evidence that the assertion is false.
    3. **Is Neutral** towards the assertion: The sentence neither supports nor refutes the assertion, or it lacks sufficient information to establish a clear relationship.

    ### Instructions:

    1. **Input Format**:
      - You will receive two inputs:
        - **Sentence**: A statement or set of statements that describe an event, opinion, or fact.
        - **Assertion**: A single statement whose relationship to the sentence you must evaluate.

    2. **Output Format**:
      - Your response should contain one of the following three terms:
        - **"Implies"**: If the sentence provides direct or strong evidence supporting the truth of the assertion.
        - **"Contradicts"**: If the sentence provides direct or strong evidence opposing the truth of the assertion.
        - **"Neutral"**: If the sentence does not provide sufficient evidence to determine the truth or falsity of the assertion.

      - Additionally, provide a brief justification explaining your reasoning.

    3. **Evaluation Criteria**:
      - **Agreement (Implies)**:
        - Identify whether the sentence affirms or supports the assertion.
        - Example:
          - Sentence: "I like dogs. I would like to have one some day."
          - Assertion: "I like dogs."
          - Output: **"Implies"** (The sentence explicitly states liking dogs.)

      - **Contradiction**:
        - Determine if the sentence negates or opposes the assertion.
        - Example:
          - Sentence: "I don't like dogs. They make me nervous."
          - Assertion: "I like dogs."
          - Output: **"Contradicts"** (The sentence explicitly denies liking dogs.)

      - **Neutrality**:
        - Assess whether the sentence is unrelated to the assertion or lacks sufficient evidence to establish a relationship.
        - Example:
          - Sentence: "I like dogs. I would like to have one some day."
          - Assertion: "I like cats."
          - Output: **"Neutral"** (The sentence does not provide information about cats.)

    4. **Context Awareness**:
      - Consider the context and nuances of both the sentence and assertion.
      - Handle implied meanings, indirect statements, and logical implications.
      - Example:
        - Sentence: "I love all kinds of animals, especially dogs."
        - Assertion: "I like dogs."
        - Output: **"Implies"** (The statement includes liking dogs as part of loving animals.)

    5. **Ambiguity**:
      - If the relationship between the sentence and assertion is ambiguous, select "Neutral" and explain why the connection is unclear or insufficient.

    6. **Complex Sentences**:
      - For compound or complex sentences, focus on the part(s) relevant to the assertion.
      - Example:
        - Sentence: "I like dogs, but I don't like walking them in the rain."
        - Assertion: "I like dogs."
        - Output: **"Implies"** (The sentence explicitly states liking dogs, even if it mentions a dislike of walking them in certain conditions.)

    7. **Negations and Opposites**:
      - Pay special attention to negations, double negatives, and antonyms.
      - Example:
        - Sentence: "I would never say I dislike dogs."
        - Assertion: "I like dogs."
        - Output: **"Implies"** (The sentence indirectly supports the assertion by rejecting the opposite.)

    ### Examples:

    #### Example 1:
    - **Sentence**: "I absolutely love spending time with dogs."
    - **Assertion**: "I like dogs."
    - **Output**: **"Implies"** (The sentence strongly supports the assertion.)

    #### Example 2:
    - **Sentence**: "I prefer cats over dogs."
    - **Assertion**: "I like dogs."
    - **Output**: **"Neutral"** (The sentence does not confirm or deny liking dogs, only that cats are preferred.)

    #### Example 3:
    - **Sentence**: "I don't like dogs because they bark too much."
    - **Assertion**: "I like dogs."
    - **Output**: **"Contradicts"** (The sentence directly denies liking dogs.)

    #### Example 4:
    - **Sentence**: "I think dogs are cute, but I wouldn't want one as a pet."
    - **Assertion**: "I like dogs."
    - **Output**: **"Neutral"** (The sentence expresses admiration for dogs but does not explicitly indicate liking them.)

    ### Constraints:
    - Focus on the logical relationship between the sentence and the assertion, rather than external knowledge.
    - Avoid over-interpreting vague or ambiguous statements; err on the side of neutrality when in doubt.

    With these instructions, evaluate each input pair thoroughly and consistently."""
        text = f"Sentence: \"{sentence}\"\nAssertion: \"{statement}\""
        return self._request_with_retry(system_prompt, text)

    def _request_with_retry(self, system_prompt: str, text: str):
        """Send one structured-output request, retrying a bounded number of times.

        Args:
            system_prompt: Content of the system message.
            text: Content of the user message.

        Returns:
            The parsed message of the first choice (an instance of ``SCHEMA``).

        Raises:
            ValueError: If the request succeeds but the parsed payload is None.
            Exception: The last error raised by the request once ``MAX_RETRIES``
                retries have been used up.
        """
        n_retries = 0
        while True:
            try:
                completion = self.client.beta.chat.completions.parse(
                    model=self.GPT_MODEL,
                    response_format=self.SCHEMA,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": text},
                    ],
                )
                response = completion.choices[0].message.parsed
                break

            except Exception as err:
                if isinstance(err, openai.RateLimitError):
                    print(err)
                    print("Exceeded rate limit")
                else:
                    print(f"Unexpected error ({err})")

                # Give up instead of looping forever: errors such as a bad API
                # key or a malformed request never succeed on retry.
                if n_retries >= self.MAX_RETRIES:
                    raise

                n_retries += 1
                # Exponential backoff: base, 2x base, 4x base, ...
                delay = self.ERROR_RETRY_SLEEP * 2 ** (n_retries - 1)
                print(f"Sleeping {delay:g}s before retry (done {n_retries} time(s))")
                time.sleep(delay)

        if response is None:
            raise ValueError("Got null response")

        return response


In [11]:
# One OpenAI client, authenticated with the key loaded earlier in the notebook,
# shared by both helpers.
client = openai.OpenAI(api_key=openai_token)

# `client` is the only dataclass field on each helper, so it is passed positionally.
semantic_extractor = SemanticTripleExtractor(client)
entailment_classifier = EntailmentClassifier(client)

In [12]:
# Single-sentence smoke test for the extractor. `sentence` is also the evidence
# text used by the entailment cells that follow.
sentence = "Thomas Edison was the inventor of the lightbulb."

# Bare last expression so the notebook displays the returned SemanticTripleList.
semantic_extractor.get_semantic_triples(sentence)

SemanticTripleList(triples=[SemanticTriple(entityA='Thomas Edison', relationship='inventorOf', entityB='lightbulb')])

In [13]:
# An assertion that `sentence` supports; the recorded output is IMPLIES.
true_statement = "The lightbulb has been invented."
entailment_classifier.get_entailment_type(sentence, true_statement)

EntailmentResponse(entailment_relationship=<EntailmentRelationship.IMPLIES: 'implies'>)

In [14]:
# An assertion that conflicts with `sentence`; the recorded output is CONTRADICTS.
false_statement = "Harry Truman invented the lightbulb."
entailment_classifier.get_entailment_type(sentence, false_statement)

EntailmentResponse(entailment_relationship=<EntailmentRelationship.CONTRADICTS: 'contradicts'>)

In [15]:
# An assertion unrelated to `sentence`; the recorded output is NEUTRAL.
irrelevant_statement = "I had a sandwich for lunch."
entailment_classifier.get_entailment_type(sentence, irrelevant_statement)

EntailmentResponse(entailment_relationship=<EntailmentRelationship.NEUTRAL: 'neutral'>)

In [18]:
# Batch demo: extract triples from each sentence, then check that each extracted
# triple (rephrased as a statement) is entailed by the sentence it came from.
sentences = [
    "Albert Einstein was a physicist who developed the theory of relativity.",
    "Marie Curie did not discover penicillin, but she discovered radium.",
    "The Great Wall of China is not visible from space with the naked eye.",
    "Isaac Newton was not American; he was a British mathematician and physicist.",
    "Barack Obama was born in Hawaii and served as the 44th President of the United States.",
    "J.K. Rowling wrote the Harry Potter series, which became a global phenomenon.",
    "The Amazon rainforest is in South America and plays a crucial role in Earth's oxygen cycle.",
    "Neil Armstrong, the first person to walk on the moon, was an astronaut and test pilot.",
    "The Mona Lisa, painted by Leonardo da Vinci, is displayed in the Louvre in Paris.",
    "Mount Everest, the highest mountain in the world, is part of the Himalayas and lies in Nepal."
]

# The loop variable is deliberately not named `sentence`: that global is the
# input of the single-sentence cells above, and overwriting it here would make
# those cells produce different results when re-run after this one.
for example_sentence in sentences:
  print(example_sentence)
  semantic_triples = semantic_extractor.get_semantic_triples(example_sentence).triples
  for triple in semantic_triples:
    statement = f"Entity \"{triple.entityA}\" has relationship \"{triple.relationship}\" with entity \"{triple.entityB}\"."
    entail_type = entailment_classifier.get_entailment_type(example_sentence, statement)
    print(statement, entail_type)

Albert Einstein was a physicist who developed the theory of relativity.
Entity "Albert Einstein" has relationship "hasRole" with entity "physicist". entailment_relationship=<EntailmentRelationship.IMPLIES: 'implies'>
Entity "Albert Einstein" has relationship "developed" with entity "theory of relativity". entailment_relationship=<EntailmentRelationship.IMPLIES: 'implies'>
Marie Curie did not discover penicillin, but she discovered radium.
Entity "Marie Curie" has relationship "discovered" with entity "radium". entailment_relationship=<EntailmentRelationship.IMPLIES: 'implies'>
The Great Wall of China is not visible from space with the naked eye.
Entity "Great Wall of China" has relationship "visibilityFromSpace" with entity "notVisible". entailment_relationship=<EntailmentRelationship.IMPLIES: 'implies'>
Entity "Great Wall of China" has relationship "locatedIn" with entity "China". entailment_relationship=<EntailmentRelationship.NEUTRAL: 'neutral'>
Isaac Newton was not American; he was