<a href="https://colab.research.google.com/github/nolanwelch/ai-factcheck/blob/main/proof-of-concept/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook dependencies: python-dotenv (loads a local .env file), openai (API client),
# pydantic (schemas for the structured responses).
%pip install python-dotenv openai pydantic



In [6]:
import sys

# Colab is detected by checking whether its package is already loaded in this kernel.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # In Colab the key is read through the `userdata` API instead of the environment.
    from google.colab import userdata

    openai_token = userdata.get("OPENAI_API_KEY")
else:
    # Outside Colab: let python-dotenv populate the environment, then read the key from it.
    import os

    import dotenv

    dotenv.load_dotenv()
    openai_token = os.environ.get("OPENAI_API_KEY")

# Explicit raise instead of `assert`: assertions are stripped under `python -O`,
# and an empty string is just as unusable as a missing key.
if not openai_token:
    raise RuntimeError("Must set the OPENAI_API_KEY environment variable")

In [3]:
from pydantic import BaseModel
from enum import Enum

# One extracted fact, in the (entityA, predicate, entityB) form described by the
# extraction prompt further down in this notebook.
# NOTE: documented with comments rather than a docstring on purpose -- pydantic copies a
# model's docstring into its JSON schema, which would change the schema passed to the API
# as `response_format`.
class SemanticTriple(BaseModel):
  entityA: str  # entity or concept the statement is about
  relationship: str  # verb / relational term linking entityA to entityB
  entityB: str  # entity or concept associated with entityA via the relationship

# Top-level response schema for triple extraction: a wrapper object holding every triple
# found in the input text. Used as SemanticTripleExtractor.SCHEMA.
# NOTE: comments instead of a docstring so the JSON schema derived from this model is unchanged.
class SemanticTripleList(BaseModel):
  triples: list[SemanticTriple]  # all triples extracted from one input

# The three possible verdicts for how a sentence relates to an assertion.
# The lowercase string values are what appears in serialized responses
# (e.g. <EntailmentRelationship.IMPLIES: 'implies'>).
class EntailmentRelationship(Enum):
  IMPLIES = "implies"  # sentence supports the assertion
  CONTRADICTS = "contradicts"  # sentence opposes the assertion
  NEUTRAL = "neutral"  # sentence neither supports nor refutes the assertion

# Top-level response schema for entailment classification; used as EntailmentClassifier.SCHEMA.
# NOTE(review): the classifier prompt also asks the model for a brief justification, but this
# schema has no field for one, so only the verdict is returned.
class EntailmentResponse(BaseModel):
  entailment_relationship: EntailmentRelationship  # the single verdict for the (sentence, assertion) pair

In [33]:
import openai
from dataclasses import dataclass
import time


@dataclass
class SemanticTripleExtractor:
    """Extract (entityA, relationship, entityB) triples from text with an OpenAI chat model.

    The model is called through the structured-output ``parse`` helper, so the
    result comes back already validated as a ``SCHEMA`` instance.

    Attributes:
        client: OpenAI client used for every request.
    """

    client: openai.OpenAI

    # The names below are deliberately un-annotated: they are plain class-level
    # settings, not dataclass fields, so the constructor only takes `client`.
    GPT_MODEL = "gpt-4o"
    SCHEMA = SemanticTripleList
    # Base delay (seconds) before the first retry; doubled on each further retry.
    # Raised from the previous 1 ms, which amounted to a busy loop against the API.
    ERROR_RETRY_SLEEP = 1.0
    # Upper bound (seconds) on a single backoff sleep.
    MAX_RETRY_SLEEP = 30.0
    # Retries allowed after the initial attempt before the last error is re-raised.
    MAX_RETRIES = 5
    # Errors that will not go away on retry (bad key, bad request, unknown model...).
    NON_RETRYABLE_ERRORS = (
        openai.AuthenticationError,
        openai.PermissionDeniedError,
        openai.BadRequestError,
        openai.NotFoundError,
    )

    def get_semantic_triples(self, text: str) -> SemanticTripleList:
        """Return the semantic triples the model extracts from ``text``.

        The prompt instructs the model to drop negations, so each triple states
        the affirmative form of the claim (e.g. "was not British" is returned
        as ``(…, was, British)``).

        Args:
            text: Free text to extract triples from.

        Returns:
            The parsed ``SemanticTripleList``.

        Raises:
            ValueError: If the API call succeeds but yields no parsed object.
            Exception: The last API error once retries are exhausted, or a
                non-retryable error immediately.
        """
        # NOTE(review): section 8 of the prompt asks for plain-text output, while the
        # request enforces the SCHEMA response format; the prompt is left untouched here.
        system_prompt = """
    You are an advanced natural language processing model tasked with extracting semantic triples from text. A semantic triple is a structured representation of knowledge in the form of **(subject, predicate, object)**. Your goal is to parse the text and extract meaningful triples that encapsulate the relationships and facts described.

    ### Instructions:
    1. **Core Structure**:
      - Each triple should be in the format: **(entityA, predicate, entityB)**.
      - **entityA** is the entity or concept performing or being described by the action or state.
      - The **predicate** is the verb or relational term connecting the subject and the object.
      - **entityB** is the entity or concept associated with entityA via the predicate.

    2. **Negated Triples**:
      - For any statement involving negation (e.g., "not," "never"), extract the triple as if the negation were absent.
      - Example:
        - Input: "Alexander the Great was not British."
        - Output: **(Alexander the Great, was, British)**.

    3. **Handling Ambiguity**:
      - Disambiguate unclear or indirect references to identify explicit triples.
      - If coreference resolution is needed (e.g., pronouns), resolve the references first.
      - Example:
        - Input: "He never became a king."
        - Output: **(He, became, king)**.

    4. **Complex Sentences**:
      - For compound or complex sentences, extract all relevant triples.
      - Example:
        - Input: "Albert Einstein was a physicist and did not believe in quantum entanglement."
        - Output: **(Albert Einstein, was, physicist)**, **(Albert Einstein, believed, quantum entanglement)**.

    5. **Contextual Relationships**:
      - Use contextual clues to infer relationships when not explicitly stated.
      - Example:
        - Input: "The Eiffel Tower is in Paris."
        - Output: **(The Eiffel Tower, is in, Paris)**.

    6. **Specificity**:
      - Extract triples with the highest level of specificity based on the input text.
      - Example:
        - Input: "The company Tesla manufactures electric cars."
        - Output: **(Tesla, manufactures, electric cars)**.

    7. **Language Precision**:
      - Maintain the integrity of the original text's meaning, but omit any hedging language or modifiers unless crucial to understanding.
      - Example:
        - Input: "It is widely believed that cats are nocturnal."
        - Output: **(cats, are, nocturnal)**.

    8. **Output Format**:
      - Return triples in plain text, separated by commas, and encapsulated in parentheses.
      - Multiple triples should be separated by line breaks.

    ### Examples for Clarity:
    #### Example 1:
    - **Input**: "The Great Wall of China is not visible from space."
    - **Output**:
      - **(The Great Wall of China, is visible from, space)**

    #### Example 2:
    - **Input**: "Marie Curie discovered radium and polonium but did not work on the atomic bomb."
    - **Output**:
      - **(Marie Curie, discovered, radium)**
      - **(Marie Curie, discovered, polonium)**
      - **(Marie Curie, worked on, the atomic bomb)**

    #### Example 3:
    - **Input**: "He was not interested in sports and did not enjoy reading novels."
    - **Output**:
      - **(He, was interested in, sports)**
      - **(He, enjoyed, reading novels)**

    #### Example 4:
    - **Input**: "Although Paris is not the capital of Germany, it is known for its culture."
    - **Output**:
      - **(Paris, is the capital of, Germany)**
      - **(Paris, is known for, culture)**

    ### Constraints:
    - Do not include duplicate triples.
    - Ensure grammatical consistency in predicates.
    - Focus on relationships that are explicitly stated or strongly implied.

    With these guidelines, extract accurate and meaningful semantic triples from any input text provided."""
        return self._request_with_retry(system_prompt, text)

    def _request_with_retry(self, system_prompt: str, text: str) -> SemanticTripleList:
        """Send one structured-output chat request, retrying transient failures.

        Retries use exponential backoff starting at ``ERROR_RETRY_SLEEP`` seconds
        and capped at ``MAX_RETRY_SLEEP``. At most ``MAX_RETRIES`` retries are
        made; after that the last error propagates instead of looping forever.

        Args:
            system_prompt: Instructions sent as the system message.
            text: Content sent as the user message.

        Returns:
            The parsed ``SCHEMA`` instance from the first choice.

        Raises:
            ValueError: If the request succeeds but the parsed payload is ``None``.
            Exception: Any error in ``NON_RETRYABLE_ERRORS`` (raised immediately),
                or the last error seen once ``MAX_RETRIES`` is exhausted.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ]

        n_retries = 0
        while True:
            try:
                response = (
                    self.client.beta.chat.completions.parse(
                        model=self.GPT_MODEL,
                        response_format=self.SCHEMA,
                        messages=messages,
                    )
                    .choices[0]
                    .message.parsed
                )
                break

            except self.NON_RETRYABLE_ERRORS:
                # Retrying cannot fix these; surface them right away.
                raise

            except Exception as err:
                if n_retries >= self.MAX_RETRIES:
                    # Out of retries: let the caller see the real failure.
                    raise

                n_retries += 1
                if isinstance(err, openai.RateLimitError):
                    print(err)
                    print("Exceeded rate limit")
                else:
                    print(f"Unexpected error ({err})")
                print(f"Sleeping before retry (done {n_retries} time(s))")

                # Exponential backoff: base, 2x base, 4x base, ... up to the cap.
                delay = min(
                    self.ERROR_RETRY_SLEEP * 2 ** (n_retries - 1),
                    self.MAX_RETRY_SLEEP,
                )
                time.sleep(delay)

        # The request itself succeeded but produced nothing parseable
        # (presumably e.g. a model refusal -- not retried).
        if response is None:
            raise ValueError("Got null response")

        return response


@dataclass
class EntailmentClassifier:
    """Classify whether a sentence implies, contradicts, or is neutral to an assertion.

    The model is called through the structured-output ``parse`` helper, so the
    result comes back already validated as a ``SCHEMA`` instance.

    Attributes:
        client: OpenAI client used for every request.
    """

    client: openai.OpenAI

    # The names below are deliberately un-annotated: they are plain class-level
    # settings, not dataclass fields, so the constructor only takes `client`.
    GPT_MODEL = "gpt-4o"
    SCHEMA = EntailmentResponse
    # Base delay (seconds) before the first retry; doubled on each further retry.
    # Raised from the previous 1 ms, which amounted to a busy loop against the API.
    ERROR_RETRY_SLEEP = 1.0
    # Upper bound (seconds) on a single backoff sleep.
    MAX_RETRY_SLEEP = 30.0
    # Retries allowed after the initial attempt before the last error is re-raised.
    MAX_RETRIES = 5
    # Errors that will not go away on retry (bad key, bad request, unknown model...).
    NON_RETRYABLE_ERRORS = (
        openai.AuthenticationError,
        openai.PermissionDeniedError,
        openai.BadRequestError,
        openai.NotFoundError,
    )

    def get_entailment_type(self, sentence: str, statement: str) -> EntailmentResponse:
        """Return how ``sentence`` relates to ``statement``.

        Args:
            sentence: The evidence text (labelled "Sentence" in the request).
            statement: The claim being checked (labelled "Assertion" in the request).

        Returns:
            The parsed ``EntailmentResponse`` carrying the verdict.

        Raises:
            ValueError: If the API call succeeds but yields no parsed object.
            Exception: The last API error once retries are exhausted, or a
                non-retryable error immediately.
        """
        # NOTE(review): the prompt asks for a brief justification as well, but SCHEMA
        # only carries the verdict; the prompt is left untouched here.
        system_prompt = """
    You are a highly capable natural language processing model tasked with evaluating the relationship between a given **sentence** and an **assertion**. Your objective is to determine whether the sentence:

    1. **Agrees with (Implies)** the assertion: The sentence provides evidence that the assertion is true.
    2. **Contradicts** the assertion: The sentence provides evidence that the assertion is false.
    3. **Is Neutral** towards the assertion: The sentence neither supports nor refutes the assertion, or it lacks sufficient information to establish a clear relationship.

    ### Instructions:

    1. **Input Format**:
      - You will receive two inputs:
        - **Sentence**: A statement or set of statements that describe an event, opinion, or fact.
        - **Assertion**: A single statement whose relationship to the sentence you must evaluate.

    2. **Output Format**:
      - Your response should contain one of the following three terms:
        - **"Implies"**: If the sentence provides direct or strong evidence supporting the truth of the assertion.
        - **"Contradicts"**: If the sentence provides direct or strong evidence opposing the truth of the assertion.
        - **"Neutral"**: If the sentence does not provide sufficient evidence to determine the truth or falsity of the assertion.

      - Additionally, provide a brief justification explaining your reasoning.

    3. **Evaluation Criteria**:
      - **Agreement (Implies)**:
        - Identify whether the sentence affirms or supports the assertion.
        - Example:
          - Sentence: "I like dogs. I would like to have one some day."
          - Assertion: "I like dogs."
          - Output: **"Implies"** (The sentence explicitly states liking dogs.)

      - **Contradiction**:
        - Determine if the sentence negates or opposes the assertion.
        - Example:
          - Sentence: "I don't like dogs. They make me nervous."
          - Assertion: "I like dogs."
          - Output: **"Contradicts"** (The sentence explicitly denies liking dogs.)

      - **Neutrality**:
        - Assess whether the sentence is unrelated to the assertion or lacks sufficient evidence to establish a relationship.
        - Example:
          - Sentence: "I like dogs. I would like to have one some day."
          - Assertion: "I like cats."
          - Output: **"Neutral"** (The sentence does not provide information about cats.)

    4. **Context Awareness**:
      - Consider the context and nuances of both the sentence and assertion.
      - Handle implied meanings, indirect statements, and logical implications.
      - Example:
        - Sentence: "I love all kinds of animals, especially dogs."
        - Assertion: "I like dogs."
        - Output: **"Implies"** (The statement includes liking dogs as part of loving animals.)

    5. **Ambiguity**:
      - If the relationship between the sentence and assertion is ambiguous, select "Neutral" and explain why the connection is unclear or insufficient.

    6. **Complex Sentences**:
      - For compound or complex sentences, focus on the part(s) relevant to the assertion.
      - Example:
        - Sentence: "I like dogs, but I don't like walking them in the rain."
        - Assertion: "I like dogs."
        - Output: **"Implies"** (The sentence explicitly states liking dogs, even if it mentions a dislike of walking them in certain conditions.)

    7. **Negations and Opposites**:
      - Pay special attention to negations, double negatives, and antonyms.
      - Example:
        - Sentence: "I would never say I dislike dogs."
        - Assertion: "I like dogs."
        - Output: **"Implies"** (The sentence indirectly supports the assertion by rejecting the opposite.)

    ### Examples:

    #### Example 1:
    - **Sentence**: "I absolutely love spending time with dogs."
    - **Assertion**: "I like dogs."
    - **Output**: **"Implies"** (The sentence strongly supports the assertion.)

    #### Example 2:
    - **Sentence**: "I prefer cats over dogs."
    - **Assertion**: "I like dogs."
    - **Output**: **"Neutral"** (The sentence does not confirm or deny liking dogs, only that cats are preferred.)

    #### Example 3:
    - **Sentence**: "I don't like dogs because they bark too much."
    - **Assertion**: "I like dogs."
    - **Output**: **"Contradicts"** (The sentence directly denies liking dogs.)

    #### Example 4:
    - **Sentence**: "I think dogs are cute, but I wouldn't want one as a pet."
    - **Assertion**: "I like dogs."
    - **Output**: **"Neutral"** (The sentence expresses admiration for dogs but does not explicitly indicate liking them.)

    ### Constraints:
    - Focus on the logical relationship between the sentence and the assertion, rather than external knowledge.
    - Avoid over-interpreting vague or ambiguous statements; err on the side of neutrality when in doubt.

    With these instructions, evaluate each input pair thoroughly and consistently."""
        # The user message labels the two inputs the same way the prompt refers to them.
        text = f"Sentence: \"{sentence}\"\nAssertion: \"{statement}\""
        return self._request_with_retry(system_prompt, text)

    def _request_with_retry(self, system_prompt: str, text: str) -> EntailmentResponse:
        """Send one structured-output chat request, retrying transient failures.

        Retries use exponential backoff starting at ``ERROR_RETRY_SLEEP`` seconds
        and capped at ``MAX_RETRY_SLEEP``. At most ``MAX_RETRIES`` retries are
        made; after that the last error propagates instead of looping forever.

        Args:
            system_prompt: Instructions sent as the system message.
            text: Content sent as the user message.

        Returns:
            The parsed ``SCHEMA`` instance from the first choice.

        Raises:
            ValueError: If the request succeeds but the parsed payload is ``None``.
            Exception: Any error in ``NON_RETRYABLE_ERRORS`` (raised immediately),
                or the last error seen once ``MAX_RETRIES`` is exhausted.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ]

        n_retries = 0
        while True:
            try:
                response = (
                    self.client.beta.chat.completions.parse(
                        model=self.GPT_MODEL,
                        response_format=self.SCHEMA,
                        messages=messages,
                    )
                    .choices[0]
                    .message.parsed
                )
                break

            except self.NON_RETRYABLE_ERRORS:
                # Retrying cannot fix these; surface them right away.
                raise

            except Exception as err:
                if n_retries >= self.MAX_RETRIES:
                    # Out of retries: let the caller see the real failure.
                    raise

                n_retries += 1
                if isinstance(err, openai.RateLimitError):
                    print(err)
                    print("Exceeded rate limit")
                else:
                    print(f"Unexpected error ({err})")
                print(f"Sleeping before retry (done {n_retries} time(s))")

                # Exponential backoff: base, 2x base, 4x base, ... up to the cap.
                delay = min(
                    self.ERROR_RETRY_SLEEP * 2 ** (n_retries - 1),
                    self.MAX_RETRY_SLEEP,
                )
                time.sleep(delay)

        # The request itself succeeded but produced nothing parseable
        # (presumably e.g. a model refusal -- not retried).
        if response is None:
            raise ValueError("Got null response")

        return response


In [34]:
# A single API client, shared by both helpers below.
client = openai.OpenAI(api_key=openai_token)

# Each helper is a dataclass whose only constructor argument is the client.
semantic_extractor = SemanticTripleExtractor(client=client)
entailment_classifier = EntailmentClassifier(client=client)

In [35]:
# Smoke test: extract triples from one factual sentence. `sentence` is reused by the
# entailment cells that follow; the recorded output below shows a single triple.
sentence = "Thomas Edison was the inventor of the lightbulb."

semantic_extractor.get_semantic_triples(sentence)

SemanticTripleList(triples=[SemanticTriple(entityA='Thomas Edison', relationship='was the inventor of', entityB='the lightbulb')])

In [36]:
# Assertion supported by `sentence` (defined in an earlier cell); recorded verdict: IMPLIES.
true_statement = "The lightbulb has been invented."
entailment_classifier.get_entailment_type(sentence, true_statement)

EntailmentResponse(entailment_relationship=<EntailmentRelationship.IMPLIES: 'implies'>)

In [37]:
# Assertion that conflicts with `sentence` (defined in an earlier cell); recorded verdict: CONTRADICTS.
false_statement = "Harry Truman invented the lightbulb."
entailment_classifier.get_entailment_type(sentence, false_statement)

EntailmentResponse(entailment_relationship=<EntailmentRelationship.CONTRADICTS: 'contradicts'>)

In [38]:
# Assertion unrelated to `sentence` (defined in an earlier cell); recorded verdict: NEUTRAL.
irrelevant_statement = "I had a sandwich for lunch."
entailment_classifier.get_entailment_type(sentence, irrelevant_statement)

EntailmentResponse(entailment_relationship=<EntailmentRelationship.NEUTRAL: 'neutral'>)