<a href="https://colab.research.google.com/github/nolanwelch/ai-factcheck/blob/main/proof-of-concept/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install python-dotenv openai pydantic



In [2]:
import sys

# True when the google.colab module has already been imported into this
# interpreter, which is used here as the signal that we are running on Colab.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
  # On Colab the key is looked up through the notebook's `userdata` store.
  from google.colab import userdata
  openai_token_present = userdata.get("OPENAI_API_KEY") is not None
else:
  # Elsewhere, pull variables from a local .env file (if any) into the
  # process environment, then look for the key there.
  import os
  import dotenv
  dotenv.load_dotenv()
  # Check the key this notebook actually needs. The previous check looked for
  # "HF_TOKEN", which disagreed with both the Colab branch and the assertion
  # message below.
  openai_token_present = "OPENAI_API_KEY" in os.environ

assert openai_token_present, "Must set the OPENAI_API_KEY environment variable"

In [3]:
from pydantic import BaseModel
from enum import Enum

# One extracted triple: `entityA` is linked to `entityB` by `relationship`.
# Documented with `#` comments rather than a docstring so that nothing is
# added to the schema pydantic derives from this class.
class SemanticTriple(BaseModel):
  entityA: str  # first entity of the triple
  relationship: str  # term connecting entityA to entityB
  entityB: str  # second entity of the triple

# Container model holding every triple extracted from one input text.
# SemanticTripleExtractor uses it as its SCHEMA, i.e. the `response_format`
# of the chat-completion request.
class SemanticTripleList(BaseModel):
  triples: list[SemanticTriple]  # the extracted triples

# Closed set of entailment labels; each member's value is its lowercase name.
class EntailmentRelationship(Enum):
  IMPLIES = "implies"
  CONTRADICTS = "contradicts"
  NEUTRAL = "neutral"

# Model wrapping a single EntailmentRelationship label.
# NOTE(review): not referenced in the visible part of the notebook; presumably
# the response schema for a later entailment-classification request — confirm.
class EntailmentResponse(BaseModel):
  entailment_relationship: EntailmentRelationship  # the chosen label

In [None]:
import openai
from dataclasses import dataclass
import time

@dataclass
class SemanticTripleExtractor:
  """Extract semantic triples from free text with an OpenAI chat model.

  The request uses the SDK's structured-output `parse` helper with `SCHEMA`
  as the response format, so the result is a parsed `SCHEMA` instance.

  Attributes:
    client: OpenAI client used to send the chat-completion requests.
  """
  client: openai.OpenAI
  # Class-level settings. They are deliberately left unannotated so that they
  # stay plain class attributes and do not become dataclass fields.
  GPT_MODEL = "gpt-4o"  # model name sent with every request
  SCHEMA = SemanticTripleList  # structured-output schema for the response
  ERROR_RETRY_SLEEP = 5  # seconds to wait between failed attempts
  MAX_RETRIES = 5  # total attempts before the last error is re-raised

  def get_semantic_triples(self, text: str):
    """Return the semantic triples the model extracts from `text`.

    Args:
      text: The passage to analyse; sent as the user message.

    Returns:
      The parsed `SCHEMA` instance produced by the model. According to the
      OpenAI SDK documentation the parsed value can be None when the model
      refuses to answer, so callers should be prepared for that.

    Raises:
      openai.OpenAIError: If every attempt (see `MAX_RETRIES`) fails.
    """
    system_prompt = """
    You are an advanced natural language processing model tasked with extracting semantic triples from text. A semantic triple is a structured representation of knowledge in the form of **(subject, predicate, object)**. Your goal is to parse the text and extract meaningful triples that encapsulate the relationships and facts described.

    ### Instructions:
    1. **Core Structure**:
      - Each triple should be in the format: **(entityA, predicate, entityB)**.
      - **entityA** is the entity or concept performing or being described by the action or state.
      - The **predicate** is the verb or relational term connecting the subject and the object.
      - **entityB** is the entity or concept associated with entityA via the predicate.

    2. **Negated Triples**:
      - For any statement involving negation (e.g., "not," "never"), extract the triple as if the negation were absent.
      - Example:
        - Input: "Alexander the Great was not British."
        - Output: **(Alexander the Great, was, British)**.

    3. **Handling Ambiguity**:
      - Disambiguate unclear or indirect references to identify explicit triples.
      - If coreference resolution is needed (e.g., pronouns), resolve the references first.
      - Example:
        - Input: "He never became a king."
        - Output: **(He, became, king)**.

    4. **Complex Sentences**:
      - For compound or complex sentences, extract all relevant triples.
      - Example:
        - Input: "Albert Einstein was a physicist and did not believe in quantum entanglement."
        - Output: **(Albert Einstein, was, physicist)**, **(Albert Einstein, believed, quantum entanglement)**.

    5. **Contextual Relationships**:
      - Use contextual clues to infer relationships when not explicitly stated.
      - Example:
        - Input: "The Eiffel Tower is in Paris."
        - Output: **(The Eiffel Tower, is in, Paris)**.

    6. **Specificity**:
      - Extract triples with the highest level of specificity based on the input text.
      - Example:
        - Input: "The company Tesla manufactures electric cars."
        - Output: **(Tesla, manufactures, electric cars)**.

    7. **Language Precision**:
      - Maintain the integrity of the original text's meaning, but omit any hedging language or modifiers unless crucial to understanding.
      - Example:
        - Input: "It is widely believed that cats are nocturnal."
        - Output: **(cats, are, nocturnal)**.

    8. **Output Format**:
      - Return triples in plain text, separated by commas, and encapsulated in parentheses.
      - Multiple triples should be separated by line breaks.

    ### Examples for Clarity:
    #### Example 1:
    - **Input**: "The Great Wall of China is not visible from space."
    - **Output**:
      - **(The Great Wall of China, is visible from, space)**

    #### Example 2:
    - **Input**: "Marie Curie discovered radium and polonium but did not work on the atomic bomb."
    - **Output**:
      - **(Marie Curie, discovered, radium)**
      - **(Marie Curie, discovered, polonium)**
      - **(Marie Curie, worked on, the atomic bomb)**

    #### Example 3:
    - **Input**: "He was not interested in sports and did not enjoy reading novels."
    - **Output**:
      - **(He, was interested in, sports)**
      - **(He, enjoyed, reading novels)**

    #### Example 4:
    - **Input**: "Although Paris is not the capital of Germany, it is known for its culture."
    - **Output**:
      - **(Paris, is the capital of, Germany)**
      - **(Paris, is known for, culture)**

    ### Constraints:
    - Do not include duplicate triples.
    - Ensure grammatical consistency in predicates.
    - Focus on relationships that are explicitly stated or strongly implied.

    With these guidelines, extract accurate and meaningful semantic triples from any input text provided."""
    return self._request_with_retry(system_prompt, text)

  def _request_with_retry(self, system_prompt: str, text: str):
    """Send one structured-output request, retrying on OpenAI errors.

    Args:
      system_prompt: Instructions sent as the system message.
      text: Content sent as the user message.

    Returns:
      The `parsed` attribute of the first choice's message.

    Raises:
      openai.OpenAIError: The error from the final attempt, once
        `MAX_RETRIES` attempts have all failed.
    """
    for attempt in range(1, self.MAX_RETRIES + 1):
      try:
        completion = self.client.beta.chat.completions.parse(
            model=self.GPT_MODEL,
            response_format=self.SCHEMA,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )
        return completion.choices[0].message.parsed
      except openai.OpenAIError as err:
        # Only errors raised by the OpenAI SDK are retried; anything else is
        # a programming error and should surface immediately.
        if isinstance(err, openai.RateLimitError):
          print(err)
          print("Exceeded rate limit")
        else:
          print(f"Unexpected error ({err})")
        if attempt == self.MAX_RETRIES:
          # Out of attempts: propagate instead of looping forever.
          raise
        print(f"Sleeping before retry (done {attempt} time(s))")
        time.sleep(self.ERROR_RETRY_SLEEP)
