In [1]:
import os
import json
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

True

In [2]:
# establish a connection to the MongoDB database
from pymongo import MongoClient

# connect to your Atlas cluster
client = MongoClient(os.environ["MONGODB_URI"])

In [322]:
# establish a connection to the PostgreSQL database
import psycopg2 as pg

conn = pg.connect(
    dbname=os.environ["POSTGRES_DB"],
    user=os.environ["POSTGRES_USER"],
    password=os.environ["POSTGRES_PASSWORD"],
    host=os.environ["POSTGRES_HOST"]
)
cursor = conn.cursor()

# LangChain: Models, Prompts and Output Parsers


## Outline

 * Direct API calls to OpenAI
 * API calls through LangChain:
   * Prompts
   * Models
   * Output parsers

In [129]:
import openai

openai.api_key = os.environ['OPENAI_APIKEY']
llm_model = "gpt-4o-mini"
# llm_model = "chatgpt-4o-latest"


In [6]:
from langchain.chat_models import ChatOpenAI

In [135]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0, model=llm_model, openai_api_key=openai.api_key)


In [103]:
template_string = """You are a web ontology creator. You are given part of an html web page from the fandom wiki web server, and your role is to extract entity information from this source. \
Page may containt information about a specific character, location, event, lore, in-game item, etc. One page corresponds to one entity only.\
You need to extract the information that will be used to populate a web ontology.\
- Name of the entity\
- Class of the entity\
- Subclass of the entity\
- Description of the entity

- Properties of the entity\

For each property, you need to extract the following information:\
- Name of the property\
- Value of the property\
- url of the link attached to the property (if exists in the html)\

Return the extracted information in the following format:\


Web page url: ```{page_url}```\
Game name: ```{game_name}```\
Html web page source: ```{content}```
"""

In [104]:
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(template_string)

In [105]:
prompt_template.messages[0].prompt

PromptTemplate(input_variables=['content', 'game_name', 'page_url'], input_types={}, partial_variables={}, template='You are a web ontology creator. You are given part of an html web page from the fandom wiki web server, and your role is to extract entity information from this source. Page may containt information about a specific character, location, event, lore, in-game item, etc. One page corresponds to one entity only.You need to extract the information that will be used to populate a web ontology.- Name of the entity- Class of the entity- Subclass of the entity- Description of the entity\n\n- Properties of the entity\nFor each property, you need to extract the following information:- Name of the property- Value of the property- url of the link attached to the property (if exists in the html)\nReturn the extracted information in the following format:\n\nWeb page url: ```{page_url}```Game name: ```{game_name}```Html web page source: ```{content}```\n')

In [106]:
prompt_template.messages[0].prompt.input_variables

['content', 'game_name', 'page_url']

In [107]:
item = {'game_name': 'bravelydefault',
 'game_url': 'https://bravelydefault.fandom.com/',
 'page_name': 'Adam_Holograd',
 'page_url': 'https://bravelydefault.fandom.com/wiki/Adam_Holograd',
 'content': '<aside class="portable-infobox pi-background pi-border-color pi-theme-series pi-layout-default" role="region">\n<h2>Adam Holograd</h2>\n<div>\n<div><span><span>アダマス・ホログラード</span> (<span>Adamasu・Horogurādo</span><span><a>?</a></span>, lit. <span>Adamas Holograd</span>)</span></div>\n</div>\n\n<section>\n<h2>Biographical information</h2>\n<div>\n<h3>Home</h3>\n<div>Holograd</div>\n</div>\n<div>\n<h3>Affiliation</h3>\n<div>Holograd Empire</div>\n</div>\n<div>\n<h3>Occupation</h3>\n<div>Lord Commander of the Holograd Empire</div>\n</div>\n</section>\n<section>\n<h2>Physical description</h2>\n<div>\n<h3>Race</h3>\n<div>Human</div>\n</div>\n<div>\n<h3>Gender</h3>\n<div>Male</div>\n</div>\n<div>\n<h3>Hair color</h3>\n<div>Silver</div>\n</div>\n</section>\n<section>\n<h2>Behind the scenes information</h2>\n<div>\n<h3>Designer</h3>\n<div>Naoki Ikushima</div>\n</div>\n<div>\n<h3>Japanese voice actor</h3>\n<div>Rikiya Koyama</div>\n</div>\n<div>\n<h3>English voice actor</h3>\n<div>Gyuri Sarossy</div>\n</div>\n</section>\n</aside>',
 '_id': '67139f9e8f64cb721b2f3eec'}

In [108]:
fandom_input = prompt_template.format_messages(**item)

In [109]:
# Call the LLM to translate to the style of the customer message
entity_info = chat(fandom_input)

In [111]:
print(entity_info.content)

Web page url: ```https://bravelydefault.fandom.com/wiki/Adam_Holograd```  
Game name: ```bravelydefault```  
Html web page source: ```<aside class="portable-infobox pi-background pi-border-color pi-theme-series pi-layout-default" role="region">
<h2>Adam Holograd</h2>
<div>
<div><span><span>アダマス・ホログラード</span> (<span>Adamasu・Horogurādo</span><span><a>?</a></span>, lit. <span>Adamas Holograd</span>)</span></div>
</div>

<section>
<h2>Biographical information</h2>
<div>
<h3>Home</h3>
<div>Holograd</div>
</div>
<div>
<h3>Affiliation</h3>
<div>Holograd Empire</div>
</div>
<div>
<h3>Occupation</h3>
<div>Lord Commander of the Holograd Empire</div>
</div>
</section>
<section>
<h2>Physical description</h2>
<div>
<h3>Race</h3>
<div>Human</div>
</div>
<div>
<h3>Gender</h3>
<div>Male</div>
</div>
<div>
<h3>Hair color</h3>
<div>Silver</div>
</div>
</section>
<section>
<h2>Behind the scenes information</h2>
<div>
<h3>Designer</h3>
<div>Naoki Ikushima</div>
</div>
<div>
<h3>Japanese voice actor</h3>
<

### Parse the LLM output string into a Python dictionary

In [112]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

You need to extract the following information:\
- Name of the entity\
- Class of the entity\
- Subclass of the entity\
- Description of the entity\
- url of the image of the entity (if exists in the html)

- Properties of the entity\

For each property, you need to extract the following information:\
- Name of the property\
- Value of the property\
- url of the link attached to the property (if exists in the html)\

In [113]:
name_schema = ResponseSchema(name="name",
                             description="Name of the entity")
class_schema = ResponseSchema(name="class",
                              description="Class of the entity,  for example: character, location, event, lore, in-game item, etc.")
subclass_schema = ResponseSchema(name="subclass",
                                 description="An appropriate Subclass of the entity. For example - if the entity is a character, the subclass could be a hero, villain, etc. If the entity is a location, the subclass could be a city, village, etc.")
description_schema = ResponseSchema(name="description",
                                     description="Description of the entity, no more than 255 characters")
image_url_schema = ResponseSchema(name="image_url",
                                    description="url of the image of the entity (if exists in the html)")
properties_schema = ResponseSchema(name="properties",
                                    description="List of dictionaries for each property found on the page. Each dictionary contains 4 key value pairs: name - the name of the property (capitalize and remove spaces and special characters); \
value - value of the property, can't be empty; \
description - description of the property, as applied to any entity the specific class or subclass mentoioned above, no more than 255 characters; \
parent - most appropriate parent property of the property (based on the knowledge of video games and web ontology engineering, can't be empty), for example defense is a parent of physical defense, magic defense, etc. and voice actor is a parent of japanese voice actor, english voice actor, etc.")

response_schemas = [
    name_schema, 
    class_schema, 
    subclass_schema, 
    description_schema, 
    # image_url_schema, 
    properties_schema
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [114]:
format_instructions

'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"name": string  // Name of the entity\n\t"class": string  // Class of the entity,  for example: character, location, event, lore, in-game item, etc.\n\t"subclass": string  // An appropriate Subclass of the entity. For example - if the entity is a character, the subclass could be a hero, villain, etc. If the entity is a location, the subclass could be a city, village, etc.\n\t"description": string  // Description of the entity\n\t"properties": string  // List of dictionaries for each property found on the page. Each dictionary contains 4 key value pairs: name - the name of the property (capitalize and remove spaces and special characters); value - value of the property; description - description of the property, as applied to any entity the specific class or subclass mentoioned above; parent - most appropriate parent property of the pr

In [115]:
template_string_with_format = """You are a web ontology creator. You are given part of an html web page from the fandom wiki web server, and your role is to extract entity information from this source. \
Page may containt information about a specific character, location, event, lore, in-game item, etc.  and its properties. One page corresponds to one entity only.\
You need to extract the information that will be used to populate a web ontology, and return it in a json format.\

Web page url: ```{page_url}```\
Game name: ```{game_name}```\
Html web page source: ```{content}```

{format_instructions}
"""

In [116]:
prompt = ChatPromptTemplate.from_template(template=template_string_with_format)

messages = prompt.format_messages(**item, 
                                format_instructions=format_instructions)
print(messages[0].content)

In [131]:
response = chat(messages)

In [132]:
print(response.content)

```json
{
	"name": "Adam Holograd",
	"class": "character",
	"subclass": "villain",
	"description": "Adam Holograd is the Lord Commander of the Holograd Empire in the game Bravely Default. He is a human male with silver hair and serves as a key antagonist in the story.",
	"properties": [
		{
			"name": "Home",
			"value": "Holograd",
			"description": "The place where the character resides or originates from.",
			"parent": "Location"
		},
		{
			"name": "Affiliation",
			"value": "Holograd Empire",
			"description": "The organization or group the character is associated with.",
			"parent": "Organization"
		},
		{
			"name": "Occupation",
			"value": "Lord Commander of the Holograd Empire",
			"description": "The character's role or job within the game world.",
			"parent": "Role"
		},
		{
			"name": "Race",
			"value": "Human",
			"description": "The species or race of the character.",
			"parent": "BiologicalClassification"
		},
		{
			"name": "Gender",
			"value": "Male",
			"descri

In [133]:
output_dict = output_parser.parse(response.content)

In [134]:
output_dict

{'name': 'Adam Holograd',
 'class': 'character',
 'subclass': 'villain',
 'description': 'Adam Holograd is the Lord Commander of the Holograd Empire in the game Bravely Default. He is a human male with silver hair and serves as a key antagonist in the story.',
 'properties': [{'name': 'Home',
   'value': 'Holograd',
   'description': 'The place where the character resides or originates from.',
   'parent': 'Location'},
  {'name': 'Affiliation',
   'value': 'Holograd Empire',
   'description': 'The organization or group the character is associated with.',
   'parent': 'Organization'},
  {'name': 'Occupation',
   'value': 'Lord Commander of the Holograd Empire',
   'description': "The character's role or job within the game world.",
   'parent': 'Role'},
  {'name': 'Race',
   'value': 'Human',
   'description': 'The species or race of the character.',
   'parent': 'BiologicalClassification'},
  {'name': 'Gender',
   'value': 'Male',
   'description': 'The gender of the character.',
   'p

In [123]:
# save the extracted information to the mongodb database
from bson.objectid import ObjectId


collection = client['Fandom']['Pages']

collection.update_one(
    {"_id": ObjectId(item["_id"])},
    {"$set": output_dict},
    upsert=True
)


UpdateResult({'n': 1, 'nModified': 1, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1729355984, 118), 'signature': {'hash': b'l\x90/\x96\x13\xa8\x91\xdfa\x89\xeeT\x12\xa40\\"U\x85\xe7', 'keyId': 7366676203282890758}}, 'operationTime': Timestamp(1729355984, 118), 'updatedExisting': True}, acknowledged=True)

create table fandom_entities (
    id serial primary key,
    object_id varchar(255) not null,
    name varchar(255) not null,
    class varchar(255) not null,
    subclass varchar(255),
    description text not null,
    created_at timestamp default current_timestamp
);

grant all privileges on table fandom_entities to ontology_rw;
grant select on table fandom_entities to ontology_ro;

create table fandom_properties (
    id serial primary key,
    object_id varchar(255) not null,
    name varchar(255) not null,
    value varchar(255) not null,
    description text not null,
    parent varchar(255) not null,
    created_at timestamp default current_timestamp
);

grant all privileges on table fandom_properties to ontology_rw;
grant select on table fandom_properties to ontology_ro;

In [127]:
# insert the extracted information to the postgresql database, table 'fandom_entities'
cursor.execute(
    """INSERT INTO fandom_entities (object_id, name, class, subclass, description)
    VALUES (%s, %s, %s, %s, %s);""",
    (item["_id"], output_dict["name"], output_dict["class"], output_dict["subclass"], output_dict["description"])
)
conn.commit()

In [128]:
# insert the extracted information to the postgresql database, table 'fandom_properties'
for prop in output_dict["properties"]:
    cursor.execute(
        """INSERT INTO fandom_properties (object_id, name, value, description, parent)
        VALUES (%s, %s, %s, %s, %s);""",
        (item["_id"], prop["name"], prop["value"], prop["description"], prop["parent"])
    )
    conn.commit()

In [None]:
conn.close()

## Provide context with OWL definitions

As described in this tutorial:

"LangChain for LLM Application Development"

https://learn.deeplearning.ai/courses/langchain/lesson/2/models,-prompts-and-parsers

In [7]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [8]:
import openai
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

openai.api_key = os.environ['OPENAI_APIKEY']
llm_model = "gpt-4o-mini"
# llm_model = "chatgpt-4o-latest"


chat = ChatOpenAI(temperature=0.0, model=llm_model, openai_api_key=openai.api_key)


In [9]:
template_string_with_format = """
You are a highly skilled OWL (Web Ontology Language) ontology engineer. Your task is to assist in the creation, validation, and optimization of OWL ontologies, which are formal representations of knowledge. You have expert knowledge in knowledge engineering, description logic, and semantic web technologies. You also excel at defining classes, properties, and relationships between entities, ensuring logical consistency, and facilitating the sharing of knowledge across different domains.

An OWL ontology is a structured framework used to represent and share knowledge about a particular domain. It consists of classes (concepts), properties (relationships and attributes), and individuals (instances). OWL ontologies allow for the modeling of rich, complex relationships between data in a machine-readable format, enabling advanced reasoning, querying, and inference over that data.
Core OWL Concepts:

    Classes: These represent sets or collections of individuals, typically abstract concepts or types (e.g., "Character," "Weapon," "GameLevel").
    Individuals: Instances of classes (e.g., "Mario" is an individual of the class "Character"; "Sword of Flames" is an individual of the class "Weapon").
    Object Properties: Define relationships between two individuals (e.g., "wieldsWeapon" linking a character to a weapon they use, or "locatedIn" linking a character to a particular game level).
    Datatype Properties: Define relationships between an individual and a data value (e.g., "hasHealthPoints" linking a character to a numeric value representing their health).
    SubClassOf: A relation where one class is a subclass of another, inheriting properties (e.g., "BossCharacter" is a subclass of "Character").
    Equivalence: Used to state that two classes or properties are equivalent (e.g., "MagicWeapon" may be declared equivalent to "SpecialWeapon").
    Disjoint Classes: These are classes that cannot share instances (e.g., "Weapon" and "ConsumableItem" are disjoint classes, meaning an item cannot be both a weapon and a consumable).

OWL Inference and Reasoning:

One of the powerful aspects of OWL is that it allows for reasoning over data. Inference engines can deduce new facts based on the relationships and properties defined in the ontology. For example, if "BossCharacter" is a subclass of "Character" and "Bowser" is an individual of "BossCharacter," it can be inferred that "Bowser" is also an individual of "Character." Additionally, if a property like "wieldsWeapon" is defined, you could infer that "Bowser wields a FireballWeapon" if such an individual and relationship are defined.

Your role is to assist in extracting classes, properties, and relationships within the context of video game entities such as characters, items, levels, and abilities - from the provided content: web page url, game name and html web page source. You will then use this information to populate an OWL ontology that captures the essence of the video game domain. Your expertise in OWL modeling and knowledge representation will be crucial in this task.

Web page url: ```{page_url}```
Game name: ```{game_name}```
Html web page source: ```{content}```

{format_instructions}

"""

In [107]:
name_schema = ResponseSchema(name="name",
                             description="Name of the entity from this page")
class_schema = ResponseSchema(name="class",
                              description="Class of the entity,  for example: character, location, event, lore, in-game item, quest, etc.")
subclass_schema = ResponseSchema(name="subclass",
                                 description="An appropriate Subclass of the entity. For example - if the entity is a character, the subclass could be a hero, NPC, etc. If the entity is a location, the subclass could be a area, city, village, etc. If class is an item, subclass could be weapon, armor, consumable, etc.")
description_schema = ResponseSchema(name="description",
                                     description="Description of the entity, in less than 255 characters")
properties_schema = ResponseSchema(name="properties",
                                    description="List of dictionaries for each property found on the page. Each property can be either data property or object property. Data properties link the main entity to a literal such as string, number, or date. Object properties link main antity of the page to another entity of a relevant class. \
For each property create a dictionary with 5 key value pairs. These are: \
\"property_name\": the name of the linking property. \
\"description\": generic description of the property, as applied to any entity of the specific class or subclass mentioned above, no more than 255 characters; \
\"target_entity\": name or value of the either data or object that property is linking, can't be empty; \
\"target_class\": class of the data or object that property is linking, if property name is `hasType` or `Type`, then target class is `Class`; \
\"target_superclass\": a more generic class for the target object, for example defense is a parent of physical defense, magic defense, etc. and voice actor is a parent of japanese voice actor, english voice actor, etc. \
If property points to multiple values, create a separate property dictionary within the properties list for each value.")

response_schemas = [
    name_schema, 
    class_schema, 
    subclass_schema, 
    description_schema, 
    # image_url_schema, 
    properties_schema
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [11]:
item = {'game_name': 'bravelydefault',
 'game_url': 'https://bravelydefault.fandom.com/',
 'page_name': 'Adam_Holograd',
 'page_url': 'https://bravelydefault.fandom.com/wiki/Adam_Holograd',
 'content': '<aside class="portable-infobox pi-background pi-border-color pi-theme-series pi-layout-default" role="region">\n<h2>Adam Holograd</h2>\n<div>\n<div><span><span>アダマス・ホログラード</span> (<span>Adamasu・Horogurādo</span><span><a>?</a></span>, lit. <span>Adamas Holograd</span>)</span></div>\n</div>\n\n<section>\n<h2>Biographical information</h2>\n<div>\n<h3>Home</h3>\n<div>Holograd</div>\n</div>\n<div>\n<h3>Affiliation</h3>\n<div>Holograd Empire</div>\n</div>\n<div>\n<h3>Occupation</h3>\n<div>Lord Commander of the Holograd Empire</div>\n</div>\n</section>\n<section>\n<h2>Physical description</h2>\n<div>\n<h3>Race</h3>\n<div>Human</div>\n</div>\n<div>\n<h3>Gender</h3>\n<div>Male</div>\n</div>\n<div>\n<h3>Hair color</h3>\n<div>Silver</div>\n</div>\n</section>\n<section>\n<h2>Behind the scenes information</h2>\n<div>\n<h3>Designer</h3>\n<div>Naoki Ikushima</div>\n</div>\n<div>\n<h3>Japanese voice actor</h3>\n<div>Rikiya Koyama</div>\n</div>\n<div>\n<h3>English voice actor</h3>\n<div>Gyuri Sarossy</div>\n</div>\n</section>\n</aside>',
 '_id': '67139f9e8f64cb721b2f3eec'}

In [12]:
prompt = ChatPromptTemplate.from_template(template=template_string_with_format)

messages = prompt.format_messages(**item, 
                                format_instructions=format_instructions)
print(messages[0].content)


You are a highly skilled OWL (Web Ontology Language) ontology engineer. Your task is to assist in the creation, validation, and optimization of OWL ontologies, which are formal representations of knowledge. You have expert knowledge in knowledge engineering, description logic, and semantic web technologies. You also excel at defining classes, properties, and relationships between entities, ensuring logical consistency, and facilitating the sharing of knowledge across different domains.

An OWL ontology is a structured framework used to represent and share knowledge about a particular domain. It consists of classes (concepts), properties (relationships and attributes), and individuals (instances). OWL ontologies allow for the modeling of rich, complex relationships between data in a machine-readable format, enabling advanced reasoning, querying, and inference over that data.
Core OWL Concepts:

    Classes: These represent sets or collections of individuals, typically abstract concepts

In [40]:
response = chat(messages)
output_dict = output_parser.parse(response.content)

In [41]:
output_dict

{'name': 'Adam Holograd',
 'class': 'Character',
 'subclass': 'NPC',
 'description': 'Adam Holograd is the Lord Commander of the Holograd Empire, a human character with silver hair.',
 'properties': [{'property_name': 'home',
   'description': 'The location where the character resides.',
   'target_entity': 'Holograd',
   'target_class': 'Location',
   'target_superclass': 'GeographicalEntity'},
  {'property_name': 'affiliation',
   'description': 'The organization or group the character is associated with.',
   'target_entity': 'Holograd Empire',
   'target_class': 'Faction',
   'target_superclass': 'Organization'},
  {'property_name': 'occupation',
   'description': 'The role or job held by the character.',
   'target_entity': 'Lord Commander of the Holograd Empire',
   'target_class': 'Role',
   'target_superclass': 'Position'},
  {'property_name': 'race',
   'description': 'The species or race of the character.',
   'target_entity': 'Human',
   'target_class': 'Race',
   'target_su

In [None]:
# insert the extracted information to the postgresql database, table 'fandom_entities'
cursor.execute(
    """INSERT INTO fando_entities (object_id, name, class, subclass, description)
    VALUES (%s, %s, %s, %s, %s);""",
    (item["_id"], output_dict["name"], output_dict["class"], output_dict["subclass"], output_dict["description"])
)
conn.commit()

In [None]:
# insert the extracted information to the postgresql database, table 'fandom_properties'
for prop in output_dict["properties"]:
    cursor.execute(
        """INSERT INTO fandom_properties (object_id, name, description, target, target_class, target_superclass)
        VALUES (%s, %s, %s, %s, %s);""",
        (item["_id"], prop["name"], prop["value"], prop["description"], prop["parent"])
    )
    conn.commit()

## Batch processing

As described in OpenAI docs:

"Batch API: Learn how to use OpenAI's Batch API to send asynchronous groups of requests with 50% lower costs, a separate pool of significantly higher rate limits, and a clear 24-hour turnaround time."

https://platform.openai.com/docs/guides/batch/overview

In [41]:
import pandas as pd
from tqdm.notebook import tqdm

In [13]:
df = pd.read_sql(
    """SELECT * FROM fandom_pages;""", conn
)

  df = pd.read_sql(


In [14]:
df.head()

Unnamed: 0,id,object_id,game_name,game_url,page_name,page_url,content,created_at
0,2,67139f9e8f64cb721b2f3eec,bravelydefault,https://bravelydefault.fandom.com/,Adam_Holograd,https://bravelydefault.fandom.com/wiki/Adam_Ho...,"<aside class=""portable-infobox pi-background p...",2024-10-19 12:11:19.941645
1,3,6713a1e78f64cb721b2f3eed,bravelydefault,https://bravelydefault.fandom.com/,Adamantite_Shell_(Bravely_Default),https://bravelydefault.fandom.com/wiki/Adamant...,"<aside class=""portable-infobox pi-background p...",2024-10-19 12:11:20.004083
2,4,6713a1e78f64cb721b2f3eee,bravelydefault,https://bravelydefault.fandom.com/,Adelle_Ein,https://bravelydefault.fandom.com/wiki/Adelle_Ein,"<aside class=""portable-infobox pi-background p...",2024-10-19 12:11:20.062576
3,5,6713a1e78f64cb721b2f3eef,bravelydefault,https://bravelydefault.fandom.com/,Adventurer,https://bravelydefault.fandom.com/wiki/Adventurer,"<aside class=""portable-infobox pi-background p...",2024-10-19 12:11:20.118943
4,6,6713a1e78f64cb721b2f3ef0,bravelydefault,https://bravelydefault.fandom.com/,Adventurer_(Bravely_Default_boss),https://bravelydefault.fandom.com/wiki/Adventu...,"<aside class=""portable-infobox pi-background p...",2024-10-19 12:11:20.176508


```
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
```

In [305]:
start = 13000
end = 14000
batch_filename = f'batchinput_{start}_{end}.jsonl'
with open(batch_filename, 'w') as f:
    for item in tqdm(df.to_dict(orient='records')[start:end]):
        messages = prompt.format_messages(**item, 
                                format_instructions=format_instructions)
        request = {
            "custom_id": item['object_id'],
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": messages[0].content}
                ]
            }
        }
        json.dump(request, f)
        f.write('\n')


  0%|          | 0/36 [00:00<?, ?it/s]

In [306]:
from openai import OpenAI
client = OpenAI(api_key=os.environ['OPENAI_APIKEY'])

batch_input_file = client.files.create(
  file=open(batch_filename, "rb"),
  purpose="batch"
)

batch_input_file_id = batch_input_file.id

batch_create_msg = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": f"batch ontology job {start}-{end}"
    }
)
batch_create_msg

Batch(id='batch_67160d81e1e08190a2e1c6ba26a95e67', completion_window='24h', created_at=1729498498, endpoint='/v1/chat/completions', input_file_id='file-hxRfDiTNFM4MZOziIzsVLAsK', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1729584898, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'batch ontology job 13000-14000'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [319]:
batch_info = client.batches.retrieve(batch_create_msg.id)
print(batch_info.metadata['description'], batch_info.status)
print(batch_info.request_counts)

batch ontology job 13000-14000 completed
BatchRequestCounts(completed=36, failed=0, total=36)


In [320]:
batch_status = client.batches.retrieve(batch_create_msg.id)
if batch_status.status == 'completed':
    file_response = client.files.content(batch_status.output_file_id)
    batch_output_filename = f'batchoutput_{int(end/1000)}.jsonl'
    with open(batch_output_filename, "wb") as f:
        f.write(file_response.content)

    print(f"{batch_output_filename} completed.")
else:
    print(f"Batch job {int(end/1000)} is still {batch_status.status}.")

batchoutput_14.jsonl completed.


In [321]:
data = []

for i in [13,14]:

    batch_data = []
    with open(f'batchoutput_{i}.jsonl', "r") as f:
        for line in f:
            batch_data.append(json.loads(line))

    output_dicts = []
    for response in tqdm(batch_data):
        content = response['response']['body']['choices'][0]['message']['content']
        try:
            output_dict = output_parser.parse(content)
            # for i, prop in enumerate(output_dict['properties']):
            #     if 'value' in prop.keys():
            #         output_dict['properties'][i]['property_name'] = output_dict['properties'][i]['value']

            output_dict['_id'] = response['custom_id']
            output_dicts.append(output_dict)
        except Exception as e:
            print(e)
            print("Failed object_id:", response['custom_id'])

    data += output_dicts


  0%|          | 0/1000 [00:00<?, ?it/s]

Got invalid return object. Expected key `subclass` to be present, but got {'name': 'Alina Ramos', 'class': 'Character'}
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE
Failed object_id: 6713a4cf8f64cb721b2f6eab
Got invalid JSON object. Error: Expecting property name enclosed in double quotes: line 4 column 2 (char 63)
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE
Failed object_id: 6713a4d08f64cb721b2f6ec0
Got invalid JSON object. Error: Expecting property name enclosed in double quotes: line 59 column 28 (char 1801)
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE
Failed object_id: 6713a4d38f64cb721b2f6efb
Got invalid return object. Expected key `name` to be present, but got [{'name': 'Jesse Faden', 'class': 'Character', 'subclass': 'Hero', 'description': "The main protagonist of 'Control' seeking to uncover

  0%|          | 0/36 [00:00<?, ?it/s]

Got invalid return object. Expected key `subclass` to be present, but got {'name': 'Anna Lee', 'class': 'Character'}
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE
Failed object_id: 6713a4fe8f64cb721b2f71b6


In [323]:
# insert the extracted information to the postgresql database, table 'fandom_entities'
for item in tqdm(data):
    try:
        cursor.execute(
            """INSERT INTO fandom_entities (object_id, name, class, subclass, description)
            VALUES (%s, %s, %s, %s, %s);""",
            (item["_id"], item["name"], item["class"], item["subclass"], item["description"])
        )
        conn.commit()
    except:
        conn.rollback()

    # insert the extracted information to the postgresql database, table 'fandom_properties'
    for prop in item["properties"]:
        try:
            cursor.execute(
                """INSERT INTO fandom_properties (object_id, property_name, description, target_entity, target_class, target_superclass)
                VALUES (%s, %s, %s, %s, %s, %s);""",
                (item["_id"], prop["property_name"], prop["description"], prop["target_entity"], prop["target_class"], prop["target_superclass"])
            )
            conn.commit()
        except:
            conn.rollback()

  0%|          | 0/1024 [00:00<?, ?it/s]

In [278]:
len(data)

994