In [1]:
import os
import json
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env.

True

In [2]:
# establish a connection to the MongoDB database
from pymongo import MongoClient

# connect to your Atlas cluster
client = MongoClient(os.environ["MONGODB_URI"])

In [126]:
# establish a connection to the PostgreSQL database
import psycopg2 as pg

conn = pg.connect(
    dbname=os.environ["POSTGRES_DB"],
    user=os.environ["POSTGRES_USER"],
    password=os.environ["POSTGRES_PASSWORD"],
    host=os.environ["POSTGRES_HOST"]
)
cursor = conn.cursor()

# LangChain: Models, Prompts and Output Parsers


## Outline

 * Direct API calls to OpenAI
 * API calls through LangChain:
   * Prompts
   * Models
   * Output parsers

In [129]:
import openai

openai.api_key = os.environ['OPENAI_APIKEY']
llm_model = "gpt-4o-mini"
# llm_model = "chatgpt-4o-latest"


In [6]:
from langchain.chat_models import ChatOpenAI

In [135]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0, model=llm_model, openai_api_key=openai.api_key)


In [103]:
template_string = """You are a web ontology creator. You are given part of an html web page from the fandom wiki web server, and your role is to extract entity information from this source. \
Page may containt information about a specific character, location, event, lore, in-game item, etc. One page corresponds to one entity only.\
You need to extract the information that will be used to populate a web ontology.\
- Name of the entity\
- Class of the entity\
- Subclass of the entity\
- Description of the entity

- Properties of the entity\

For each property, you need to extract the following information:\
- Name of the property\
- Value of the property\
- url of the link attached to the property (if exists in the html)\

Return the extracted information in the following format:\


Web page url: ```{page_url}```\
Game name: ```{game_name}```\
Html web page source: ```{content}```
"""

In [104]:
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(template_string)

In [105]:
prompt_template.messages[0].prompt

PromptTemplate(input_variables=['content', 'game_name', 'page_url'], input_types={}, partial_variables={}, template='You are a web ontology creator. You are given part of an html web page from the fandom wiki web server, and your role is to extract entity information from this source. Page may containt information about a specific character, location, event, lore, in-game item, etc. One page corresponds to one entity only.You need to extract the information that will be used to populate a web ontology.- Name of the entity- Class of the entity- Subclass of the entity- Description of the entity\n\n- Properties of the entity\nFor each property, you need to extract the following information:- Name of the property- Value of the property- url of the link attached to the property (if exists in the html)\nReturn the extracted information in the following format:\n\nWeb page url: ```{page_url}```Game name: ```{game_name}```Html web page source: ```{content}```\n')

In [106]:
prompt_template.messages[0].prompt.input_variables

['content', 'game_name', 'page_url']

In [107]:
item = {'game_name': 'bravelydefault',
 'game_url': 'https://bravelydefault.fandom.com/',
 'page_name': 'Adam_Holograd',
 'page_url': 'https://bravelydefault.fandom.com/wiki/Adam_Holograd',
 'content': '<aside class="portable-infobox pi-background pi-border-color pi-theme-series pi-layout-default" role="region">\n<h2>Adam Holograd</h2>\n<div>\n<div><span><span>アダマス・ホログラード</span> (<span>Adamasu・Horogurādo</span><span><a>?</a></span>, lit. <span>Adamas Holograd</span>)</span></div>\n</div>\n\n<section>\n<h2>Biographical information</h2>\n<div>\n<h3>Home</h3>\n<div>Holograd</div>\n</div>\n<div>\n<h3>Affiliation</h3>\n<div>Holograd Empire</div>\n</div>\n<div>\n<h3>Occupation</h3>\n<div>Lord Commander of the Holograd Empire</div>\n</div>\n</section>\n<section>\n<h2>Physical description</h2>\n<div>\n<h3>Race</h3>\n<div>Human</div>\n</div>\n<div>\n<h3>Gender</h3>\n<div>Male</div>\n</div>\n<div>\n<h3>Hair color</h3>\n<div>Silver</div>\n</div>\n</section>\n<section>\n<h2>Behind the scenes information</h2>\n<div>\n<h3>Designer</h3>\n<div>Naoki Ikushima</div>\n</div>\n<div>\n<h3>Japanese voice actor</h3>\n<div>Rikiya Koyama</div>\n</div>\n<div>\n<h3>English voice actor</h3>\n<div>Gyuri Sarossy</div>\n</div>\n</section>\n</aside>',
 '_id': '67139f9e8f64cb721b2f3eec'}

In [108]:
fandom_input = prompt_template.format_messages(**item)

In [109]:
# Call the LLM to translate to the style of the customer message
entity_info = chat(fandom_input)

In [111]:
print(entity_info.content)

Web page url: ```https://bravelydefault.fandom.com/wiki/Adam_Holograd```  
Game name: ```bravelydefault```  
Html web page source: ```<aside class="portable-infobox pi-background pi-border-color pi-theme-series pi-layout-default" role="region">
<h2>Adam Holograd</h2>
<div>
<div><span><span>アダマス・ホログラード</span> (<span>Adamasu・Horogurādo</span><span><a>?</a></span>, lit. <span>Adamas Holograd</span>)</span></div>
</div>

<section>
<h2>Biographical information</h2>
<div>
<h3>Home</h3>
<div>Holograd</div>
</div>
<div>
<h3>Affiliation</h3>
<div>Holograd Empire</div>
</div>
<div>
<h3>Occupation</h3>
<div>Lord Commander of the Holograd Empire</div>
</div>
</section>
<section>
<h2>Physical description</h2>
<div>
<h3>Race</h3>
<div>Human</div>
</div>
<div>
<h3>Gender</h3>
<div>Male</div>
</div>
<div>
<h3>Hair color</h3>
<div>Silver</div>
</div>
</section>
<section>
<h2>Behind the scenes information</h2>
<div>
<h3>Designer</h3>
<div>Naoki Ikushima</div>
</div>
<div>
<h3>Japanese voice actor</h3>
<

### Parse the LLM output string into a Python dictionary

In [112]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

You need to extract the following information:\
- Name of the entity\
- Class of the entity\
- Subclass of the entity\
- Description of the entity\
- url of the image of the entity (if exists in the html)

- Properties of the entity\

For each property, you need to extract the following information:\
- Name of the property\
- Value of the property\
- url of the link attached to the property (if exists in the html)\

In [113]:
name_schema = ResponseSchema(name="name",
                             description="Name of the entity")
class_schema = ResponseSchema(name="class",
                              description="Class of the entity,  for example: character, location, event, lore, in-game item, etc.")
subclass_schema = ResponseSchema(name="subclass",
                                 description="An appropriate Subclass of the entity. For example - if the entity is a character, the subclass could be a hero, villain, etc. If the entity is a location, the subclass could be a city, village, etc.")
description_schema = ResponseSchema(name="description",
                                     description="Description of the entity, no more than 255 characters")
image_url_schema = ResponseSchema(name="image_url",
                                    description="url of the image of the entity (if exists in the html)")
properties_schema = ResponseSchema(name="properties",
                                    description="List of dictionaries for each property found on the page. Each dictionary contains 4 key value pairs: name - the name of the property (capitalize and remove spaces and special characters); \
value - value of the property, can't be empty; \
description - description of the property, as applied to any entity the specific class or subclass mentoioned above, no more than 255 characters; \
parent - most appropriate parent property of the property (based on the knowledge of video games and web ontology engineering, can't be empty), for example defense is a parent of physical defense, magic defense, etc. and voice actor is a parent of japanese voice actor, english voice actor, etc.")

response_schemas = [
    name_schema, 
    class_schema, 
    subclass_schema, 
    description_schema, 
    # image_url_schema, 
    properties_schema
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [114]:
format_instructions

'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"name": string  // Name of the entity\n\t"class": string  // Class of the entity,  for example: character, location, event, lore, in-game item, etc.\n\t"subclass": string  // An appropriate Subclass of the entity. For example - if the entity is a character, the subclass could be a hero, villain, etc. If the entity is a location, the subclass could be a city, village, etc.\n\t"description": string  // Description of the entity\n\t"properties": string  // List of dictionaries for each property found on the page. Each dictionary contains 4 key value pairs: name - the name of the property (capitalize and remove spaces and special characters); value - value of the property; description - description of the property, as applied to any entity the specific class or subclass mentoioned above; parent - most appropriate parent property of the pr

In [115]:
template_string_with_format = """You are a web ontology creator. You are given part of an html web page from the fandom wiki web server, and your role is to extract entity information from this source. \
Page may containt information about a specific character, location, event, lore, in-game item, etc.  and its properties. One page corresponds to one entity only.\
You need to extract the information that will be used to populate a web ontology, and return it in a json format.\

Web page url: ```{page_url}```\
Game name: ```{game_name}```\
Html web page source: ```{content}```

{format_instructions}
"""

In [116]:
prompt = ChatPromptTemplate.from_template(template=template_string_with_format)

messages = prompt.format_messages(**item, 
                                format_instructions=format_instructions)

In [None]:
print(messages[0].content)

In [131]:
response = chat(messages)

In [132]:
print(response.content)

```json
{
	"name": "Adam Holograd",
	"class": "character",
	"subclass": "villain",
	"description": "Adam Holograd is the Lord Commander of the Holograd Empire in the game Bravely Default. He is a human male with silver hair and serves as a key antagonist in the story.",
	"properties": [
		{
			"name": "Home",
			"value": "Holograd",
			"description": "The place where the character resides or originates from.",
			"parent": "Location"
		},
		{
			"name": "Affiliation",
			"value": "Holograd Empire",
			"description": "The organization or group the character is associated with.",
			"parent": "Organization"
		},
		{
			"name": "Occupation",
			"value": "Lord Commander of the Holograd Empire",
			"description": "The character's role or job within the game world.",
			"parent": "Role"
		},
		{
			"name": "Race",
			"value": "Human",
			"description": "The species or race of the character.",
			"parent": "BiologicalClassification"
		},
		{
			"name": "Gender",
			"value": "Male",
			"descri

In [133]:
output_dict = output_parser.parse(response.content)

In [134]:
output_dict

{'name': 'Adam Holograd',
 'class': 'character',
 'subclass': 'villain',
 'description': 'Adam Holograd is the Lord Commander of the Holograd Empire in the game Bravely Default. He is a human male with silver hair and serves as a key antagonist in the story.',
 'properties': [{'name': 'Home',
   'value': 'Holograd',
   'description': 'The place where the character resides or originates from.',
   'parent': 'Location'},
  {'name': 'Affiliation',
   'value': 'Holograd Empire',
   'description': 'The organization or group the character is associated with.',
   'parent': 'Organization'},
  {'name': 'Occupation',
   'value': 'Lord Commander of the Holograd Empire',
   'description': "The character's role or job within the game world.",
   'parent': 'Role'},
  {'name': 'Race',
   'value': 'Human',
   'description': 'The species or race of the character.',
   'parent': 'BiologicalClassification'},
  {'name': 'Gender',
   'value': 'Male',
   'description': 'The gender of the character.',
   'p

In [123]:
# save the extracted information to the mongodb database
from bson.objectid import ObjectId


collection = client['Fandom']['Pages']

collection.update_one(
    {"_id": ObjectId(item["_id"])},
    {"$set": output_dict},
    upsert=True
)


UpdateResult({'n': 1, 'nModified': 1, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1729355984, 118), 'signature': {'hash': b'l\x90/\x96\x13\xa8\x91\xdfa\x89\xeeT\x12\xa40\\"U\x85\xe7', 'keyId': 7366676203282890758}}, 'operationTime': Timestamp(1729355984, 118), 'updatedExisting': True}, acknowledged=True)

create table fandom_entities (
    id serial primary key,
    object_id varchar(255) not null,
    name varchar(255) not null,
    class varchar(255) not null,
    subclass varchar(255),
    description text not null,
    created_at timestamp default current_timestamp
);

grant all privileges on table fandom_entities to ontology_rw;
grant select on table fandom_entities to ontology_ro;

create table fandom_properties (
    id serial primary key,
    object_id varchar(255) not null,
    name varchar(255) not null,
    value varchar(255) not null,
    description text not null,
    parent varchar(255) not null,
    created_at timestamp default current_timestamp
);

grant all privileges on table fandom_properties to ontology_rw;
grant select on table fandom_properties to ontology_ro;

In [127]:
# insert the extracted information to the postgresql database, table 'fandom_entities'
cursor.execute(
    """INSERT INTO fandom_entities (object_id, name, class, subclass, description)
    VALUES (%s, %s, %s, %s, %s);""",
    (item["_id"], output_dict["name"], output_dict["class"], output_dict["subclass"], output_dict["description"])
)
conn.commit()

In [128]:
# insert the extracted information to the postgresql database, table 'fandom_properties'
for prop in output_dict["properties"]:
    cursor.execute(
        """INSERT INTO fandom_properties (object_id, name, value, description, parent)
        VALUES (%s, %s, %s, %s, %s);""",
        (item["_id"], prop["name"], prop["value"], prop["description"], prop["parent"])
    )
    conn.commit()

In [None]:
conn.close()