In [1]:
import re
import os
import json
import time
import asyncio
from typing import List

# from langchain.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never has to be typed into (and committed with) the notebook.
# The original placeholder is kept as the fallback value for backward
# compatibility; requests will fail until a real key is provided.
key_api = os.environ.get("OPENAI_API_KEY", "<INSERT OPENAI API KEY>")

In [3]:
# Pydantic schema for the LLM output: one object holding a single hypernym
# string. It is handed to JsonOutputParser as `pydantic_object`.
class Hypernym(BaseModel):
    # The only field; its `description` text is part of the field definition.
    hypernym: str = Field(description="Hypernym of Entity Mention based on Entity Mention Description")

In [4]:
# Prompt text sent to the model for every entity.
# Single-brace names are placeholders filled in by PromptTemplate:
#   entity_mention, entity_type, entity_mention_description -> per-entity inputs
#   format_instructions -> supplied once as a partial variable by the parser
# Double braces render as literal braces in the final prompt.
#
# Fixes relative to the previous wording:
#   * the description line was labelled "Entity Mention" a second time;
#   * the parser's format instructions were passed to PromptTemplate but never
#     referenced here, so the model never saw them;
#   * the JSON example was not valid JSON (unquoted key/value);
#   * typos ("the the", "EXPLAINATION").
prompt_template = """
You are a highly knowledgeable and detail-oriented assistant. Your task is to generate a Hypernym for Entity Mention based on Entity Mention Description and Wikidata (if available).
Make sure the Hypernym is relevant to the Entity Type.

Here are the details:
- Entity Mention: {entity_mention}
- Entity Type: {entity_type}
- Entity Mention Description: {entity_mention_description}

Generate a Hypernym that captures the significance and context of the entity mention description and is relevant to the Entity Type. Provide the result in JSON format.

{format_instructions}

Example: {{"hypernym": "the hypernym of the entity mention"}}. Only return the Hypernym, NO EXPLANATION NEEDED.
"""

# Output parser built around the Hypernym schema; downstream code indexes its
# result with ['hypernym'].
parser = JsonOutputParser(pydantic_object=Hypernym)

# Wrap the raw prompt text in a PromptTemplate. The three per-entity inputs
# are declared explicitly; the parser's format instructions are bound once as
# a partial variable.
template = PromptTemplate(
    template=prompt_template,
    input_variables=["entity_mention", "entity_type", "entity_mention_description"],
    partial_variables=dict(format_instructions=parser.get_format_instructions()),
)

In [5]:
# Chat model client used for every hypernym request in this notebook;
# it is authenticated with `key_api`. No sampling parameters are passed here.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=key_api)

In [7]:
# Compose prompt -> model -> parser into a single runnable. The cells below
# call it with .invoke() / .batch() and read the 'hypernym' key of each result.
chain = template | llm | parser

# MEN-Dataset

In [11]:
# Load the MEN annotations (final version, descriptions only).
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("./MEN-Dataset_Results/annotated_final.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [13]:
# Sanity check: number of top-level records loaded from the MEN file.
print(len(data))

400


In [20]:
%%time
# Annotate every entity of every MEN item with an LLM-generated hypernym.
# Each entity dict is updated in place with a new 'hypernym_llm' key; when the
# request or the parsing fails, the entity gets the sentinel value "NA".
for item in data:
    print("Running", item['id'])
    for ent in item['unique_entity_side_information']:
        try:
            result = chain.invoke({
                "entity_mention": ent['entity_mention'],
                "entity_type": ent['entity_type'],
                "entity_mention_description": ent['description'],
            })['hypernym']
        except Exception as exc:
            # Best effort per entity. Catching Exception instead of a bare
            # `except:` keeps KeyboardInterrupt / SystemExit working, so the
            # run can still be interrupted, and the cause is now reported.
            print("Error", ent, repr(exc))
            result = "NA"
        ent['hypernym_llm'] = result

Running article_1
Running article_2
Running article_3
Running article_4
Running article_5
Running article_6
Running article_7
Running article_8
Running article_9
Running article_10
Running article_11
Running article_12
Running article_13
Running article_14
Running article_15
Running article_16
Running article_17
Running article_18
Running article_19
Running article_20
Running article_21
Running article_22
Running article_23
Running article_24
Running article_25
Running article_26
Running article_27
Running article_28
Running article_29
Running article_30
Running article_31
Running article_32
Running article_33
Running article_34
Running article_35
Running article_36
Running article_37
Running article_38
Running article_39
Running article_40
Running article_41
Running article_42
Running article_43
Running article_44
Running article_45
Running article_46
Running article_47
Running article_48
Running article_49
Running article_50
Running article_51
Running article_52
Running article_53
Ru

In [39]:
# Persist the MEN records (now carrying 'hypernym_llm') as one JSON document.
with open("./MEN-Dataset_Results/only_descriptions_and_hypernym.json", "w") as out_file:
    json.dump(data, fp=out_file)

# DocRED and RE-DocRED

## RE-DocRED

In [10]:
# Load the RE-DocRED records (descriptions only).
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("./RE-DocRED_Results/only_descriptions.json", "r", encoding="utf-8") as f:
    redocred = json.load(f)

In [11]:
# Sanity check: number of top-level records loaded from the RE-DocRED file.
print(len(redocred))

499


In [12]:
# Checkpoint file for the RE-DocRED run.
output_file_redocred = "./RE-DocRED_Results/only_descriptions_and_hypernyms.json"
# Resume support: reuse the items already saved by a previous run, otherwise
# start from an empty list. The file is decoded explicitly as UTF-8 so loading
# does not depend on the platform's default text encoding.
if os.path.exists(output_file_redocred):
    with open(output_file_redocred, 'r', encoding="utf-8") as f:
        results_redocred = json.load(f)
else:
    results_redocred = []

In [13]:
# Number of items already present in the checkpoint; the processing loop
# starts at this index.
print(len(results_redocred))

0


In [14]:
%%time
# NOTE(review): never read anywhere in the visible code — confirm before removing.
for_sleeping_timer = 0

# Main loop: process the items that are not in the checkpoint yet, issuing one
# LangChain batch call per item. Processing starts at len(results_redocred),
# so a re-run continues where the previous run stopped.
for i in range(len(results_redocred), len(redocred)):
    item = redocred[i]
    print("Running", item['id'])

    # One request per entity of this item, in entity order.
    requests = [
        {
            "entity_mention": ent['entity_mention'],
            "entity_type": ent['entity_type'],
            "entity_mention_description": ent['description'],
        }
        for ent in item['unique_entity_side_information']
    ]

    try:
        results = chain.batch(requests)  # Call LangChain's batch method

        # Results come back in request order; write them onto the entities.
        for ent, result in zip(item['unique_entity_side_information'], results):
            ent['hypernym_llm'] = result['hypernym']  # Assuming result format

    except Exception as e:
        # Stop at the first failing item. It is NOT appended to the checkpoint,
        # so re-running this cell retries it. (The former "mark as NA" code
        # sat after this `break` and could never execute; it was removed.)
        print("Error processing item:", e)
        break

    # Append processed item to results list
    results_redocred.append(item)

    # Checkpoint after every item. Write to a temporary file first and then
    # swap it in, so an interruption during the write cannot leave a
    # truncated checkpoint behind and lose all previous progress.
    tmp_output_file_redocred = output_file_redocred + ".tmp"
    with open(tmp_output_file_redocred, 'w', encoding="utf-8") as f:
        json.dump(results_redocred, f, indent=4)
    os.replace(tmp_output_file_redocred, output_file_redocred)

Running 1
Running 2
Running 3
Running 4
Running 5
Running 6
Running 7
Running 8
Running 9
Running 10
Running 11
Running 12
Running 13
Running 14
Running 15
Running 16
Running 17
Running 18
Running 19
Running 20
Running 21
Running 22
Running 23
Running 24
Running 25
Running 26
Running 27
Running 28
Running 29
Running 30
Running 31
Running 32
Running 33
Running 34
Running 35
Running 36
Running 37
Running 38
Running 39
Running 40
Running 41
Running 42
Running 43
Running 44
Running 45
Running 46
Running 47
Running 48
Running 49
Running 50
Running 51
Running 52
Running 53
Running 54
Running 55
Running 56
Running 57
Running 58
Running 59
Running 60
Running 61
Running 62
Running 63
Running 64
Running 65
Running 66
Running 67
Running 68
Running 69
Running 70
Running 71
Running 72
Running 73
Running 74
Running 75
Running 76
Running 77
Running 78
Running 79
Running 80
Running 81
Running 82
Running 83
Running 84
Running 85
Running 86
Running 87
Running 88
Running 89
Running 90
Running 91
Running 

## DocRED

In [15]:
# Load the DocRED records (descriptions only).
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open("./DocRED_Results/only_descriptions.json", "r", encoding="utf-8") as f:
    docred = json.load(f)

In [16]:
# Sanity check: number of top-level records loaded from the DocRED file.
print(len(docred))

999


In [17]:
# Checkpoint file for the DocRED run.
output_file_docred = "./DocRED_Results/only_descriptions_and_hypernyms.json"
# Resume support: reuse the items already saved by a previous run, otherwise
# start from an empty list. The file is decoded explicitly as UTF-8 so loading
# does not depend on the platform's default text encoding.
if os.path.exists(output_file_docred):
    with open(output_file_docred, 'r', encoding="utf-8") as f:
        results_docred = json.load(f)
else:
    results_docred = []

In [18]:
# Number of items already present in the checkpoint; the processing loop
# starts at this index.
print(len(results_docred))

999


In [14]:
%%time
# NOTE(review): never read anywhere in the visible code — confirm before removing.
for_sleeping_timer = 0

# Main loop: process the items that are not in the checkpoint yet, issuing one
# LangChain batch call per item. Processing starts at len(results_docred),
# so a re-run continues where the previous run stopped.
for i in range(len(results_docred), len(docred)):
    item = docred[i]
    print("Running", item['id'])

    # One request per entity of this item, in entity order.
    requests = [
        {
            "entity_mention": ent['entity_mention'],
            "entity_type": ent['entity_type'],
            "entity_mention_description": ent['description'],
        }
        for ent in item['unique_entity_side_information']
    ]

    try:
        results = chain.batch(requests)  # Call LangChain's batch method

        # Results come back in request order; write them onto the entities.
        for ent, result in zip(item['unique_entity_side_information'], results):
            ent['hypernym_llm'] = result['hypernym']  # Assuming result format

    except Exception as e:
        # Stop at the first failing item. It is NOT appended to the checkpoint,
        # so re-running this cell retries it. (The former "mark as NA" code
        # sat after this `break` and could never execute; it was removed.)
        print("Error processing item:", e)
        break

    # Append processed item to results list
    results_docred.append(item)

    # Checkpoint after every item. Write to a temporary file first and then
    # swap it in, so an interruption during the write cannot leave a
    # truncated checkpoint behind and lose all previous progress.
    tmp_output_file_docred = output_file_docred + ".tmp"
    with open(tmp_output_file_docred, 'w', encoding="utf-8") as f:
        json.dump(results_docred, f, indent=4)
    os.replace(tmp_output_file_docred, output_file_docred)

Running 1
Running 2
Running 3
Running 4
Running 5
Running 6
Running 7
Running 8
Running 9
Running 10
Running 11
Running 12
Running 13
Running 14
Running 15
Running 16
Running 17
Running 18
Running 19
Running 20
Running 21
Running 22
Running 23
Running 24
Running 25
Running 26
Running 27
Running 28
Running 29
Running 30
Running 31
Running 32
Running 33
Running 34
Running 35
Running 36
Running 37
Running 38
Running 39
Running 40
Running 41
Running 42
Running 43
Running 44
Running 45
Running 46
Running 47
Running 48
Running 49
Running 50
Running 51
Running 52
Running 53
Running 54
Running 55
Running 56
Running 57
Running 58
Running 59
Running 60
Running 61
Running 62
Running 63
Running 64
Running 65
Running 66
Running 67
Running 68
Running 69
Running 70
Running 71
Running 72
Running 73
Running 74
Running 75
Running 76
Running 77
Running 78
Running 79
Running 80
Running 81
Running 82
Running 83
Running 84
Running 85
Running 86
Running 87
Running 88
Running 89
Running 90
Running 91
Running 