In [1]:
import os
import time
import json
from typing import List

# from langchain.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [1]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable so the secret
# never has to be typed into (and saved with) the notebook. The original
# placeholder string is kept as the fallback when the variable is not set.
key_api = os.environ.get("OPENAI_API_KEY", "<INSERT OPENAI API KEY>")

In [3]:
# Pydantic schema for the JSON answer expected from the model: an object with a
# single string field, `description`.
# The class name keeps its original spelling ("Desciption") because the parser
# cell refers to it by this name.
# NOTE(review): documented with comments rather than a class docstring, since
# pydantic copies a model's docstring into its generated schema — confirm before
# adding one.
class Desciption(BaseModel):
    description: str = Field(description="Description of Entity Mention from News Article")

In [4]:
def get_unique_entities(data):
    """Return the first entry seen for each distinct ``"entity_mention"`` value.

    Args:
        data: Iterable of dict-like entries; each is queried with
            ``.get("entity_mention")``.

    Returns:
        A list holding the original entry objects, in order of first
        appearance. Entries without the key are grouped under ``None``.
    """
    first_by_mention = {}
    for record in data:
        # setdefault stores a record only when its mention has not been seen yet;
        # dict insertion order gives first-appearance ordering.
        first_by_mention.setdefault(record.get("entity_mention"), record)
    return list(first_by_mention.values())

In [5]:
# Define the prompt template.
# `{entity_mention}`, `{entity_type}` and `{news_article}` are filled per request.
# `{format_instructions}` is filled once, below, with the parser's own
# instructions; without that placeholder in the text the partial variable would
# be supplied but never reach the model.
# Doubled braces (`{{ }}`) are literal braces in the rendered prompt.
prompt_template = """
You are a highly knowledgeable and detail-oriented assistant. Your task is to generate a concise description of the entity mentioned in the news article.

Here are the details:
- Entity Mention: {entity_mention}
- Entity Type: {entity_type}
- News Article: {news_article}

Generate a description that captures the significance and context of the entity mentioned in the news article. Provide the the result as JSON Format. 
**Strictly provide Description in 1 Sentence**

{format_instructions}

Example: {{"description": "the description of entity mention"}}
"""

# Parses the model reply as JSON; the pydantic class supplies the schema used
# to build the format instructions.
parser = JsonOutputParser(pydantic_object=Desciption)

# Create the prompt template instance
template = PromptTemplate(
    input_variables=["entity_mention", "entity_type", "news_article"],
    template=prompt_template,
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [6]:
# Chat model client used for every description request; authenticates with `key_api`.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=key_api)

In [7]:
# Pipeline: render the prompt, send it to the chat model, parse the reply with the JSON parser.
chain = template | llm | parser

In [10]:
# NOTE(review): this cell re-defines get_unique_entities with the same logic as
# the earlier definition in the notebook; the later definition is the one in
# effect once both cells have run.
def get_unique_entities(data):
    """Drop entries whose ``"entity_mention"`` value was already encountered.

    Args:
        data: Iterable of dict-like entries exposing ``.get``.

    Returns:
        List of the surviving entries (the first one per mention), in their
        original relative order.
    """
    seen_mentions = set()
    deduplicated = []
    for record in data:
        mention = record.get("entity_mention")
        if mention in seen_mentions:
            continue  # a record with this mention was already kept
        seen_mentions.add(mention)
        deduplicated.append(record)
    return deduplicated

# MEN-Dataset

In [2]:
# Load the MEN articles. The encoding is stated explicitly so the file is
# decoded as UTF-8 regardless of the platform's default locale encoding.
with open("./MEN-Dataset/final.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Sanity check: number of records loaded into `data`.
print(len(data))

400


In [11]:
%%time
for item in data:
    print(item["id"])
    all_ents = []
    unique_ent_with_side_information = []
    for ent in item['entities']:
        ent_mention = item['text'][ent['position']['start_offset']:ent['position']['end_offset']].strip()
        all_ents.append(
            {
                "entity_mention": ent_mention,
                "entity_type": ent['label']
            }
        )   
    unique_ent = get_unique_entities(all_ents)
    for u_ent in unique_ent:
        response_chain = chain.invoke({"entity_mention": u_ent["entity_mention"], "entity_type": u_ent['entity_type'], "news_article": item['text']})
        try:
            description = response_chain['description']
        except:
            print("Error:", item["id"])
            description = "Not Found"
        unique_ent_with_side_information.append(
            {
                "entity_mention": u_ent["entity_mention"],
                "entity_type": u_ent['entity_type'],
                "description": description
            }
        )
    item["unique_entity_side_information"] = unique_ent_with_side_information

article_1
article_2
article_3
article_4
article_5
article_6
article_7
article_8
article_9
article_10
article_11
article_12
article_13
article_14
article_15
article_16
article_17
article_18
article_19
article_20
article_21
article_22
article_23
article_24
article_25
article_26
article_27
article_28
article_29
article_30
article_31
article_32
article_33
article_34
article_35
article_36
article_37
article_38
article_39
article_40
article_41
article_42
article_43
article_44
article_45
article_46
article_47
article_48
article_49
article_50
article_51
article_52
article_53
article_54
article_55
article_56
article_57
article_58
article_59
article_60
article_61
article_62
article_63
article_64
article_65
article_66
article_67
article_68
article_69
article_70
article_71
article_72
article_73
article_74
article_75
article_76
article_77
article_78
article_79
article_80
article_81
article_82
article_83
article_84
article_85
article_86
article_87
article_88
article_89
article_90
article_91
article_

In [13]:
# Persist the enriched MEN articles. The results directory is created first so
# that a missing folder cannot make the dump fail after the descriptions have
# already been generated.
os.makedirs("./MEN-Dataset_Results", exist_ok=True)
with open("./MEN-Dataset_Results/only_descriptions.json", "w") as f:
    json.dump(data,f)

# DocRED and RE-DocRED

## RE-DocRED

In [13]:
# Load the RE-DocRED training split. The encoding is stated explicitly so the
# file is decoded as UTF-8 regardless of the platform's default locale encoding.
with open("./RE-DocRED/train.json", "r", encoding="utf-8") as f:
    redocred = json.load(f)

In [14]:
# Sanity check: number of records loaded into `redocred`.
print(len(redocred))

3051


In [18]:
# Location of the RE-DocRED results. When the file is absent the result list
# starts empty; otherwise its stored contents become the starting list.
output_file_redocred = "./RE-DocRED_Results/only_descriptions.json"
if not os.path.exists(output_file_redocred):
    results_redocred = []
else:
    with open(output_file_redocred, 'r') as f:
        results_redocred = json.load(f)

In [19]:
# Number of result records currently held in `results_redocred`.
print(len(results_redocred))

3051


In [17]:
%%time
for_sleeping_timer = 0
for item in redocred:
    print(item["id"])
    all_ents = []
    for ent in item['entities']:
        all_ents.append(
            {
                "entity_mention": ent['mention'],
                "entity_type": ent['label']
            }
        )
    unique_ent = get_unique_entities(all_ents)
    news_article = " ".join([token for item in item['tokenized_sent'] for token in item]) 
    requests = []

    for u_ent in unique_ent:
        requests.append({"entity_mention": u_ent["entity_mention"], "entity_type": u_ent['entity_type'], "news_article": news_article})

    try:
        results = chain.batch(requests)  # Call LangChain's batch method
        # Save results back to entities
        for ent, result in zip(unique_ent, results):
            ent['description'] = result['description']  # Assuming result format
    
    except Exception as e:
        print("Error processing item:", e)

    item["unique_entity_side_information"] = unique_ent
    results_redocred.append(item)
    # Write results to output file after every batch
    with open(output_file_redocred, 'w') as f:
        json.dump(results_redocred, f, indent=4)

2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549


## DocRED

In [8]:
# Load the DocRED training split. The encoding is stated explicitly so the file
# is decoded as UTF-8 regardless of the platform's default locale encoding.
with open("./DocRED/train.json", "r", encoding="utf-8") as f:
    docred = json.load(f)

In [9]:
# Sanity check: number of records loaded into `docred`.
print(len(docred))

105925


In [20]:
# Location of the DocRED results. When the file is absent the result list
# starts empty; otherwise its stored contents become the starting list.
output_file_docred = "./DocRED_Results/only_descriptions.json"
if not os.path.exists(output_file_docred):
    results_docred = []
else:
    with open(output_file_docred, 'r') as f:
        results_docred = json.load(f)

In [21]:
# Number of result records currently held in `results_docred`.
print(len(results_docred))

21591


In [12]:
%%time
for_sleeping_timer = 0
index_with_issue = []
for i in range(len(docred)):
    item = docred[i]
    print(item["id"])
    all_ents = []
    for ent in item['entities']:
        all_ents.append(
            {
                "entity_mention": ent['mention'],
                "entity_type": ent['label']
            }
        )   
    unique_ent = get_unique_entities(all_ents)
    news_article = " ".join([token for item in item['tokenized_sent'] for token in item]) 
    requests = []

    for u_ent in unique_ent:
        requests.append({"entity_mention": u_ent["entity_mention"], "entity_type": u_ent['entity_type'], "news_article": news_article})

    try:
        results = chain.batch(requests)  # Call LangChain's batch method
        # Save results back to entities
        for ent, result in zip(unique_ent, results):
            ent['description'] = result['description']  # Assuming result format
    
    except Exception as e:
        print("Error processing item:", e)
                    
    item["unique_entity_side_information"] = unique_ent
    results_docred.append(item)
    # Write results to output file after every batch
    with open(output_file_docred, 'w') as f:
        json.dump(results_docred, f, indent=4)

13686
13687
13688
13689
13690
13691
13692
13693
13694
13695
13696
13697
13698
13699
13700
13701
13702
13703
13704
13705
13706
13707
13708
13709
13710
13711
13712
13713
13714
13715
13716
13717
13718
13719
13720
13721
13722
13723
13724
13725
13726
13727
13728
13729
13730
13731
13732
13733
13734
13735
13736
13737
13738
13739
13740
13741
13742
13743
13744
13745
13746
13747
13748
13749
13750
13751
13752
13753
13754
13755
13756
13757
13758
13759
13760
13761
13762
13763
13764
13765
13766
13767
13768
13769
13770
13771
13772
13773
13774
13775
13776
13777
13778
13779
13780
13781
13782
13783
13784
13785
13786
13787
13788
13789
13790
13791
13792
13793
13794
13795
13796
13797
13798
13799
13800
13801
13802
13803
13804
13805
13806
13807
13808
13809
13810
13811
13812
13813
13814
13815
13816
13817
13818
13819
13820
13821
13822
13823
13824
13825
13826
13827
13828
13829
13830
13831
13832
13833
13834
13835
13836
13837
13838
13839
13840
13841
13842
13843
13844
13845
13846
13847
13848
13849
13850
13851
1385

KeyboardInterrupt: 