In [1]:
import sys
sys.path.insert(0, "..")

In [2]:
# Import necessary libraries
from auto_chatgpt.autochatgpt.chatgptbot import ChatGPTBot
from rel import REL
import os
import time
import json
import random
import pandas as pd

In [3]:
# ChatGPT account credentials.
# Prefer the CHATGPT_EMAIL / CHATGPT_PASSWORD environment variables so that real
# credentials are never saved inside the notebook; the ".." placeholders are only
# a fallback that can still be edited by hand.
EMAIL_ADDRESS = os.environ.get("CHATGPT_EMAIL", "..")
PASSWORD = os.environ.get("CHATGPT_PASSWORD", "..")
# Change FILE_NAME for every experiment. It names the file that stores ChatGPT's
# output, so that no result is lost if the run is unexpectedly interrupted.
FILE_NAME = "5-Iter-ZeroShot-NoICL"
# Selects the guideline CSV (../guideline/<RELATION_LABEL>.csv) and is passed to REL.
RELATION_LABEL = "docred"
# Number of times each article is sent to ChatGPT before the answers are reconciled.
CONSISTENT_COUNT = 5

In [4]:
# Start the ChatGPT bot with the configured account.
# headless=False keeps the browser window visible so the prompts sent to ChatGPT
# and its replies can be monitored while the experiment runs.
# NOTE(review): wait=60 is presumably a timeout in seconds - confirm against ChatGPTBot.
new_chat = ChatGPTBot(EMAIL_ADDRESS, PASSWORD, headless=False, wait=60)

## Note:
1. Do not remove the `headless` argument. Keeping the browser visible lets us monitor the input to and output from ChatGPT, and makes it easier to identify any issue that occurs during scraping.
2. Once the above cell has finished running, a new browser window opens and looks like this:
   1. <img src="../images/chatgpt-popup.png" alt="Alt text that describes the graphic" title="Title text" />
3. <code style="background:yellow;color:black">Close the popup before running the next Jupyter cell.</code> 
4. <code style="background:yellow;color:black">Select New Chat before running every experiment.</code>
5. Then you can start running the cells below.


In [5]:
# Helper object used by the annotation loop: rel.response_format() is applied to
# each ChatGPT reply and rel.consistent_checker() to the list of repeated replies.
rel = REL(consistency_count=CONSISTENT_COUNT, relation_label=RELATION_LABEL)

## Start Experiment

In [6]:
# Read the dataset and load it into a DataFrame (its row count is the number of
# news articles to annotate).
dt = "../dataset/docred-dataset.json"
# JSON text is UTF-8 by specification; state the encoding explicitly so the read
# does not depend on the operating system's locale default.
with open(dt, encoding="utf-8") as f:
    data = json.load(f)

dataset = pd.DataFrame(data)

In [7]:
# Checkpoint file that stores ChatGPT's annotations.
# If it already exists (i.e. this is not the first run), the articles saved in it
# are counted so that annotation resumes after them instead of starting over.
file = f"rel_annotation_by_chatgpt/{FILE_NAME}.json"
counter_time = 1  # not used by the cells visible here; kept in case other cells read it

# Create the output folder up front. Without it, the first save in the annotation
# loop fails with FileNotFoundError only after a full round of prompts has been spent.
os.makedirs(os.path.dirname(file), exist_ok=True)

if os.path.exists(file):
    with open(file, "r") as f:
        existing_data = json.load(f)
else:
    existing_data = []

total = dataset.shape[0]
len_existing_data = len(existing_data)
print("Current Milestone Completion: {}".format(len_existing_data))
print("You will annotate: {} News Articles".format(total))

Current Milestone Completion: 0
You will annotate: 200 News Articles


In [None]:
# Relation labels ChatGPT is allowed to use, taken from the guideline CSV.
guides = pd.read_csv(f"../guideline/{RELATION_LABEL}.csv")
docred_labels = list(guides["Relation Label"])

# Response-format instruction embedded in every prompt.
output = 'Answer me in JSON format. Follow this format: { "annotations": [ { "entity_pair": { "head": "Entity", "tail": "Entity" }, "relation": "Relation Label" }, { "entity_pair": { "head": "Entity", "tail": "Entity" }, "relation": "Relation Label" } ] }. Only give me the response asked, without any explaination'

# Skip the articles that are already stored in the checkpoint file.
for idx, row in dataset.iloc[len_existing_data:total].iterrows():
    print("Annotating News Article.... {}".format(idx+1))
    article_tr = " ".join([" ".join(sentence) for sentence in row["sents"]])

    # The entity pairs and the prompt are the same for every consistency run, so
    # they are built once per article instead of once per run. This also keeps
    # `head` / `tail` defined when CONSISTENT_COUNT is 0.
    # NOTE(review): the per-entity lists in row['vertexSet'] are flattened before
    # being indexed with labels['head'] / labels['tail']. If those indices refer to
    # the outer (per-entity) positions, the wrong name is selected whenever an
    # entity has more than one entry - confirm against the dataset format.
    vertexSet = [item for sublist in row['vertexSet'] for item in sublist]
    labels = row['labels']
    head = [vertexSet[h]['name'] for h in labels['head']]
    tail = [vertexSet[t]['name'] for t in labels['tail']]
    entities = list(zip(head, tail))
    entity_pairs = [f"{number}. {pair}" for number, pair in enumerate(entities, start=1)]
    prompt_docred = "You will be provided with List of Entities and News Article as Input. Only annotate the relation based on this labels: {} and guideline provided earlier. {}. Here is the News Article: {} and Entity Pairs: {}".format(docred_labels,output,article_tr,entity_pairs)

    # Send the same prompt CONSISTENT_COUNT times and collect every parsed reply.
    consistency_list = []
    for _ in range(CONSISTENT_COUNT):
        new_chat.send_prompt(prompt=prompt_docred)
        # Randomised pauses are kept exactly as before; presumably they give the
        # web UI time to answer and pace the requests - confirm before shortening.
        time.sleep(random.randint(10,15))
        res_docred = new_chat.get_gpt_response()
        time.sleep(random.randint(5,8))
        response_docred = res_docred[-1]  # last entry of the returned responses
        time.sleep(random.randint(5,8))
        # The relation is not checked against the allowed relation labels here.
        formated_response_doc = rel.response_format(response_docred)
        time.sleep(random.randint(5,8))
        consistency_list.append(formated_response_doc)

    # Reconcile the repeated replies. This result (not merely the last reply) is
    # what gets stored below; previously it was computed and then discarded.
    new_response_doc = rel.consistent_checker(consistency_list)

    # Gold annotations in the same {"entity_pair", "relation"} layout as the prompt format.
    output_gold = [
        {"entity_pair": {"head": gold_head, "tail": gold_tail}, "relation": gold_relation}
        for gold_head, gold_tail, gold_relation in zip(head, tail, labels['relation_text'])
    ]

    if not new_response_doc:
        print(f"The DocRED is empty for article id {idx+1}")
    existing_data.append({
        "idx": idx,
        "text": str(article_tr),
        "chatgpt": new_response_doc,
        "gold": output_gold
    })

    # Save after every article. Writing to a temporary file and renaming it over
    # the checkpoint means an interruption mid-write cannot corrupt earlier results.
    tmp_file = f"{file}.tmp"
    with open(tmp_file, "w") as f:
        json.dump(existing_data, f)
    os.replace(tmp_file, file)