In [None]:
import sys
# Put the parent directory ("..") first on the module search path.
# NOTE(review): presumably this lets the project imports in the next cell
# resolve when the notebook is run from its own folder — confirm.
sys.path.insert(0, "..")

In [1]:
# Import necessary libraries
from auto_chatgpt.autochatgpt.chatgptbot import ChatGPTBot
from ner import NER
import os
import time
import json
import random
import pandas as pd

In [2]:
# Credentials for the ChatGPT login. To keep secrets out of the notebook, export
# CHATGPT_EMAIL and CHATGPT_PASSWORD in the environment before starting Jupyter;
# the ".." placeholders are used only when those variables are not set.
EMAIL_ADDRESS = os.environ.get("CHATGPT_EMAIL", "..")
PASSWORD = os.environ.get("CHATGPT_PASSWORD", "..")
# Change FILE_NAME for every experiment: it names the file that stores the
# outcome of ChatGPT. The file is written as the run progresses, so no output
# is lost if there is an unexpected disruption.
FILE_NAME = "3Shot-Explain"
# Few-shot setting handed to the NER helper.
FEW_SHOT=3

In [3]:
# Start the ChatGPT bot session with the configured credentials.
# headless=False keeps the browser window visible so the exchange can be watched.
# NOTE(review): wait=60 is presumably a timeout in seconds — confirm against ChatGPTBot.
new_chat = ChatGPTBot(
    EMAIL_ADDRESS,
    PASSWORD,
    headless=False,
    wait=60,
)

## Note:
1. Do not remove the `headless` argument. Keeping the browser visible lets us monitor the input and output from ChatGPT, and makes it easier to identify any issue that happens during scraping.
2. Once the above cell finishes running, a new browser window opens and looks like this:
   1. <img src="../images/chatgpt-popup.png" alt="Alt text that describes the graphic" title="Title text" />
3. <code style="background:yellow;color:black">Close the popup before running the next Jupyter cell.</code> 
4. <code style="background:yellow;color:black">Select New Chat before running every experiment.</code>
5. Then you can start running the cells below.

In [4]:
# Build the NER helper with the configured few-shot setting. Later cells use it
# to fetch the few-shot samples, format ChatGPT responses and find entity offsets.
ner = NER(FEW_SHOT=FEW_SHOT)

## Few Shot Setting

In [6]:
# Fetch the few-shot samples from the NER helper. The next cell iterates over
# them and reads each item's "News Article" and "Extracted Named Entity" keys.
few_shot = ner.few_shot_with_explainations()

In [8]:
# Announce the upcoming examples, then send each few-shot sample as its own prompt.
new_chat.send_prompt(prompt="Now you will be provided with some sample of News Articles and the Entity Extracted from Them. This Entity are extracted based on the Annotation Guideline and Context of News Article")
sample_template = "Here is the news article {}: Here is the extracted entities{}"
for sample in few_shot:
    print("Writing Sample..")
    sample_prompt = sample_template.format(
        sample["News Article"],
        sample["Extracted Named Entity"],
    )
    new_chat.send_prompt(prompt=sample_prompt)
    # Random 15-20 second pause between consecutive samples.
    time.sleep(random.randint(15,20))
# Fixed 30 second pause once all samples have been sent.
time.sleep(30)

Writing Article
Writing Article
Writing Article


## Start Experiment

In [12]:
# Read the dataset.
# The file is opened explicitly as UTF-8 so that decoding the news text does
# not depend on the platform's default encoding.
dt = "../dataset/men-dataset.json"
with open(dt, encoding="utf-8") as f:
    data = json.load(f)

# Tabular view of the loaded records; later cells read its "article" and
# "entities" columns.
dataset = pd.DataFrame(data)

In [13]:
# Define the file to save ChatGPT output.
# If the file already exists (i.e. this is not the first run), its saved entries
# are loaded so the run resumes after the articles that are already annotated.
file = f"ner_annotation_by_chatgpt/{FILE_NAME}.json"
# Create the output folder up front; without it the first save in the
# annotation loop fails with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(file), exist_ok=True)
# Position within the pacing cycle used by the annotation loop to pick idle times.
counter_time = 1

if os.path.exists(file):
    with open(file, "r", encoding="utf-8") as f:
        existing_data = json.load(f)
else:
    existing_data = []

# The number of saved entries is used as the offset into the dataset when resuming.
total = dataset.shape[0]
len_existing_data = len(existing_data)
print("Current Milestone Completion: {}".format(len_existing_data))
print("You will annotate: {} News Articles".format(total))

Current Milestone Completion: 0
You will annotate: 200 News Articles


In [15]:
# Annotate every article that is not yet in existing_data, saving after each one.
for idx, row in dataset.iloc[len_existing_data:total].iterrows():
    print("Annotating News Article.... {}".format(idx+1))
    news_article = row["article"]
    # Prompt to extract entities based on the input. The input is the news article.
    prompt = "Act as Data Annotator, identify and extract all the entity PERSON, LOCATION, ORGANIZATION, EVENT, WORK_OF_ART, ROLE, TITLE, NORP, FACILITY, PRODUCT, LAW, LANGUAGE from the input news article. Input: {}. Provide the output as JSON.".format(news_article)
    new_chat.send_prompt(prompt=prompt)

    # This counter sets an idle time to prevent spamming the ChatGPT website:
    # a long pause on the 25th prompt of a cycle (then the cycle restarts),
    # a medium pause on the 15th, and a short pause otherwise.
    if counter_time==25:
        time.sleep(random.randint(45,60))
        counter_time=1
    elif counter_time==15:
        time.sleep(random.randint(20,30))
        counter_time=counter_time+1
    else:
        time.sleep(random.randint(8,10))
        counter_time=counter_time+1

    # Get ChatGPT output for the input given; the last item is taken as the
    # reply to the prompt that was just sent.
    res = new_chat.get_gpt_response()
    time.sleep(random.randint(5,7))
    response = res[-1]
    time.sleep(random.randint(2,4))
    # Format ChatGPT outcome and extract only required output.
    formated_response = ner.formatting_chatgpt_response(response)
    # Get the offset from each entity.
    validated_entity_set = ner.find_offset(news_article,formated_response) #Only Have Exact Match
    existing_data.append({
        "idx": idx,
        "text": news_article,
        "chatgpt_ent": validated_entity_set,
        "gold_ent": row["entities"]
    })
    # Save progress atomically: write the full checkpoint to a temporary file
    # and then swap it into place. Opening the real file with "w" would
    # truncate it first, so an interruption mid-write would lose every
    # previously saved annotation.
    tmp_file = file + ".tmp"
    with open(tmp_file, "w") as f:
        json.dump(existing_data, f)
    os.replace(tmp_file, file)