In [1]:
#!pip install spacy
#!python -m spacy download en_core_web_sm
#!python -m spacy download en_core_web_lg
#!pip install openai
#!pip install tiktoken
#!pip install pandas
#!pip install tqdm

## Function Calling with Azure OpenAI to extract organization names from documents

_See more function calling examples from https://github.com/Azure-Samples/openai/blob/main/Basic_Samples/Functions/working_with_functions.ipynb_

In [1]:
import os
import openai
import spacy
import tiktoken
import json
import re
import pandas as pd

from spacy.tokens import Span
from tqdm.notebook import tqdm

#Caching function responses lets us iterate without re-calling expensive operations
import functools

#Count tokens to see how expensive a call is
enc = tiktoken.encoding_for_model("gpt-3.5-turbo") 

#We load small and large models, to test concensus with OpenAI
nlpsm = spacy.load('en_core_web_sm') 
nlplg = spacy.load('en_core_web_lg')

#Don't include keys like this, use ENV vars!
#openai.api_key = os.getenv("AZURE_OPENAI_KEY")
#openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
with open('config.json') as fd:
    conf = json.loads(fd.read())
    openai.api_key = conf["api_key"]
    openai.api_base = conf["api_base"]
    openai.api_version = "2023-07-01-preview"
    openai.api_type = "azure"

In [2]:
def extract_organization_entities(organization):
    pass
    return []

@functools.cache
def get_gpt_completion(context):
    messages= [{"role": "user", "content": context}]
    functions= [
        {
            "name": "extract_organization_entities",
            "description": "Extracts all the Organization (ORG) named entities from the context.",
            "parameters": { 
                "type": "object",
                "properties": { #Follows JSON Schema conventions: https://json-schema.org/understanding-json-schema/
                    "organization": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        },
                        "description": "The organization entity names"
                    }
                },
                "required": ["organization"]
            }
        }
    ]  
    
    response = openai.ChatCompletion.create(
        engine="gpt-35-turbo-4k",
        temperature=0.0,
        messages=messages,
        functions=functions,
        function_call="auto", 
    )
    
    return response

In [11]:
def extract_organizations(context):
    response = get_gpt_completion(context)

    if response:
        #print('OPENAI RESPONSE:',response)
        response_message=response['choices'][0]['message']
    
        # Check if the model wants to call a function
        if response_message.get("function_call"):
        
            # Call the function. The JSON response may not always be valid so make sure to handle errors
            function_name = response_message["function_call"]["name"]
            if(function_name=="extract_organization_entities") :
                function_args = json.loads(response_message["function_call"]["arguments"])
                return function_args

In [4]:
xlsx = pd.ExcelFile('data/organization_gold_labels.xlsx')
df = xlsx.parse(xlsx.sheet_names[0])
df

Unnamed: 0,id,question,context,gold
0,1,who are the parties?,EX-99.E.3 4 dex99e3.htm NON-DISCLOSURE AGREEME...,"[""3M Company"",""Cogent""]"
1,2,who are the parties?,Non-Disclosure Agreement\nDate: 2019-03-14\nPa...,"[""Costa, Inc"", ""Costa Coffee Company"", Harry's..."


In [5]:
def count_tokens(context):
    enc.encode(context)
    return len(context)
count_tokens('hello world!')

12

In [12]:
def sync_offsets(context,organizations):
    offsets = []
    for org in organizations:
        matches = [{"PARTY":org,"start":m.start(),"end":m.end()} for m in re.finditer(org, context, re.IGNORECASE)]
        if not matches or len(matches)==0:
            print(f'No match found for "{org}" in context')
            offsets.append({"PARTY":org,"error":"not found"})
        else:
            offsets = offsets + matches
    return offsets


context_offset_tuples = [] #We will use tuples with nlp.pipe later https://spacy.io/api/language#pipe
for idx,row in tqdm(df.iterrows(),total=len(df)):
    context = row['context']
    token_count = count_tokens(context)
    function_args = extract_organizations(context)
    print('GPT response:',function_args)
    if function_args['organization']:
        gpt_orgs = function_args['organization']
        offsets = sync_offsets(context,gpt_orgs)
        print('Offsets:',offsets)
        context_offset_tuples.append((context,offsets))

  0%|          | 0/2 [00:00<?, ?it/s]

GPT response: {'organization': ['3M Company', 'Cogent, Inc.']}
Offsets: [{'PARTY': '3M Company', 'start': 168, 'end': 178}, {'PARTY': 'Cogent, Inc.', 'start': 226, 'end': 238}]
GPT response: {'organization': ['Costa, Inc', 'Costa Coffee Company', "Harry's Music Ltd."]}
Offsets: [{'PARTY': 'Costa, Inc', 'start': 51, 'end': 61}, {'PARTY': 'Costa Coffee Company', 'start': 102, 'end': 122}, {'PARTY': "Harry's Music Ltd.", 'start': 322, 'end': 340}]


In [15]:
def get_spacy_true_positives(doc, original_entities):
    ents = []
    for entity in original_entities:
        start_char = entity['start']
        end_char = entity['end']
        label = entity['PARTY']
        start_token = None
        end_token = None
        for token in doc:
            if token.idx == start_char:
                start_token = token.i
            if token.idx + len(token.text) == end_char:
                end_token = token.i + 1

        if start_token is not None and end_token is not None:
            ents.append(Span(doc, start_token, end_token, label=label))
    return ents

def get_spacy_false_positives(doc,true_positives):
    false_positives = []
    true_ent_ranges = [(ent.start, ent.end) for ent in true_positives]
    for ent in doc.ents:
        if ent.label_ == 'ORG':
            if (ent.start, ent.end) not in true_ent_ranges:
                false_positives.append(ent)

    return false_positives

#For every context, convert it to a doc and to spaCy NER on it.
#If an entity is identified as an ORG (spaCy) that is not a PARTY (from GPT), be sure to exclude it
for doc,orgs in tqdm(nlpsm.pipe(context_offset_tuples,as_tuples=True),total=len(context_offset_tuples)):
    spacy_true_positives= get_spacy_true_positives(doc, orgs)
    spacy_false_positives = get_spacy_false_positives(doc,spacy_true_positives)
    for ent in spacy_true_positives:
        print(f"TRUE: '{ent}', start:{ent.start}, end:{ent.end}")
    for ent in spacy_false_positives:
        print(f"FALSE: '{ent}', start:{ent.start}, end:{ent.end}")

  0%|          | 0/2 [00:00<?, ?it/s]

TRUE: '3M Company', start:31, end:34
TRUE: 'Cogent, Inc.', start:47, end:50
FALSE: 'the Interested Party', start:93, end:96
FALSE: 'Representatives', start:98, end:99
FALSE: 'the Proposed Transaction', start:118, end:121
TRUE: 'Costa, Inc', start:16, end:19
TRUE: 'Costa Coffee Company', start:31, end:34
TRUE: 'Harry's Music Ltd.', start:77, end:81
FALSE: 'Non-Disclosure Agreement
Date', start:0, end:6
