In [None]:
import requests
import os
import openai
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from tenacity import retry, stop_after_attempt, wait_fixed

# Configure the module-level openai client for the Azure API flavour.
# NOTE(review): no api_key / api_base is set in this cell — presumably they are
# supplied through environment variables; confirm before running.
openai.api_type = "azure"
openai.api_version = "2023-07-01-preview"
# Passed as the `engine=` argument of the chat-completion request made in
# verify_entity_exists.
engine = "GPT35"

from tenacity import (
    retry,
    stop_after_attempt,
    wait_chain,
    wait_fixed
) 

@retry(
    wait=wait_chain(
        *(wait_fixed(pause_seconds) for pause_seconds in (3, 3, 3, 5, 5, 10))
    )
)
def completion_with_backoff(**kwargs):
    """Call ``openai.ChatCompletion.create`` and retry whenever it raises.

    The pauses between attempts follow a fixed schedule: 3 s three times,
    5 s twice, then 10 s.

    NOTE(review): no ``stop`` condition is passed to ``retry``, so with
    tenacity's defaults a permanently failing request would presumably be
    retried indefinitely — confirm this is intended.

    Args:
        **kwargs: Forwarded unchanged to ``openai.ChatCompletion.create``.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns.
    """
    request_args = kwargs
    return openai.ChatCompletion.create(**request_args)


def verify_entity_exists(response, list_in_txt):
    """Ask the chat model whether an entity appears in a list.

    Args:
        response: The entity to look up. Despite its name this is the text
            being verified, not an API response.
        list_in_txt: The list to search. It is interpolated into the prompt
            with ``str.format``, so any object with a useful ``str()`` works.

    Returns:
        A ``(reference, answer)`` tuple parsed from the model's reply, where
        ``reference`` is the text after ``Reference in the list: `` and
        ``answer`` is the text after ``Answer: ``. Returns ``(None, None)``
        when the reply does not follow that two-line format (the prompt is
        printed in that case to help debugging).
    """
    system_prompt = "You are an AI assistant that helps the user verify whether an entity is captured in a list."
    prompt = """I will give you one entity and one list, and I want you to respond with \"YES\" if the entity is within the list, \"NO\" if it is not in the list.  
    
    List: 
    {}
    Entity: {}
    Give your response in the following format: 
    `Reference in the list: {{item in the list if exists, None otherwise}}
    Answer: {{YES or NO}}` and say nothing else.
    """
    user_message = prompt.format(list_in_txt, response)

    # Keep the API result under its own name: rebinding `response` would lose
    # the entity, and the failure branch below would then print the API object
    # instead of the prompt that was actually sent.
    api_response = completion_with_backoff(
        engine=engine,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        temperature=0,
        max_tokens=25,  # replies longer than this are cut off and may fail to parse
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    def _field_value(line):
        # maxsplit=1 keeps references that themselves contain ": " intact.
        # The requested format is wrapped in backticks, so drop any the model
        # echoes back together with surrounding whitespace.
        return line.split(": ", 1)[1].strip().strip("`").strip()

    try:
        result = api_response["choices"][0]["message"]["content"]
        # Ignore blank lines so a stray empty line does not shift the fields.
        lines = [line for line in result.split("\n") if line.strip()]
        reference = _field_value(lines[0])
        answer = _field_value(lines[1])
        return reference, answer
    except (KeyError, IndexError, TypeError, AttributeError):
        # Missing keys, a missing/None content, or a reply that is not in the
        # "label: value" two-line shape.
        print(user_message)
        print("--------------------------------")
        return None, None


In [None]:
import pickle

def save_output(records, correctness, output_file):
    """Pickle one ``(prompt, completion, [flag0, flag1])`` tuple per record.

    Args:
        records: Mapping with index-aligned ``'prompt'`` and ``'completion'``
            sequences.
        correctness: Indexable of rows; the first two entries of row ``i`` are
            stored next to record ``i``.
        output_file: Path of the pickle file to write (overwritten if present).
    """
    row_count = len(records['prompt'])
    rows = [
        (
            records['prompt'][idx],
            records['completion'][idx],
            [correctness[idx][0], correctness[idx][1]],
        )
        for idx in range(row_count)
    ]

    # Persist the assembled rows as a single pickled list.
    with open(output_file, 'wb') as handle:
        pickle.dump(rows, handle)


In [None]:
import requests
import requests
from datetime import datetime

def query_wikidata(book_name, timeout=30):
    """Look up author and publication year of a book on Wikidata.

    Runs a SPARQL query against the public Wikidata endpoint for items whose
    English label equals ``book_name`` (with double quotes removed) and that
    have an author (P50) and a publication date (P577).

    Args:
        book_name: Title to search for.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response as a dict, or ``None`` when the request
        fails, returns a non-200 status, or the body is not valid JSON.
    """
    # Double quotes are dropped (as before); backslashes and line breaks are
    # escaped so they cannot terminate or corrupt the SPARQL string literal.
    book_name = (
        book_name.replace('"', '')
        .replace('\\', '\\\\')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    url = 'https://query.wikidata.org/sparql'    
    query = '''
       SELECT ?bookLabel ?authorLabel (YEAR(?publicationDate) as ?publishing_year) WHERE {{
       ?book rdfs:label "{0}"@en .
       ?book wdt:P50 ?author .
       ?book wdt:P577 ?publicationDate .
  
       SERVICE wikibase:label {{
          bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
       }}
    }}
    '''.format(book_name)

    # requests URL-encodes the query when it is passed through `params`.
    params = {
        'format': 'json',
        'query': query
    }

    # NOTE(review): Wikimedia asks API clients to send a descriptive
    # User-Agent header; consider adding one if requests get throttled.
    try:
        # A timeout keeps one stalled connection from blocking the whole run.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: log, return None.
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # The body was not valid JSON (e.g. an HTML error page).
        print(f"Error: {exc}")
        return None
    
def organize_data(data):
    """Flatten a Wikidata SPARQL JSON response into a list of plain dicts.

    Args:
        data: Decoded response as returned by ``query_wikidata``, or ``None``.

    Returns:
        A list of ``{"Author", "Publishing_Year", "Book"}`` dicts, one per
        well-formed entry of ``data['results']['bindings']``. Returns an empty
        list when ``data`` is ``None`` or lacks that structure. Malformed
        entries are skipped (and reported once) instead of discarding every
        entry that follows them.
    """
    organized_data = []
    if data is None:
        return organized_data

    try:
        bindings = list(data['results']['bindings'])
    except (KeyError, TypeError):
        print("wiki extraction failed")
        return organized_data

    skipped_any = False
    for item in bindings:
        try:
            row = {
                "Author": item['authorLabel']['value'],
                "Publishing_Year": item['publishing_year']['value'],
                "Book": item['bookLabel']['value'],
            }
        except (KeyError, TypeError, IndexError):
            # A binding without all three fields cannot be used; keep going so
            # the remaining bindings are still collected.
            skipped_any = True
            continue
        organized_data.append(row)

    if skipped_any:
        print("wiki extraction failed")

    return organized_data

def get_book_info(book_name):
    """Collect the distinct authors and publishing years Wikidata has for a book.

    Args:
        book_name: Title to look up.

    Returns:
        A ``(matching_authors, matching_publishing_years)`` tuple of lists,
        each de-duplicated and in first-seen order. Only entries whose label
        equals the title (case-insensitive, surrounding whitespace ignored)
        are used; both lists are empty when nothing matches.
    """
    book_data = query_wikidata(book_name)
    organized_data = organize_data(book_data)

    # query_wikidata removes double quotes before querying, so compare against
    # the same normalised title; otherwise a quoted title can never match the
    # labels that come back. Computed once, outside the loop.
    wanted_title = book_name.replace('"', '').lower().strip()

    matching_authors = []
    matching_publishing_years = []
    for entry in organized_data:
        if entry['Book'].lower().strip() != wanted_title:
            continue

        if entry['Author'] not in matching_authors:
            matching_authors.append(entry['Author'])

        if entry['Publishing_Year'] not in matching_publishing_years:
            matching_publishing_years.append(entry['Publishing_Year'])

    return matching_authors, matching_publishing_years

    

In [None]:
import time
def remove_delimiter(text):
    """Drop every double quote from ``text`` and at most one trailing '.' or ','.

    Only the final character is considered, so ``"Dune.,"`` becomes ``"Dune."``.
    """
    unquoted = text.replace('"', '')
    # Both delimiters are one character long, so a single slice covers either.
    if unquoted.endswith(('.', ',')):
        return unquoted[:-1]
    return unquoted

def verify_books(records, debug_file, sleep_seconds=5):
    """Check each completion's book against its constraints using Wikidata.

    For every record, the first line of the completion is taken as the book
    title, Wikidata is queried for its authors and publishing years, and each
    constraint is verified against that data with ``verify_entity_exists``.

    Args:
        records: Mapping with index-aligned ``"prompt"``, ``"name"`` (an
            iterable of constraint strings per record) and ``"completion"``.
        debug_file: Path of a text file that receives a trace of every lookup
            and verdict (overwritten if present).
        sleep_seconds: Pause before processing each record; the default keeps
            the previous 5-second spacing between lookups.

    Returns:
        A float array of shape ``(len(records["prompt"]), 2)``. Column 0 is 1
        when the "written by" constraint was confirmed, column 1 is 1 when the
        other (publishing year) constraint was confirmed; entries stay 0 when
        the constraint failed or could not be checked.
    """
    # Imported here because the notebook's module-level numpy import lives in
    # a later cell than this definition.
    import numpy as np

    num_records = len(records["prompt"])
    correctness = np.zeros((num_records, 2))
    with open(debug_file, "w", encoding="utf-8") as fd:
        for i in range(num_records):
            if sleep_seconds > 0:
                time.sleep(sleep_seconds)

            constraints = records["name"][i]
            completion = records["completion"][i]
            book_name = remove_delimiter(completion.split("\n")[0].strip())
            fd.write(f"Book name from the completion : {book_name}"+ "\n")
            matching_authors, matching_publishing_years = get_book_info(book_name)

            fd.write("wiki data:" + "\n")
            fd.write(f"author : {matching_authors}"+ "\n")
            fd.write(f"publishing year : {matching_publishing_years}"+ "\n")
            fd.write("..................................\n")

            for constraint in constraints:
                if "written by" in constraint:
                    # Author constraint -> column 0. When Wikidata returned no
                    # author it is left unchecked; it must not fall through to
                    # the publishing-year check, which would clobber column 1.
                    if len(matching_authors) > 0:
                        state_reference, state_answer = verify_entity_exists(constraint, matching_authors)

                        if state_answer is not None:
                            correctness[i][0] = (1 if state_answer.strip().lower() == "yes" else 0)
                        fd.write(f"author constraint : {constraint} \n")
                        fd.write(f"Turbo author reference : {state_reference} \n")
                        fd.write(f"Turbo author answer : {state_answer} \n")
                        fd.write(".................................\n")
                elif len(matching_publishing_years) > 0:
                    # Any other constraint is treated as the publishing-year
                    # constraint -> column 1.
                    state_reference, state_answer = verify_entity_exists(constraint, matching_publishing_years)
                    if state_answer is not None:
                        correctness[i][1] = (1 if state_answer.strip().lower() == "yes" else 0)
                    fd.write(f"publishing year constraint: {constraint}"+ "\n")
                    fd.write(f"Turbo publishing year reference : {state_reference}"+ "\n")
                    fd.write(f"Turbo publishing year answer : {state_answer} \n")
                    fd.write("..............................\n")
            fd.write("===============================================\n")
    return correctness

In [None]:
import pickle
import os
import numpy as np
from easydict import EasyDict as edict

# Datasets to process: key used in file names -> display name.
data_pretty = {
    "books": "Books",   
}
result_records = []
for model_size in ["7b", "13b", "70b"]:
    for data_name in data_pretty:
        filename = f"./outputs/Llama-2-{model_size}-hf_{data_name}_localized_track.pkl"
        # NOTE(review): the output path is identical to the input path, so the
        # input pickle is replaced by the list written by save_output and a
        # second run would no longer find the original record dict — confirm
        # this is intended.
        output_file = f"./outputs/Llama-2-{model_size}-hf_{data_name}_localized_track.pkl"
        # Write the trace next to the pickles (the path previously started
        # with ".outputs/", a different and normally non-existent directory).
        debug_file = f"./outputs/Llama-2-{model_size}-hf_{data_name}_localized_track.debug.txt"

        if not os.path.exists(filename):
            # Report the missing input and move on to the next model size.
            print(filename)
            continue

        # pickle.load can execute arbitrary code; only load files you produced
        # yourself. The context manager closes the handle after loading.
        with open(filename, "rb") as pkl_file:
            records_to_save = edict(pickle.load(pkl_file))
        records = records_to_save
        correctness = verify_books(records, debug_file)
        save_output(records, correctness, output_file)


In [None]:
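## Final output format, as written by save_output:
## records = [(prompt, completion, [author_correct, publishing_year_correct])], e.g. [(prompt, completion, [0, 0])]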