In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# Chat model name; forwarded as `model=` by query().
Model = "gpt-3.5-turbo-1106"
# Name of this run: determines the output file names below.
Name = 'mcm_b'
# Name of the input split: determines the source file name below.
split = 'mcm_a'

# NOTE(review): SKIP_FIRST is not referenced by any code visible in this
# notebook -- presumably "number of leading records to skip"; confirm or remove.
SKIP_FIRST = 0
# The driver loop pauses for WAIT seconds whenever its query counter reaches a
# multiple of this value.
Queries = 5
sourcefile = f'./{split}.jsonl'   # input records, one JSON object per line
save_jsonl = f'./{Name}.jsonl'    # per-record log written by the driver loop
save_file = f'./{Name}.csv'       # CSV rows for records that pass the filter

# Completion length cap; forwarded as `max_tokens=` by query().
Max_Tokens = 2000
openai_api_key = "your_key"

# Per-token prices as (input, output) in USD, used by query() to estimate the
# cost of each call, plus the pause length (seconds) between query batches.
# NOTE(review): the figures below are hard-coded; verify them against the
# provider's current price list for the exact model snapshot in use.
if '3.5' in Model:
    prices = (0.0015/1000, 0.002/1000)  # input, output tokens
    WAIT = 60
elif '4' in Model:
    prices = (0.03/1000, 0.06/1000)  # output tokens cost 30x the 3.5 output rate
    WAIT = 60
else:
    # Fail here, next to the configuration, instead of with a NameError on
    # `prices` / `WAIT` much later inside the query loop.
    raise ValueError(
        f"No pricing / rate-limit settings configured for model {Model!r}; "
        "add a branch for it above."
    )


def Messages(input):
    """Build the one-shot chat prompt for a single multiple-choice question.

    The returned conversation has four turns: the system instruction, one
    worked example (a user question followed by the assistant's formatted
    answer) and finally ``input`` as the new user question.

    Args:
        input: Text of the question to be answered. The parameter keeps its
            original name (which shadows the builtin) so existing callers are
            unaffected.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dicts on every call.
    """
    # NOTE(review): the format description in the system turn contains literal
    # backslash-n sequences ("\\n"), not real newlines -- confirm this is intended.
    system_turn = {
      "role": "system",
      "content": "You are an expert lawyer in the domain of U.S. civil procedure. You are given an introduction that describes legal concepts, one of these concepts, background of a legal case and a multiple choice question about that case. First, answer which answer is the correct one and then for each choice explain in one sentence why it is correct or wrong. The format of your answer should be: The correct answer is:\\n<your_answer>\\n\\nChoices:\\n\\n<choice_A>\\n <explanation_for_choice_A>\\n<choice_B>\\n <explanation_for_choice_B>\\n<choice_C>\\n <explanation_for_choice_C>\\n<choice_D>\\n <explanation_for_choice_D>\\n."
    }
    # Worked example: question side.
    example_question_turn = {
      "role": "user",
      "content": "Introduction:\nThe venue provisions for cases involving corporate defendants are stated in Section 1391(b) of U.S. law. This section clarifies where a case may be brought; either the district where all defendants reside or where a significant portion of the events related to the claim occurred. However, defining where a corporation resides can be complex. Sections 28 U.S.C. §1391(c)(2) and (d) deal with this issue. It defines the residence of a corporation as any judicial district where it is under the court's personal jurisdiction concerning a particular civil action. If a state has more than one judicial district, a corporation is deemed to reside in any district within the state where it has sufficient contacts to be subjected to personal jurisdiction. If there is no such district, the corporation is considered to reside in the district with which it has the most significant contacts. An example is provided, using a fictional corporation and a legal dispute. Multiple potential district venues are considered, shedding light on the application of the provisions.\n\nConcept:\nVenue in Civil Procedure\n\nBackground:\nMark, a resident of Florida, wants to file a personal injury lawsuit against GreenCo, a multinational corporation headquartered in Delaware, for injuries sustained due to a defective product. The product was purchased online and delivered to Mark's home in Florida. Mark believes the defective product was manufactured and designed at GreenCo's facility in Kansas.\n\nQuestion:\nIn which district would venue be proper if Mark brings a personal injury lawsuit against GreenCo in federal court?\n\nChoices:\nA) The District of Delaware, where GreenCo is headquartered\nB) The Middle District of Florida, where Mark resides\nC) The District of Kansas, where the defective product was manufactured and designed\nD) The Southern District of Florida, where the product was delivered to Mark's home"
    }
    # Worked example: answer side, in the layout the answer parser expects.
    example_answer_turn = {
      "role": "assistant",
      "content": "The correct answer is:\nB) The Middle District of Florida, where Mark resides\n\nChoices:\n\nA) The District of Delaware, where GreenCo is headquartered\n Incorrect, because while GreenCo's headquarters are in Delaware, there doesn't seem to be any significant actions related to the claim that occurred there.\n\nB) The Middle District of Florida, where Mark resides\n Correct, because the product was delivered to Mark's home in Florida, causing the injury. Therefore, a significant portion of the events related to the claim occurred there.\n\nC) The District of Kansas, where the defective product was manufactured and designed\n Incorrect, although the product was manufactured and designed in Kansas, the significant harm occurred in Florida, making this district less relevant for this case.\n\nD) The Southern District of Florida, where the product was delivered to Mark's home\n Incorrect, because Mark does not reside in the Southern District of Florida and no specific information was given that suggest the Southern District as the proper venue."
    }
    # The actual question for this call.
    question_turn = {
      "role": "user",
      "content": input
    }
    return [system_turn, example_question_turn, example_answer_turn, question_turn]


In [None]:
import os
import openai

# Hand the key from the configuration cell to the openai module (module-wide setting).
# NOTE(review): the key is a literal in the notebook; consider loading it from
# an environment variable so a real key is never saved with the file.
openai.api_key = openai_api_key

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=60))
def completion_with_backoff(**kwargs):
    """Call ``openai.ChatCompletion.create`` under the tenacity retry policy.

    All keyword arguments are forwarded unchanged. The decorator is configured
    with a randomised exponential wait (min=1, max=60) and stops after the
    third attempt.
    """
    chat_response = openai.ChatCompletion.create(**kwargs)
    return chat_response

def query(M):
    """Send one chat-completion request and estimate what it cost.

    Args:
        M: The list of chat messages, forwarded as ``messages``.

    Returns:
        A ``(text, cost, response)`` tuple: the content of the first choice,
        the cost computed from ``prices`` and the token counts reported in
        ``response.usage``, and the raw response object.
    """
    request_args = dict(
        model=Model,
        messages=M,
        temperature=1,
        max_tokens=Max_Tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    response = completion_with_backoff(**request_args)

    answer_text = response.choices[0].message.content
    usage = response.usage
    # prices is (price per input token, price per output token).
    input_cost = prices[0] * usage.prompt_tokens
    output_cost = prices[1] * usage.completion_tokens
    return answer_text, input_cost + output_cost, response

In [None]:
import re

def get_content(input, start_label, end_label=None):
    """Return the whitespace-normalised text found after ``start_label``.

    The extracted span begins right after the first occurrence of
    ``start_label`` and runs up to the first occurrence of ``end_label`` that
    follows it. Runs of whitespace (including newlines) are collapsed to single
    spaces and the result is stripped.

    Args:
        input: Text to search.
        start_label: Marker that precedes the wanted content.
        end_label: Optional marker that terminates the content. When it is
            omitted, empty, or does not occur after ``start_label``, the
            content runs to the end of ``input``.

    Returns:
        The extracted content as a single-spaced string.

    Raises:
        ValueError: If ``start_label`` does not occur in ``input``.
    """
    start = input.find(start_label)
    if start == -1:
        # Without this check the -1 sentinel would be used as an offset and a
        # meaningless slice would be returned silently.
        raise ValueError(f"start label {start_label!r} not found in input")
    content_start = start + len(start_label)

    end = input.find(end_label, content_start) if end_label else -1
    if end == -1:
        # A missing end label must not become slice index -1, which would drop
        # the final character of the content.
        end = len(input)

    # split()/join both trims the ends and collapses inner whitespace.
    return ' '.join(input[content_start:end].split())


def get_correct_answer(input):
    """Split the model's "correct answer" line into letter and option text.

    The text between 'The correct answer is:' and 'Choices:' is extracted with
    get_content(). Its first character is returned as the answer letter and
    everything from index 3 onwards as the answer text, i.e. the block is
    treated as having the form ``"B) some option text"``.

    Returns:
        ``(letter, text)``

    Raises:
        IndexError: If the extracted block is empty.
    """
    answer_line = get_content(input, 'The correct answer is:', 'Choices:')
    letter, text = answer_line[0], answer_line[3:]
    return letter, text

def answer_collection(input, options):
    """Parse a model reply into the predicted letter and per-option explanations.

    Args:
        input: Full text of the model's reply.
        options: The four option texts, in A-D order.

    Returns:
        ``(label, cots)`` where ``label`` is the letter taken from the
        "correct answer" line and ``cots`` lists, for A through D, the text
        that follows the option's own wording inside its section of the
        'Choices:' block.
    """
    label, _ = get_correct_answer(input)
    choices_block = get_content(input, 'Choices:')

    markers = ['A)', 'B)', 'C)', 'D)']
    # Each section ends where the next marker begins; the last has no end marker.
    section_ends = markers[1:] + [None]

    cots = []
    for position, (marker, section_end) in enumerate(zip(markers, section_ends)):
        section = get_content(choices_block, marker, section_end)
        # Drop the repeated option wording, keeping only the explanation.
        cots.append(get_content(section, options[position]))
    return label, cots

In [None]:
import csv
# save in a csv file
# idx,question,answer,label,analysis,complete analysis,explanation,cot
def write2csv(save_file, items):
    """Append one row to the CSV at ``save_file``.

    The file is opened in append mode; when the write position is 0 (the file
    is new or empty) the column header is written before the row.

    Args:
        save_file: Path of the CSV file.
        items: Sequence of cell values forming the row.
    """
    header = ['idx', 'question', 'answer', 'label', 'analysis', 'complete_analysis', 'explanation', 'cot']
    with open(save_file, mode='a', newline='') as handle:
        rows = [items] if handle.tell() else [header, items]
        csv.writer(handle).writerows(rows)

In [None]:
import pandas as pd
import time
import json


# ---------------------------------------------------------------------------
# Driver: query the model for every usable source record, parse the reply and
# keep the records whose predicted letter matches the gold label.
# ---------------------------------------------------------------------------

def _append_jsonl(path, record):
    """Append ``record`` to ``path`` as one JSON object on its own line."""
    with open(path, 'a') as f:
        json.dump(record, f)
        f.write('\n')


def _failure_record(idx, prompt, output):
    """Build the JSONL entry for a record that produced no usable answer."""
    return {
        "idx": idx,
        "success": False,
        "prompt": prompt,
        "output": output,
    }


# How many times query() is attempted per record before the record is given up.
_MAX_QUERY_ATTEMPTS = 3
# Minutes to sleep between failed attempts.
_RETRY_WAIT_MINUTES = 5
# (option letter, two-digit code embedded in the CSV row id) in A-D order.
_OPTION_CODES = (('A', '00'), ('B', '01'), ('C', '10'), ('D', '11'))

prefixes = ['A) ', 'B) ', 'C) ', 'D) ']

# `total` starts at 1 and is incremented after every answered query, so it
# always equals "answered queries + 1". The starting value is kept unchanged in
# case later cells read it; the progress report below subtracts the 1.
total = 1
total_time = 0            # seconds spent sleeping so far
failed_collection = []    # idx of records with no usable/parsable answer
filtered = []             # idx of records whose prediction != gold label
correct_filter = 0        # number of records whose prediction == gold label
pause_pending = False     # set when `total` reaches a multiple of Queries
example_shown = False     # the example dump is printed at the first pause only

with open(sourcefile, 'r') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL source
        j = json.loads(line)
        if not j['success']:
            continue

        # Rate-limit pause. Driven by a flag that is armed once per query, so a
        # run of records that never reach the query step cannot trigger the
        # same pause repeatedly.
        if pause_pending:
            pause_pending = False
            answered = total - 1
            if not example_shown:
                example_shown = True
                print("Analysis Example:")
                print(M)
                print('-----------------------------------------------------------------------------')
                print(ans)
                print('######')
                print(save_dict)
            print(f'Waiting {WAIT} seconds to avoid timeout...')
            print(f'Total queries: {answered}')
            print(f'Total successfully passed filtering: {correct_filter}, or acc: {correct_filter*100/answered:.2f}')
            time.sleep(WAIT)
            total_time += WAIT
            print(f'Total time: {total_time/60:.0f} minutes')
            print('Resuming experiment\n')

        idx = j['idx']
        intro = j['intro']
        concept = j['concept']
        background = j['background']
        question = j['question']
        options = j['choices']
        label = j['correct_option'][0]  # only the first character (the letter) is used

        if len(options) != 4:
            print('Options failed!')
            failed_collection.append(idx)
            # No prompt has been built and no query sent for this record, so
            # there is nothing to log for them (logging the previous record's
            # values here would mislabel them, or fail on the first record).
            save_dict = _failure_record(idx, None, None)
            _append_jsonl(save_jsonl, save_dict)
            continue

        choices = "\n".join(prefix + op for prefix, op in zip(prefixes, options))

        # construct message
        M = f'Introduction:\n{intro}\n\nConcept:\n{concept}\n\nBackground:\n{background}\n\nQuestion:\n{question}\n\nChoices:\n{choices}'
        messages = Messages(M)

        # query, waiting and retrying when the call fails
        for attempt in range(1, _MAX_QUERY_ATTEMPTS + 1):
            try:
                ans, cost, response = query(messages)
            except Exception as e:
                print(f'Query attempt {attempt}/{_MAX_QUERY_ATTEMPTS} failed: {e!r}')
                if attempt < _MAX_QUERY_ATTEMPTS:
                    print(f'Waiting {_RETRY_WAIT_MINUTES} minutes to avoid timeout...')
                    time.sleep(_RETRY_WAIT_MINUTES*60)
                    total_time += _RETRY_WAIT_MINUTES*60
                    print(f'Total time: {total_time/60:.0f} minutes')
                    print('Resuming experiment:\n')
            else:
                break
        else:
            # Every attempt failed: record the failure instead of carrying on
            # with the previous record's answer.
            print('Query failed!')
            failed_collection.append(idx)
            save_dict = _failure_record(idx, M, None)
            _append_jsonl(save_jsonl, save_dict)
            continue
        total += 1
        pause_pending = WAIT > 0 and total % Queries == 0

        # answer collection
        try:
            pred, cots = answer_collection(ans, options)
        except Exception:
            # Any parsing problem marks the record as failed; KeyboardInterrupt
            # and SystemExit are deliberately not caught.
            print('Collection Failed!')
            failed_collection.append(idx)
            save_dict = _failure_record(idx, M, ans)
            _append_jsonl(save_jsonl, save_dict)
            continue

        # now check for filter!
        if label == pred:
            correct_filter += 1
            # One CSV row per option. Columns:
            # idx,question,answer,label,analysis,complete_analysis,explanation,cot
            # The row id is '111' (mutation index) + two-digit option code + source idx.
            for (letter, code), option_text, cot in zip(_OPTION_CODES, options, cots):
                row_id = int('111' + code + str(idx))
                items = [row_id, background+' '+question, option_text, int(label == letter), 'Mutated', 'Mutated', intro, cot]
                write2csv(save_file, items)

            # save jsonl
            save_dict = {
                "idx": idx,
                "success": True,
                "filter_success": True,
                "label": label,
                "pred": pred,
                "cots": cots,
                "concept": concept,
                "intro": intro,
                "background": background,
                "question": question,
                "choices": options,
                "prompt": M,
                "output": ans,
            }
            _append_jsonl(save_jsonl, save_dict)
        else:
            print('Filter failed!')
            filtered.append(idx)
            # save jsonl
            # NOTE(review): "choices" is the joined prompt string here but the
            # raw option list in the success branch above -- kept as-is because
            # consumers of this file are not visible; confirm which is wanted.
            save_dict = {
                "idx": idx,
                "success": True,
                "filter_success": False,
                "label": label,
                "pred": pred,
                "concept": concept,
                "background": background,
                "question": question,
                "choices": choices,
                "intro": intro,
                "prompt": M,
                "output": ans,
            }
            _append_jsonl(save_jsonl, save_dict)