In [None]:
import json
import random
import string
import pandas as pd

import spacy
from spacy.language import Language
from spacy.util import filter_spans

from pathlib import Path
from ast import literal_eval
from openai import OpenAI, AsyncOpenAI
from dotenv import load_dotenv

from tqdm.autonotebook import tqdm

# Load variables from a local .env file (if any) into the process environment
# before the API clients are constructed further down.
load_dotenv()

In [2]:
seed = 12345  # Seed for Python's global `random` generator (applied below)
sample_n = 100  # NOTE(review): not referenced in the visible cells of this notebook
trial_n = 3 # Number of attempts per request in step 5 (a failed call or unparsable JSON consumes one attempt)
output_step = 10 # Number of times the whole step-5 run is repeated; each repetition is saved to its own CSV

# Seed once so the random.sample() calls used to pick masked lemmas are repeatable on a fresh kernel.
random.seed(seed)

## Settings

In [3]:
# Dataset selector: it is interpolated into every input/output path below.
# Keep exactly one assignment active.
# QA_TYPE = "UQA"
# QA_TYPE = "RQA"
QA_TYPE = "AQuA"

In [None]:
# Input CSV path
# NOTE(review): `path_csv` is only checked for existence here; the visible cells load
# their data from the intermediate files configured further down.
path_csv = Path(f"../data/input/{QA_TYPE}.csv")
path_csv.exists()

In [5]:
# Column names of the QA table. The commented-out alternatives are presumably the
# names used by the other datasets selectable through QA_TYPE — TODO confirm.
col_id = "question_id"  # key shared by the QA table and the word table
# col_evidence = "evidence_wo_url"
# col_evidence = "evidence"
col_evidence = "rationale"
col_question = "question"
# col_question = "question_sentence"
# col_choices = "choices"
col_choices = "options"
# col_answer = "answer"
col_answer = "correct"

# Columns stored in the CSV as Python literals; they are parsed with ast.literal_eval on load.
list_convert_cols = [col_choices]
# list_convert_cols = [col_choices, col_answer]

In [6]:
# In the visible cells `ollama_model` is only used to build file names;
# `openai_model` is the model queried in step 5 and is part of the output directory name.
ollama_model = "gemma2:9b"
openai_model = "gpt-4o-2024-08-06"

In [7]:
# Output paths
dir_output = Path(f"../data/output/{QA_TYPE}_case3_{openai_model}/")
# parents=True also creates missing ancestor directories (e.g. ../data/output on a fresh
# checkout); exist_ok=True makes re-running this cell a no-op.
dir_output.mkdir(parents=True, exist_ok=True)

# `{trial_no}` is kept as a literal placeholder (doubled braces) and is filled in with
# str.format() when each trial's result is saved.
output_path_qa = dir_output / f"{QA_TYPE}_{ollama_model.replace(':', '_')}_{{trial_no}}.csv"
output_path_qa_tmp = dir_output / f"{QA_TYPE}_{ollama_model.replace(':', '_')}_tmp.csv" # intermediate results
output_path_step4 = dir_output / f"{QA_TYPE}_step4.csv"

In [8]:
# Path to the intermediate CSV file
# Inputs are read from the output directory named after gpt-4o-mini-2024-07-18.
dir_input = Path(f"../data/output/{QA_TYPE}_case3_gpt-4o-mini-2024-07-18/")

input_path_qa_tmp = dir_input / f"{QA_TYPE}_{ollama_model.replace(':', '_')}_tmp.csv" # intermediate results

# Of the per-step paths below, only `input_path_step2_use` is read in the visible cells.
input_path_step1 = dir_input / f"{QA_TYPE}_step1.csv"
input_path_step2 = dir_input / f"{QA_TYPE}_step2.csv"
input_path_step2_use = dir_input / f"{QA_TYPE}_step2_use.csv"
input_path_step4 = dir_input / f"{QA_TYPE}_step4.csv"

In [None]:
# List of parts of speech to be masked
# NOTE(review): not referenced in the visible cells; presumably consumed by an earlier step.
POS_CONTENT_WORD = ['PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV', ]

In [None]:
# Masking rates in percent: 0, 20, 40, 60, 80.
list_mask_rate = list(range(0, 85, 20))
print(list_mask_rate)

### Prompt templates

In [106]:
# Prompt template for step 5, filled in with str.format().
# Placeholders: context, metadata, question, option_list.
# The doubled braces around the JSON schema are str.format() escapes for literal `{` / `}`.
STEP5_PROMPT = """The following is a text and metadata related to the code terms within the text. Answer the question concisely according to the instructions.

## Instructions
- Choose the answer from the options and respond with the corresponding number.
- Respond in JSON format as {{'basis': str, 'answer': int}}
- Use only the text as a reference for the basis

## Text
{context}

## Metadata
{metadata}

## Question
{question}

## Options
{option_list}"""

## LLM Settings

In [107]:
# Async client with default settings; used for the step-5 requests.
openai_async_client = AsyncOpenAI()
# Synchronous client pointed at a local OpenAI-compatible endpoint on port 11434.
# NOTE(review): not used by any visible cell.
ollama_client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama'
)

In [108]:
# Role names used in the `role` field of chat messages.
USER = 'user'
AI = 'assistant'
SYS = 'system'

In [109]:
def generate(client:OpenAI, model:str, messages:list[dict], stream:bool=True, output_json:bool=False):
    """Issue a chat-completion request and return the client's result unchanged.

    Args:
        client: Object exposing `chat.completions.create(**kwargs)`.
        model: Model name forwarded to the API.
        messages: Chat history as a list of `{'role': ..., 'content': ...}` dicts.
        stream: Forwarded as the `stream` argument.
        output_json: When true, add `response_format={'type': 'json_object'}` to the request.

    Returns:
        Whatever `client.chat.completions.create` returns.
    """
    request = dict(model=model, messages=messages, stream=stream)

    # The JSON response format is only sent when explicitly requested.
    if output_json:
        request['response_format'] = {'type': 'json_object'}

    return client.chat.completions.create(**request)

def write_stream(stream) -> str:
    """Echo a streamed completion to stdout while collecting its full text.

    Args:
        stream: Iterable of chunks; each chunk's text is read from
            `chunk.choices[0].delta.content`.

    Returns:
        The concatenation of every non-empty content fragment, in order.
    """
    collected = ""
    for chunk in stream:
        fragment = chunk.choices[0].delta.content
        # Chunks without content (None or empty string) are skipped.
        if not fragment:
            continue
        collected += fragment
        print(fragment, end="", flush=True)

    return collected

def append_message(role:str, content:str, messages:"list[dict] | None"=None):
    """Append one chat message to `messages` and return the list.

    Args:
        role: Message role (e.g. 'user', 'assistant', 'system').
        content: Message text.
        messages: Existing history. It is mutated in place and returned. When
            omitted, a new list is created for this call.

    Returns:
        The list containing the appended `{'role': role, 'content': content}` entry.
    """
    # A mutable default (`messages=[]`) is created once and shared by every call that
    # relies on it, so unrelated conversations would accumulate in one list.
    if messages is None:
        messages = []
    messages.append({'role': role, 'content': content})
    return messages

async def aget_response(client:AsyncOpenAI, model:str, messages:list[dict], output_json:bool=False) -> str:
    """Await a non-streaming chat completion and return the first choice's text.

    Args:
        client: Object exposing an awaitable `chat.completions.create(**kwargs)`.
        model: Model name forwarded to the API.
        messages: Chat history as a list of `{'role': ..., 'content': ...}` dicts.
        output_json: When true, add `response_format={'type': 'json_object'}` to the request.

    Returns:
        `choices[0].message.content` of the response.
    """
    request = dict(model=model, messages=messages)
    if output_json:
        request['response_format'] = {'type': 'json_object'}

    completion = await client.chat.completions.create(**request)
    return completion.choices[0].message.content

In [110]:
def convert_df_to_markdown(df:pd.DataFrame):
    """Render the columns of `df` as a pipe-separated Markdown-style table.

    The output is a header line, a `---` separator line and one line per row, joined
    with newlines. The DataFrame index is not included. An empty frame yields only
    the header and separator (followed by a newline).

    Args:
        df: Table to render. Cells and column labels are converted with `str()`.

    Returns:
        The table as a single string.
    """
    # str() keeps the join from raising TypeError on non-string cells (numbers, NaN, ...).
    header = ' | '.join(str(col) for col in df.columns)
    divider = ' | '.join(['---'] * len(df.columns))
    body = '\n'.join(
        ' | '.join(str(cell) for cell in record)
        for record in df.itertuples(index=False, name=None)
    )

    return header + '\n' + divider + '\n' + body

## Data loading

In [111]:
# List-valued columns are stored as Python literals in the CSV; parse them back with
# ast.literal_eval while reading. Missing values in the remaining columns become ''.
param = {   
    'converters': {col:literal_eval for col in list_convert_cols + ['s3_output_choices']}
}
df_qa = pd.read_csv(input_path_qa_tmp, **param).fillna('')

In [112]:
# Word table. Later cells read its `lemma`, `word`, `code`, `meaning`,
# `part_of_speech` and `category` columns, keyed by `col_id`.
df_word_new = pd.read_csv(input_path_step2_use)

# 1. regular masking

## step4
- Select words to be coded based on the masking rate

In [113]:
def select_mask_row(df_word:pd.DataFrame, mask_rate:int):
    """Flag the word rows that are masked (coded) at the given masking rate.

    For each question (grouped by the notebook-level `col_id` column) a random subset
    of its distinct lemmas is drawn with `random.sample`; every row carrying one of
    the drawn lemmas is flagged. The draw uses the global `random` state.

    Args:
        df_word: Word table with at least the `col_id` and `lemma` columns.
        mask_rate: Percentage (0-100) of distinct lemmas to mask per question.

    Returns:
        A copy of `df_word` with a boolean column `p_{mask_rate}_masked`.
    """
    assert (mask_rate >= 0) and (mask_rate <= 100), f"`mask_rate` must be set between 0 and 100. mask_rate: {mask_rate}"

    flag_col = f'p_{mask_rate}_masked'
    flagged = df_word.copy()
    flagged[flag_col] = False

    for question_id, group in flagged.groupby(col_id):
        lemmas = group['lemma'].unique().tolist()
        n_to_mask = round(len(lemmas) * (mask_rate/100))
        print(f'ID_{question_id} {mask_rate}% number of words to be masked:', n_to_mask)

        chosen = random.sample(lemmas, k=n_to_mask)
        # Index labels of all rows of this question whose lemma was drawn.
        hit_index = group.index[group['lemma'].isin(chosen)]

        flagged.loc[hit_index, flag_col] = True

    return flagged

In [None]:
# Add one `p_{rate}_masked` flag column per masking rate.
# NOTE: every call consumes the global random state, so re-running this cell without
# re-seeding produces a different selection.
for mask_rate in list_mask_rate:
    df_word_new = select_mask_row(df_word_new, mask_rate)

In [115]:
def step4(df_qa:pd.DataFrame, list_mask_rate:list[int], df_word_new:pd.DataFrame) -> pd.DataFrame:
    """Add masked text based on the masking rate to df_qa

    For every masking rate three columns are added:
      - `s4_prg_encode_context_{rate}`: left as an empty string (context is not encoded here),
      - `s4_prg_encode_Q_{rate}`: `s3_output_Q` with each masked word replaced by `<code>`,
      - `s4_prg_encode_choices_{rate}`: a copy of `s3_output_choices` (not encoded).

    Args:
        df_qa: QA table with `col_id`, `s3_output_Q` and `s3_output_choices` columns.
        list_mask_rate: Masking rates (percent) to process.
        df_word_new: Word table with `col_id`, `word`, `code` and one boolean
            `p_{rate}_masked` column per rate.

    Returns:
        A copy of `df_qa` with the added columns.
    """
    df_copy = df_qa.copy()
    for mask_rate in list_mask_rate:
        df_copy[f's4_prg_encode_context_{mask_rate}'] = ""
        df_copy[f's4_prg_encode_Q_{mask_rate}'] = ""
        # Use `s3_output_choices` for `Choices` only this time
        df_copy[f's4_prg_encode_choices_{mask_rate}'] = df_copy['s3_output_choices']

        # Rows masked at this rate; filtered once per rate instead of once per question.
        df_masked = df_word_new[df_word_new[f'p_{mask_rate}_masked']]

        for i, row in df_copy.iterrows():
            rows_q = df_masked[df_masked[col_id] == row[col_id]]
            # Build (code, word) pairs directly instead of DataFrame.apply(axis=1), whose
            # result type on an empty selection is not a reliable list of pairs.
            # Words containing more spaces are substituted first so that a multi-word
            # expression is replaced before any of its shorter parts.
            pairs = sorted(
                zip(rows_q['code'], rows_q['word']),
                key=lambda pair: pair[1].count(' '),
                reverse=True,
            )

            # for col in ['s3_output_context', 's3_output_Q', 's3_output_choices']:
            for col in ['s3_output_Q']:
                sub_text = row[col]
                for code, word in pairs:
                    sub_text = sub_text.replace(word, f"<{code}>")
                df_copy.loc[i, col.replace('s3_output_', 's4_prg_encode_')+f"_{mask_rate}"] = sub_text

    return df_copy

In [116]:
# Drop any `s4_*` columns already present in the loaded file, then regenerate them.
df_qa = step4(df_qa[[c for c in df_qa.columns.values if not c.startswith('s4_')]], list_mask_rate, df_word_new)

In [118]:
# Save the QA table with the step-4 columns; later sections reload it from this file.
df_qa.to_csv(output_path_qa_tmp, encoding='utf-8-sig', index=False)

In [119]:
# Save the word table including the `p_{rate}_masked` flags.
df_word_new.to_csv(output_path_step4, encoding='utf-8-sig', index=False)

In [120]:
# Reload the word table from the file that was just written.
df_word_new = pd.read_csv(output_path_step4)

In [None]:
# Calculate the missing rate of meanings for each masking rate
# Counts are distinct lemmas over the whole word table (not per question).
list_tmp = []
for mask_rate in list_mask_rate:
    if mask_rate == 0:
        continue

    tmp_total_lemma = df_word_new[df_word_new[f'p_{mask_rate}_masked']]['lemma'].nunique()
    tmp_empty_meaning_lemma = tmp_total_lemma - df_word_new[df_word_new[f'p_{mask_rate}_masked'] & df_word_new['meaning'].notna()]['lemma'].nunique()
    list_tmp.append({
        'MR': mask_rate,
        'number of words with missing meanings': tmp_empty_meaning_lemma,
        'word count': tmp_total_lemma,
        # NaN instead of ZeroDivisionError when nothing is masked at this rate.
        'missing rate of meanings': tmp_empty_meaning_lemma / tmp_total_lemma if tmp_total_lemma else float('nan'),
    })

pd.DataFrame(list_tmp)

## step5
- generate answers

In [122]:
import asyncio

In [123]:
async def exec_step5(client:AsyncOpenAI, model:str, i:int, output_col:str, query_col:str, context:str, question:str, choices: list, metadata: str, count:int):
    """Ask the model one multiple-choice question and return its parsed answer.

    The prompt is built from STEP5_PROMPT and sent as a single user message with JSON
    output requested. The request is attempted up to `count` times; any exception
    (request failure, invalid JSON, missing 'answer' key) is printed and uses up one attempt.

    Args:
        client: Async chat client passed to `aget_response`.
        model: Model name.
        i: Row label of the question; returned unchanged so the caller can place the result.
        output_col: Name of the answer column; returned unchanged.
        query_col: Name of the prompt column; returned unchanged.
        context: Text inserted under "## Text".
        question: Question text.
        choices: Answer options; numbered from 1 in the prompt.
        metadata: Metadata table inserted under "## Metadata".
        count: Maximum number of attempts.

    Returns:
        `(i, answer, prompt, output_col, query_col)` on success, or
        `(i, '-1', '', output_col, query_col)` once all attempts have failed.
    """
    # NOTE: the numbered options are formatted into the prompt as a Python list.
    numbered_choices = [f"{no}. {choice}" for no, choice in enumerate(choices, 1)]
    prompt = STEP5_PROMPT.format(
        context=context,
        metadata=metadata,
        question=question,
        option_list=numbered_choices,
    )
    messages = append_message(USER, prompt, [])

    for _ in range(count):
        try:
            raw = await aget_response(client, model, messages, output_json=True)
            return (i, json.loads(raw)['answer'], prompt, output_col, query_col)
        except Exception as e:
            print('error', e)

    return (i, '-1', '', output_col, query_col)

In [124]:
async def step5(aclient:AsyncOpenAI, model:str, df_qa:pd.DataFrame, df_word_new:pd.DataFrame) -> pd.DataFrame:
    """Query the model for every question at every masking rate and collect the answers.

    For each rate in the notebook-level `list_mask_rate`, one request per row of
    `df_qa` is scheduled and all requests of that rate are awaited together. Each
    request receives the row's `s4_prg_encode_*` columns for that rate plus a metadata
    table (part of speech, category, meaning, code) of the lemmas masked for that question.

    Args:
        aclient: Async chat client.
        model: Model name; also part of the answer column name.
        df_qa: QA table containing the step-4 columns.
        df_word_new: Word table with the `p_{rate}_masked` flags.

    Returns:
        A copy of `df_qa` with `answer_{model}_{rate}` and `query_{rate}` columns filled in.
    """
    meta_cols = ['part_of_speech', 'category', 'meaning', 'code']

    df_copy = df_qa.copy()
    print('model:', model)

    collected = []

    for mask_rate in list_mask_rate:
        output_col = f'answer_{model}_{mask_rate}'
        query_col = f'query_{mask_rate}'
        df_copy[output_col] = ""
        df_copy[query_col] = ""

        col_context = f's4_prg_encode_context_{mask_rate}'
        col_Q = f's4_prg_encode_Q_{mask_rate}'
        col_choices_tmp = f's4_prg_encode_choices_{mask_rate}'
        masked_flag = f'p_{mask_rate}_masked'

        pending = []
        for i, row in df_copy.iterrows():
            # Word rows of this question that are masked at the current rate.
            in_scope = (df_word_new[col_id]==row[col_id]) & (df_word_new[masked_flag])

            # One row per lemma; each cell holds the distinct values seen for that lemma.
            meta_table = df_word_new[in_scope].groupby('lemma').agg(
                dict.fromkeys(meta_cols, 'unique')
            ).copy()
            if not meta_table.empty:
                for col in meta_cols:
                    # Join only real strings; missing values are left out of the cell.
                    meta_table[col] = meta_table[col].apply(lambda vals: ', '.join(v for v in vals if type(v) is str))

            request = exec_step5(
                aclient, model, i, output_col, query_col,
                row[col_context], row[col_Q], row[col_choices_tmp],
                convert_df_to_markdown(meta_table), trial_n,
            )
            pending.append(asyncio.ensure_future(request))

        collected.extend(await asyncio.gather(*pending))

    # Write answers and prompts back using the row label carried through each result.
    for i, ans, query, output_col, query_col in collected:
        df_copy.loc[i, output_col] = ans
        df_copy.loc[i, query_col] = query

    return df_copy.copy()

In [None]:
# Repeat the full step-5 run `output_step` times; each trial is saved to its own CSV,
# with the trial number substituted for the `{trial_no}` placeholder.
for output_num in range(1, output_step+1):
    print('trial:', output_num) 
    df_result = await step5(openai_async_client, openai_model, df_qa, df_word_new)
    save_path_tmp = str(output_path_qa).format(trial_no=output_num)
    print(save_path_tmp)
    df_result.to_csv(save_path_tmp, encoding='utf-8-sig', index=False)

# 2. partial lifting

## step4

In [134]:
# Reload the QA table saved after step 4, parsing the list-valued columns again.
param = {   
    'converters': {col:literal_eval for col in list_convert_cols + ['s3_output_choices']}
}
df_qa = pd.read_csv(output_path_qa_tmp, **param).fillna('')

In [135]:
# Keep only the word rows that have a meaning; words without one are therefore neither
# encoded by step4 nor listed in the metadata in this section.
df_word_new_all_meaning = df_word_new[df_word_new['meaning'].notna()]

In [136]:
# Rebuild the `s4_prg_encode_*` columns from the filtered word table.
df_qa_all_meaning = step4(df_qa, list_mask_rate, df_word_new_all_meaning)

In [None]:
# Missing rate of meanings per masking rate for the filtered word table.
# Every remaining row has a meaning, so the number of missing meanings is 0 by construction.
list_tmp = []
for mask_rate in list_mask_rate:
    if mask_rate == 0:
        continue

    tmp_total_lemma = df_word_new_all_meaning[df_word_new_all_meaning[f'p_{mask_rate}_masked']]['lemma'].nunique()
    tmp_empty_meaning_lemma = tmp_total_lemma - df_word_new_all_meaning[df_word_new_all_meaning[f'p_{mask_rate}_masked'] & df_word_new_all_meaning['meaning'].notna()]['lemma'].nunique()
    list_tmp.append({
        'MR': mask_rate,
        'number of words with missing meanings': tmp_empty_meaning_lemma,
        'word count': tmp_total_lemma,
        # NaN instead of ZeroDivisionError when no masked lemma is left at this rate.
        'missing rate of meanings': tmp_empty_meaning_lemma / tmp_total_lemma if tmp_total_lemma else float('nan'),
    })

pd.DataFrame(list_tmp)

## step5

In [None]:
# Same repeated run as in section 1, using the filtered tables; files get the `_filtered` suffix.
for output_num in range(1, output_step+1):
    print('trial:', output_num) 
    df_result = await step5(openai_async_client, openai_model, df_qa_all_meaning, df_word_new_all_meaning)
    save_path_tmp = str(output_path_qa).format(trial_no=f"{output_num}_filtered")
    print(save_path_tmp)
    df_result.to_csv(save_path_tmp, encoding='utf-8-sig', index=False)

# 3. strict masking

### step4

In [142]:
# Reload the QA table saved after step 4, parsing the list-valued columns again.
param = {   
    'converters': {col:literal_eval for col in list_convert_cols + ['s3_output_choices']}
}
df_qa_no_meaning = pd.read_csv(output_path_qa_tmp, **param).fillna('')

In [143]:
# Reload the word table and blank out every meaning, so the metadata passed to the
# model in this section contains no meanings.
df_word_new_no_meaning = pd.read_csv(output_path_step4)
df_word_new_no_meaning['meaning'] = None

In [144]:
# Rebuild the `s4_prg_encode_*` columns; the mask flags are the ones stored in the step-4 file.
df_qa_no_meaning = step4(df_qa_no_meaning, list_mask_rate, df_word_new_no_meaning)

In [None]:
# Missing rate of meanings per masking rate with all meanings removed.
# Every masked lemma counts as missing here because `meaning` was set to None above.
list_tmp = []
for mask_rate in list_mask_rate:
    if mask_rate == 0:
        continue

    tmp_total_lemma = df_word_new_no_meaning[df_word_new_no_meaning[f'p_{mask_rate}_masked']]['lemma'].nunique()
    tmp_empty_meaning_lemma = tmp_total_lemma - df_word_new_no_meaning[df_word_new_no_meaning[f'p_{mask_rate}_masked'] & df_word_new_no_meaning['meaning'].notna()]['lemma'].nunique()
    list_tmp.append({
        'MR': mask_rate,
        'number of words with missing meanings': tmp_empty_meaning_lemma,
        'word count': tmp_total_lemma,
        # NaN instead of ZeroDivisionError when nothing is masked at this rate.
        'missing rate of meanings': tmp_empty_meaning_lemma / tmp_total_lemma if tmp_total_lemma else float('nan'),
    })

pd.DataFrame(list_tmp)

### step5

In [None]:
# Same repeated run as in section 1, without meanings; files get the `_no_meaning` suffix.
for output_num in range(1, output_step+1):
    print('trial:', output_num) 
    df_result = await step5(openai_async_client, openai_model, df_qa_no_meaning, df_word_new_no_meaning)
    save_path_tmp = str(output_path_qa).format(trial_no=f"{output_num}_no_meaning")
    print(save_path_tmp)
    df_result.to_csv(save_path_tmp, encoding='utf-8-sig', index=False)

# 4. lenient masking

### step4

In [13]:
# Reload the QA table and the word table written after step 4.
param = {   
    'converters': {col:literal_eval for col in list_convert_cols + ['s3_output_choices']}
}
df_qa_no_verb = pd.read_csv(output_path_qa_tmp, **param).fillna("")
df_word_new_no_verb = pd.read_csv(output_path_step4)

In [153]:
# Distinct part-of-speech tags per (question, lemma).
unique_lemma = df_word_new_no_verb.groupby([col_id, 'lemma'])['part_of_speech'].apply('unique').reset_index()
# A lemma counts as a verb when any of its tags equals 'verb' (case-insensitive).
# isinstance() skips missing tags: NaN is truthy, so a plain truthiness test would let it
# through and `.lower()` would raise AttributeError.
unique_lemma['contains_verb'] = unique_lemma['part_of_speech'].apply(lambda x: 'verb' in [pos.lower() for pos in x if isinstance(pos, str)])

In [None]:
# Attach the per-lemma verb flag to every word row, then clear the mask flags of all
# rows whose lemma is used as a verb, so that verbs are never encoded in this section.
# NOTE: this cell is not idempotent — re-running it merges the helper columns a second time.
df_word_new_no_verb = df_word_new_no_verb.merge(unique_lemma.rename(columns={'part_of_speech': 'POS_unique'}), how='left', on=[col_id, 'lemma'])
# Rows without a match in `unique_lemma` (e.g. a missing lemma) get NaN from the left
# merge; comparing with True turns those into False instead of failing the boolean indexing.
df_word_new_no_verb.loc[df_word_new_no_verb['contains_verb'] == True, [f'p_{mr}_masked' for mr in list_mask_rate]] = False

In [156]:
# Rebuild the `s4_prg_encode_*` columns with the verb rows unmasked.
df_qa_no_verb = step4(df_qa_no_verb, list_mask_rate, df_word_new_no_verb)

In [None]:
# Missing rate of meanings per masking rate after unmasking verbs.
# Counts are distinct lemmas over the whole word table (not per question).
list_tmp = []
for mask_rate in list_mask_rate:
    if mask_rate == 0:
        continue

    tmp_total_lemma = df_word_new_no_verb[df_word_new_no_verb[f'p_{mask_rate}_masked']]['lemma'].nunique()
    tmp_empty_meaning_lemma = tmp_total_lemma - df_word_new_no_verb[df_word_new_no_verb[f'p_{mask_rate}_masked'] & df_word_new_no_verb['meaning'].notna()]['lemma'].nunique()
    list_tmp.append({
        'MR': mask_rate,
        'number of words with missing meanings': tmp_empty_meaning_lemma,
        'word count': tmp_total_lemma,
        # NaN instead of ZeroDivisionError when every masked lemma at this rate was a verb.
        'missing rate of meanings': tmp_empty_meaning_lemma / tmp_total_lemma if tmp_total_lemma else float('nan'),
    })

pd.DataFrame(list_tmp)

### step5

In [None]:
# Same repeated run as in section 1, with verbs left unmasked; files get the `_no_verb` suffix.
for output_num in range(1, output_step+1):
    print('trial:', output_num) 
    df_result = await step5(openai_async_client, openai_model, df_qa_no_verb, df_word_new_no_verb)
    save_path_tmp = str(output_path_qa).format(trial_no=f"{output_num}_no_verb")
    print(save_path_tmp)
    df_result.to_csv(save_path_tmp, encoding='utf-8-sig', index=False)