In [None]:
import os, json
import numpy as np
from argparse import ArgumentParser
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from pprint import pprint
import re, string
from google.colab import files

In [None]:
# Mount Google Drive at /content/drive (needs the google.colab package,
# so this cell only works inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import statistics

def print_check_score(p_type, e_type, s_t, mdl_name, *,
                      root='New-Avg-Run-Experiments',
                      folder='Argument-Extraction-Results',
                      numbers=('0', '1', '2'),
                      min_gt_count=5):
    """Aggregate per-run argument-extraction F1 scores and print mean/std.

    For every run id in ``numbers`` the file
    ``<root>/<folder>/<number>-new-results-<mdl_name>.json`` is loaded with
    ``read_json_file`` and the entry ``"<p_type>_<e_type>_<s_t>"`` is read.
    Within that entry, ``data[event][arg_type][arg][0][2]`` is used as the
    stored F1 score and ``data[event][arg_type][arg][1][2]`` as the stored
    ground-truth count.

    Per run, the F1 scores of all arguments whose ground-truth count is
    strictly greater than ``min_gt_count`` are averaged per
    (event, argument type); those averages are averaged per event, and the
    event averages are averaged into one overall score for the run.

    Args:
        p_type: Prompt type; first component of the result key.
        e_type: Experiment type; second component of the result key.
        s_t: Similarity threshold; third component of the result key.
        mdl_name: Model name used in the result file name.
        root: Top-level results directory (keyword-only).
        folder: Sub-directory of ``root`` holding the result files (keyword-only).
        numbers: Run identifiers used as file-name prefixes (keyword-only).
        min_gt_count: Arguments need a ground-truth count strictly above this
            value to be included (keyword-only).

    Returns:
        dict mapping ``"<event>_<arg_type>"`` to the list of per-run averages
        and ``"<p_type>_<e_type>_<s_t>"`` to the list of per-run overall
        averages. A key is absent when no run contributed a value to it.

    Raises:
        KeyError: if a result file lacks the expected entry, event or
            argument type.
    """
    event_types = ['taking-moud', 'relapse', 'tapering']
    arg_types = ['main-arguments', 'event-specific-arguments', 'subject-effect-arguments']
    config_key = f'{p_type}_{e_type}_{s_t}'

    def average(values):
        return sum(values) / len(values)

    def mean_std_text(values):
        # statistics.mean needs >= 1 value and statistics.stdev needs >= 2;
        # report "n/a" instead of raising when a key collected too few runs.
        mean_txt = "{:.4f}".format(statistics.mean(values) * 100) if values else "n/a"
        std_txt = "{:.4f}".format(statistics.stdev(values) * 100) if len(values) > 1 else "n/a"
        return mean_txt, std_txt

    all_results = {}
    for number in numbers:  # one result file per run
        file_name = f'{number}-new-results-{mdl_name}.json'
        results = read_json_file(os.path.join(root, folder, file_name))
        data = results[config_key]

        event_means = []
        for event in event_types:
            group_means = []
            for arg_type in arg_types:
                kept_f1 = [
                    entry[0][2]                   # stored f1-score
                    for entry in data[event][arg_type].values()
                    if entry[1][2] > min_gt_count  # stored ground-truth count
                ]
                if kept_f1:
                    group_mean = average(kept_f1)
                    all_results.setdefault(f'{event}_{arg_type}', []).append(group_mean)
                    group_means.append(group_mean)
            if group_means:
                event_means.append(average(group_means))

        # A run without any qualifying argument contributes no overall score
        # (dividing by len(event_means) == 0 would raise ZeroDivisionError).
        if event_means:
            all_results.setdefault(config_key, []).append(average(event_means))

    print("********* Key Reuslts************\n")
    for event in event_types:
        for arg_type in arg_types:
            mean_txt, std_txt = mean_std_text(all_results.get(f'{event}_{arg_type}', []))
            print(event, arg_type, "Mean: ", mean_txt, "Std: ", std_txt)

    print("\nOverall:")
    mean_txt, std_txt = mean_std_text(all_results.get(config_key, []))
    print(p_type, e_type, s_t, "Mean: ", mean_txt, "Std: ", std_txt)
    print("*************************************\n")
    return all_results

def read_json_file(name):
    """Load and return the parsed content of the JSON file at path ``name``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(name, 'r', encoding='utf-8') as f:
        return json.load(f)



# ZS LLM Results

## Gemma-1.1-7b-it

In [None]:
# Score gemma-1.1-7b-it for every prompt type x similarity threshold x experiment type.
mdl_name = 'gemma-1.1-7b-it'
prompt_types = ['batch']  # full set would be ['batch', 'isolated']
exp_types = ['description_guided', 'question_guided']
similarity_threshold = [0.749, 0.99]

results_for_print = {}
for p_type, s_t, e_type in (
    (prompt, threshold, experiment)
    for prompt in prompt_types
    for threshold in similarity_threshold
    for experiment in exp_types
):
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(p_type, e_type, s_t, mdl_name)
    key = f'{p_type}_{e_type}_{s_t}'
    value = print_check_score(p_type, e_type, s_t, mdl_name)
    results_for_print[key] = value



*************************
batch description_guided 0.749 gemma-1.1-7b-it
********* Key Reuslts************

taking-moud main-arguments Mean:  26.8418 Std:  1.0102
taking-moud event-specific-arguments Mean:  39.1136 Std:  2.6050
taking-moud subject-effect-arguments Mean:  31.8332 Std:  0.4294
relapse main-arguments Mean:  20.4838 Std:  0.3202
relapse event-specific-arguments Mean:  31.7544 Std:  0.6991
relapse subject-effect-arguments Mean:  28.6655 Std:  4.5983
tapering main-arguments Mean:  31.2400 Std:  2.5306
tapering event-specific-arguments Mean:  28.6430 Std:  0.8509
tapering subject-effect-arguments Mean:  30.2639 Std:  2.1919

Overall:
batch description_guided 0.749 Mean:  29.8710 Std:  0.3754
*************************************


*************************
batch question_guided 0.749 gemma-1.1-7b-it
********* Key Reuslts************

taking-moud main-arguments Mean:  25.5201 Std:  2.8904
taking-moud event-specific-arguments Mean:  46.4686 Std:  0.2199
taking-moud subject-eff

## Mixtral-8x7B-Instruct

In [None]:
# Score Mixtral-8x7B-Instruct for every prompt type x similarity threshold x experiment type.
mdl_name = 'Mixtral-8x7B-Instruct'
prompt_types = ['batch']  # full set would be ['batch', 'isolated']
exp_types = ['description_guided', 'question_guided']
similarity_threshold = [0.749, 0.99]

results_for_print = {}
for p_type, s_t, e_type in (
    (prompt, threshold, experiment)
    for prompt in prompt_types
    for threshold in similarity_threshold
    for experiment in exp_types
):
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(p_type, e_type, s_t, mdl_name)
    key = f'{p_type}_{e_type}_{s_t}'
    value = print_check_score(p_type, e_type, s_t, mdl_name)
    results_for_print[key] = value



*************************
batch description_guided 0.749 Mixtral-8x7B-Instruct
********* Key Reuslts************

taking-moud main-arguments Mean:  34.1959 Std:  0.3797
taking-moud event-specific-arguments Mean:  30.7073 Std:  0.5402
taking-moud subject-effect-arguments Mean:  33.9495 Std:  4.8293
relapse main-arguments Mean:  33.7717 Std:  0.4582
relapse event-specific-arguments Mean:  33.7356 Std:  0.9259
relapse subject-effect-arguments Mean:  31.8006 Std:  0.7947
tapering main-arguments Mean:  40.5556 Std:  0.1132
tapering event-specific-arguments Mean:  41.5118 Std:  1.6082
tapering subject-effect-arguments Mean:  39.4730 Std:  1.9771

Overall:
batch description_guided 0.749 Mean:  35.5223 Std:  0.7718
*************************************


*************************
batch question_guided 0.749 Mixtral-8x7B-Instruct
********* Key Reuslts************

taking-moud main-arguments Mean:  36.8958 Std:  0.4886
taking-moud event-specific-arguments Mean:  27.8890 Std:  0.4277
taking-moud

## Llama-3-8B-Instruct

In [None]:
# Score Llama-3-8B-Instruct for every prompt type x similarity threshold x experiment type.
mdl_name = 'Llama-3-8B-Instruct'
prompt_types = ['batch']  # full set would be ['batch', 'isolated']
exp_types = ['description_guided', 'question_guided']
similarity_threshold = [0.749, 0.99]

results_for_print = {}
for p_type, s_t, e_type in (
    (prompt, threshold, experiment)
    for prompt in prompt_types
    for threshold in similarity_threshold
    for experiment in exp_types
):
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(p_type, e_type, s_t, mdl_name)
    key = f'{p_type}_{e_type}_{s_t}'
    value = print_check_score(p_type, e_type, s_t, mdl_name)
    results_for_print[key] = value



*************************
batch description_guided 0.749 Llama-3-8B-Instruct
********* Key Reuslts************

taking-moud main-arguments Mean:  32.8867 Std:  0.4818
taking-moud event-specific-arguments Mean:  48.4557 Std:  0.4532
taking-moud subject-effect-arguments Mean:  32.4810 Std:  2.0809
relapse main-arguments Mean:  33.4360 Std:  0.1239
relapse event-specific-arguments Mean:  37.7479 Std:  0.0000
relapse subject-effect-arguments Mean:  27.4940 Std:  0.0000
tapering main-arguments Mean:  41.3364 Std:  0.7128
tapering event-specific-arguments Mean:  42.8850 Std:  0.0000
tapering subject-effect-arguments Mean:  35.5477 Std:  0.0000

Overall:
batch description_guided 0.749 Mean:  36.9189 Std:  0.2921
*************************************


*************************
batch question_guided 0.749 Llama-3-8B-Instruct
********* Key Reuslts************

taking-moud main-arguments Mean:  34.8855 Std:  0.4264
taking-moud event-specific-arguments Mean:  42.1992 Std:  0.7698
taking-moud sub

## Llama-3-70B-Instruct

In [None]:
# Score Llama-3-70B-Instruct for every prompt type x similarity threshold x experiment type.
mdl_name = 'Llama-3-70B-Instruct'
prompt_types = ['batch']  # full set would be ['batch', 'isolated']
exp_types = ['description_guided', 'question_guided']
similarity_threshold = [0.749, 0.99]

results_for_print = {}
for p_type, s_t, e_type in (
    (prompt, threshold, experiment)
    for prompt in prompt_types
    for threshold in similarity_threshold
    for experiment in exp_types
):
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(p_type, e_type, s_t, mdl_name)
    key = f'{p_type}_{e_type}_{s_t}'
    value = print_check_score(p_type, e_type, s_t, mdl_name)
    results_for_print[key] = value



*************************
batch description_guided 0.749 Llama-3-70B-Instruct
********* Key Reuslts************

taking-moud main-arguments Mean:  41.3565 Std:  3.4753
taking-moud event-specific-arguments Mean:  39.3901 Std:  1.0344
taking-moud subject-effect-arguments Mean:  25.2899 Std:  2.3213
relapse main-arguments Mean:  35.2064 Std:  2.8816
relapse event-specific-arguments Mean:  38.8045 Std:  4.2920
relapse subject-effect-arguments Mean:  30.7875 Std:  4.2407
tapering main-arguments Mean:  41.5460 Std:  3.6548
tapering event-specific-arguments Mean:  40.5723 Std:  0.4514
tapering subject-effect-arguments Mean:  28.8311 Std:  4.1047

Overall:
batch description_guided 0.749 Mean:  35.7538 Std:  0.8714
*************************************


*************************
batch question_guided 0.749 Llama-3-70B-Instruct
********* Key Reuslts************

taking-moud main-arguments Mean:  37.2819 Std:  0.6907
taking-moud event-specific-arguments Mean:  42.3178 Std:  0.7354
taking-moud s

## GPT-4o

In [None]:
# Score gpt-4o for every prompt type x similarity threshold x experiment type.
mdl_name = 'gpt-4o'
prompt_types = ['batch']  # full set would be ['batch', 'isolated']
exp_types = ['description_guided', 'question_guided']
similarity_threshold = [0.749, 0.99]

results_for_print = {}
for p_type, s_t, e_type in (
    (prompt, threshold, experiment)
    for prompt in prompt_types
    for threshold in similarity_threshold
    for experiment in exp_types
):
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(p_type, e_type, s_t, mdl_name)
    key = f'{p_type}_{e_type}_{s_t}'
    value = print_check_score(p_type, e_type, s_t, mdl_name)
    results_for_print[key] = value



*************************
batch description_guided 0.749 gpt-4o
********* Key Reuslts************

taking-moud main-arguments Mean:  37.8800 Std:  2.0054
taking-moud event-specific-arguments Mean:  46.3411 Std:  1.0525
taking-moud subject-effect-arguments Mean:  30.5091 Std:  1.1099
relapse main-arguments Mean:  43.5650 Std:  1.3968
relapse event-specific-arguments Mean:  41.9453 Std:  2.1800
relapse subject-effect-arguments Mean:  39.6814 Std:  2.3105
tapering main-arguments Mean:  42.9098 Std:  1.1140
tapering event-specific-arguments Mean:  38.4389 Std:  1.9056
tapering subject-effect-arguments Mean:  43.1596 Std:  2.6988

Overall:
batch description_guided 0.749 Mean:  40.4922 Std:  0.2273
*************************************


*************************
batch question_guided 0.749 gpt-4o
********* Key Reuslts************

taking-moud main-arguments Mean:  35.7777 Std:  1.0114
taking-moud event-specific-arguments Mean:  47.3871 Std:  0.6394
taking-moud subject-effect-arguments Mean

# Other Baseline Models

## Extractive-QA

In [None]:
import statistics

def print_check_score_2(s_t, mdl_name, *,
                        root='New-Avg-Run-Experiments',
                        folder='Argument-Extraction-Results',
                        numbers=('0', '1', '2'),
                        min_gt_count=5):
    """Aggregate per-run argument-extraction F1 scores of a baseline model.

    For every run id in ``numbers`` the file
    ``<root>/<folder>/<number>-new-results-<mdl_name>.json`` is loaded with
    ``read_json_file`` and the entry ``"<number>-<mdl_name>-<s_t>"`` is read.
    Within that entry, ``data[event][arg_type][arg][0][2]`` is used as the
    stored F1 score and ``data[event][arg_type][arg][1][2]`` as the stored
    ground-truth count.

    Per run, the F1 scores of all arguments whose ground-truth count is
    strictly greater than ``min_gt_count`` are averaged per
    (event, argument type); those averages are averaged per event, and the
    event averages are averaged into one overall score for the run.

    Args:
        s_t: Similarity threshold; part of the per-run result key.
        mdl_name: Model name used in the file name and the result key.
        root: Top-level results directory (keyword-only).
        folder: Sub-directory of ``root`` holding the result files (keyword-only).
        numbers: Run identifiers used as file-name and key prefixes (keyword-only).
        min_gt_count: Arguments need a ground-truth count strictly above this
            value to be included (keyword-only).

    Returns:
        dict mapping ``"<event>_<arg_type>"`` to the list of per-run averages
        and ``"<mdl_name>-<s_t>"`` to the list of per-run overall averages.
        A key is absent when no run contributed a value to it.

    Raises:
        KeyError: if a result file lacks the expected entry, event or
            argument type.
    """
    event_types = ['taking-moud', 'relapse', 'tapering']
    arg_types = ['main-arguments', 'event-specific-arguments', 'subject-effect-arguments']
    overall_key = f"{mdl_name}-{s_t}"

    def average(values):
        return sum(values) / len(values)

    def mean_std_text(values):
        # statistics.mean needs >= 1 value and statistics.stdev needs >= 2;
        # report "n/a" instead of raising when a key collected too few runs.
        mean_txt = "{:.4f}".format(statistics.mean(values) * 100) if values else "n/a"
        std_txt = "{:.4f}".format(statistics.stdev(values) * 100) if len(values) > 1 else "n/a"
        return mean_txt, std_txt

    all_results = {}
    for number in numbers:  # one result file per run
        file_name = f'{number}-new-results-{mdl_name}.json'
        results = read_json_file(os.path.join(root, folder, file_name))
        data = results[f'{number}-{mdl_name}-{s_t}']

        event_means = []
        for event in event_types:
            group_means = []
            for arg_type in arg_types:
                kept_f1 = [
                    entry[0][2]                   # stored f1-score
                    for entry in data[event][arg_type].values()
                    if entry[1][2] > min_gt_count  # stored ground-truth count
                ]
                if kept_f1:
                    group_mean = average(kept_f1)
                    all_results.setdefault(f'{event}_{arg_type}', []).append(group_mean)
                    group_means.append(group_mean)
            if group_means:
                event_means.append(average(group_means))

        # A run without any qualifying argument contributes no overall score
        # (dividing by len(event_means) == 0 would raise ZeroDivisionError).
        if event_means:
            all_results.setdefault(overall_key, []).append(average(event_means))

    print("********* Key Reuslts************\n")
    for event in event_types:
        for arg_type in arg_types:
            mean_txt, std_txt = mean_std_text(all_results.get(f'{event}_{arg_type}', []))
            print(event, arg_type, "Mean: ", mean_txt, "Std: ", std_txt)

    print("\nOverall:")
    mean_txt, std_txt = mean_std_text(all_results.get(overall_key, []))
    print(mdl_name, s_t, "Mean: ", mean_txt, "Std: ", std_txt)
    print("*************************************\n")
    return all_results

def read_json_file(name):
    """Load and return the parsed content of the JSON file at path ``name``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.

    Note: an earlier cell of this notebook also defines ``read_json_file``;
    once this cell has run, this definition is the one in effect.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(name, 'r', encoding='utf-8') as f:
        return json.load(f)



In [None]:
# Score the extractive-QA baseline at each similarity threshold.
mdl_name = 'extractive_qa_fine_tuned_squad'
similarity_threshold = [0.749, 0.99]
results_for_print = {}

for s_t in similarity_threshold:
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(s_t, mdl_name)
    key = f'{mdl_name}_{s_t}'
    value = print_check_score_2(s_t, mdl_name)
    results_for_print[key] = value



*************************
0.749 extractive_qa_fine_tuned_squad
********* Key Reuslts************

taking-moud main-arguments Mean:  4.9856 Std:  0.0000
taking-moud event-specific-arguments Mean:  19.2658 Std:  0.0000
taking-moud subject-effect-arguments Mean:  16.0842 Std:  0.0000
relapse main-arguments Mean:  14.9505 Std:  0.0000
relapse event-specific-arguments Mean:  29.1571 Std:  0.0000
relapse subject-effect-arguments Mean:  15.3909 Std:  0.0000
tapering main-arguments Mean:  19.9813 Std:  0.0000
tapering event-specific-arguments Mean:  18.6588 Std:  0.0000
tapering subject-effect-arguments Mean:  15.7460 Std:  0.0000

Overall:
extractive_qa_fine_tuned_squad 0.749 Mean:  17.1356 Std:  0.0000
*************************************


*************************
0.99 extractive_qa_fine_tuned_squad
********* Key Reuslts************

taking-moud main-arguments Mean:  1.2579 Std:  0.0000
taking-moud event-specific-arguments Mean:  7.8374 Std:  0.0000
taking-moud subject-effect-arguments M

## FLAN-T5-Base

In [None]:
# Score the FLAN-T5-Base generative-QA baseline at each similarity threshold.
mdl_name = 'generative-qa-flan-t5-base'
similarity_threshold = [0.749, 0.99]
results_for_print = {}

for s_t in similarity_threshold:
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(s_t, mdl_name)
    key = f'{mdl_name}_{s_t}'
    value = print_check_score_2(s_t, mdl_name)
    results_for_print[key] = value



*************************
0.749 generative-qa-flan-t5-base
********* Key Reuslts************

taking-moud main-arguments Mean:  34.9931 Std:  0.0000
taking-moud event-specific-arguments Mean:  37.2296 Std:  0.0000
taking-moud subject-effect-arguments Mean:  11.4086 Std:  0.0000
relapse main-arguments Mean:  25.3799 Std:  0.0000
relapse event-specific-arguments Mean:  20.3118 Std:  0.0000
relapse subject-effect-arguments Mean:  11.3780 Std:  0.0000
tapering main-arguments Mean:  38.7827 Std:  0.0000
tapering event-specific-arguments Mean:  40.9407 Std:  0.0000
tapering subject-effect-arguments Mean:  17.5392 Std:  0.0000

Overall:
generative-qa-flan-t5-base 0.749 Mean:  26.4404 Std:  0.0000
*************************************


*************************
0.99 generative-qa-flan-t5-base
********* Key Reuslts************

taking-moud main-arguments Mean:  4.8934 Std:  0.0000
taking-moud event-specific-arguments Mean:  25.2465 Std:  0.0000
taking-moud subject-effect-arguments Mean:  2.85

## FLAN-T5-Large

In [None]:
# Score the FLAN-T5-Large generative-QA baseline at each similarity threshold.
mdl_name = 'generative-qa-flan-t5-large'
similarity_threshold = [0.749, 0.99]
results_for_print = {}

for s_t in similarity_threshold:
    # Banner identifying the configuration whose scores follow.
    print("\n*************************")
    print(s_t, mdl_name)
    key = f'{mdl_name}_{s_t}'
    value = print_check_score_2(s_t, mdl_name)
    results_for_print[key] = value





*************************
0.749 generative-qa-flan-t5-large
********* Key Reuslts************

taking-moud main-arguments Mean:  41.7027 Std:  0.0000
taking-moud event-specific-arguments Mean:  45.6181 Std:  0.0000
taking-moud subject-effect-arguments Mean:  23.9258 Std:  0.0000
relapse main-arguments Mean:  38.4419 Std:  0.0000
relapse event-specific-arguments Mean:  27.4133 Std:  0.0000
relapse subject-effect-arguments Mean:  19.7963 Std:  0.0000
tapering main-arguments Mean:  44.0440 Std:  0.0000
tapering event-specific-arguments Mean:  51.9250 Std:  0.0000
tapering subject-effect-arguments Mean:  26.9329 Std:  0.0000

Overall:
generative-qa-flan-t5-large 0.749 Mean:  35.5333 Std:  0.0000
*************************************


*************************
0.99 generative-qa-flan-t5-large
********* Key Reuslts************

taking-moud main-arguments Mean:  5.1082 Std:  0.0000
taking-moud event-specific-arguments Mean:  32.3349 Std:  0.0000
taking-moud subject-effect-arguments Mean:  7