<a href="https://colab.research.google.com/github/meghorikawa/ULLM/blob/main/Extract_values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json

In [None]:
# Load the COT completion CSVs for the two tasks into module-level dataframes.
athletes = pd.read_csv("/content/athletes_df_COT_completions.csv") # also load FS_completions csv to parse
bday = pd.read_csv("/content/bday_df_COT_completions.csv") # also need to load FS_completions csv to parse

# Property names looked up in each parsed JSON response and used as column-name prefixes.
athletes_properties = ["relevance", "factual", "construct_present"] # Athletes task requires factuality property
# NOTE(review): the key below is "construct" while the athletes list uses "construct_present" —
# confirm both match the keys actually present in the model's JSON output.
bday_properties = ["relevance", "construct"] # Birthday task requires no factuality checks

In [None]:
def create_columns(df, properties=None, strategies=("FS", "COT"), n_shots=6):
  """
  Add one empty-string column per property/prompting-strategy/k combination.

  Columns are named "{property}_{strategy}_{k}" and are added to `df` in place.
  An existing column with the same name is overwritten with empty strings.

  Args:
    df: dataframe to extend (modified in place).
    properties: property names to create columns for. When None, falls back to
      the module-level `athletes_properties` if `df` is the module-level
      `athletes` object itself, and to `bday_properties` otherwise.
    strategies: prompting-strategy labels used in the column names.
    n_shots: number of k values; columns are created for k = 0 .. n_shots - 1.

  Returns:
    None.
  """
  if properties is None:
    # Identity check, not equality: a copy or filtered view of `athletes` is
    # treated as the birthday frame. Pass `properties` explicitly to avoid that.
    properties = athletes_properties if df is athletes else bday_properties

  for strat in strategies:
    for prop in properties:
      for k in range(n_shots):
        df[f"{prop}_{strat}_{k}"] = ""

In [None]:
# Add the empty per-property result columns to both task dataframes (in place).
create_columns(athletes)
create_columns(bday)

In [None]:
def fill_out_df(df, properties=None):
  """
  Fill out the dataframe with the values present in the LLM's JSON responses.

  Every column whose name matches "(ComparingAthletes|BirthdayPresent)_(FS|COT)_[0-9]"
  is treated as holding a JSON string. For each cell, the JSON is parsed and the
  value of each property is written to the column "{property}_{strategy}_{k}",
  where strategy and k are the second and third "_"-separated parts of the
  source column name. `df` is modified in place.

  If a cell cannot be parsed or a property is missing, the raw cell is replaced
  with "" and the error, row label, original content and column name are printed.
  Properties written before the failure stay in place.

  Args:
    df: dataframe containing the JSON completion columns.
    properties: property names to extract. When None, falls back to the
      module-level `athletes_properties` if `df` is the module-level `athletes`
      object itself, and to `bday_properties` otherwise.

  Returns:
    None.
  """
  if properties is None:
    # Identity check, not equality: a copy or filtered view of `athletes` is
    # treated as the birthday frame. Pass `properties` explicitly to avoid that.
    properties = athletes_properties if df is athletes else bday_properties

  # The set of JSON columns and their strategy/k parts do not change while
  # filling, so resolve them once instead of once per row.
  json_cols = []
  for col in df.filter(regex="(ComparingAthletes|BirthdayPresent)_(FS|COT)_[0-9]").columns:
    _task, strat, k = col.split("_")[:3]
    json_cols.append((col, strat, k))

  for r_idx in df.index:
    for col, strat, k in json_cols:
      # Keep the original content so it can still be reported after the cell is blanked.
      raw = df.loc[r_idx, col]
      try:
        # Single quotes inside JSON strings need no escaping, so the text is parsed as-is.
        lm_resp_json = json.loads(raw)

        # extract the properties from the JSON and put them in the correct column
        for prop in properties:
          df.loc[r_idx, f"{prop}_{strat}_{k}"] = lm_resp_json[prop]

      except Exception as e:
        # Deliberately broad: one malformed completion must not abort the whole
        # run. The cell is blanked and the problem is reported for examination.
        df.loc[r_idx, col] = ""
        print(e)
        print("Could not parse row: ", r_idx, "\n\nContent:", raw, "\n\n", col, "\n____________\n")



In [None]:
from json.decoder import JSONDecodeError
def clean_faulty_json(json_str, properties, df, r_idx, col):
  """
  Try to parse `json_str` as JSON, reporting content that cannot be parsed.

  On a JSON decode error the error is printed and the text is split on
  "thought". If that split itself fails (e.g. the input is bytes rather than
  str), the cell `df.loc[r_idx, col]` is set to "" and the row label, content
  and column name are printed.

  Args:
    json_str: raw completion text to parse.
    properties: currently unused.
    df: dataframe holding the completion; only written to when the fallback fails.
    r_idx: row label of the cell being cleaned.
    col: column label of the cell being cleaned.

  Returns:
    The parsed JSON value, or None when `json_str` is not valid JSON.
  """
  try:
    return json.loads(json_str)
  except JSONDecodeError as e: # If there is a problem parsing the JSON, print the error for examination
    print(e)

  try:
    # NOTE(review): the recovery step looks unfinished — the split result is not
    # used yet. It is kept so the failure path below behaves as before.
    single_quoted_val = json_str.split("thought")
  except TypeError:
    df.loc[r_idx, col] = ""
    print("Could not parse row: ", r_idx, "\n\nContent:", json_str, "\n\n", col, "\n____________\n")
  return None


In [None]:
# Parse the JSON completions of the athletes dataframe into separate columns (in place)
fill_out_df(athletes)

In [None]:
# Parse the JSON completions of the birthday dataframe into separate columns (in place)
fill_out_df(bday)

In [None]:
# Save the parsed athletes dataframe to CSV.
# NOTE(review): no index=False is passed, so the row index is presumably written
# as an extra column — confirm that is wanted.
athletes.to_csv("/content/athletes_df_COT_parsed.csv")


In [None]:
# Save the parsed birthday dataframe to CSV.
# NOTE(review): no index=False is passed, so the row index is presumably written
# as an extra column — confirm that is wanted.
bday.to_csv("/content/bday_df_COT_parsed.csv")