# Preprocessing STREET Data

Data for GSM8K and SCONE are downloaded from https://github.com/amazon-science/street-reasoning/tree/main/data.

In [None]:
# import packages
import json
import re

In [None]:
# preprocesses gsm8k dataset to apply EntailmentBank functions
def gsm8kFormatter(input_path: str, output_path: str):
  """Rewrite a raw GSM8K JSONL file into the layout used by the evaluation code.

  Every non-blank input line is parsed as one JSON example. For each example:

  * the keys 'context', 'options', 'rationale', 'metadata',
    'reasoning_graph_edges' and 'textual_logical_units' are dropped;
  * 'hypothesis' (and, finally, 'answer') become 'The answer is <answer>';
  * 'linearized_input' is renamed to 'context';
  * 'linearized_output' becomes 'proof', with the step concluding the
    hypothesis ('intN: <hypothesis>') replaced by the word 'hypothesis';
  * 'meta' gets the context sentences ('triples') and the intermediate
    conclusions of the proof; 'distractors' is left as an empty string.

  Examples are written only if the proof contains 'hypothesis' and does not
  mention 'sent0'. Examples lacking 'answer', 'linearized_input' or
  'linearized_output' are skipped as invalid.

  Args:
    input_path: path of the raw JSONL file to read.
    output_path: path of the JSONL file to (over)write.
  """
  # both handles are closed even if a line fails to parse
  with open(input_path) as infile, open(output_path, "w") as outfile:
    for line in infile:
      if not line.strip():
        continue  # tolerate blank lines such as a trailing newline
      ex = _gsm8k_convert(json.loads(line))
      if ex is None:
        continue
      json.dump(ex, outfile)
      outfile.write('\n')


# label that opens each sentence of a linearized context, e.g. "sent12: "
_GSM8K_SENT_LABEL = re.compile(r'\b(sent[0-9]+): ')
# label of an intermediate conclusion, e.g. "int3"
_GSM8K_INT_LABEL = re.compile(r'int[0-9]+')


def _gsm8k_convert(ex):
  """Convert one parsed example in place; return None if it must be dropped."""
  for k in ('context', 'options', 'rationale', 'metadata',
            'reasoning_graph_edges', 'textual_logical_units'):
    ex.pop(k, None)

  # without these fields no hypothesis, context or proof can be built
  if not all(k in ex for k in ('answer', 'linearized_input', 'linearized_output')):
    return None

  ex['hypothesis'] = 'The answer is ' + str(ex['answer'])
  ex['context'] = ex.pop('linearized_input')

  # the hypothesis is literal text, so escape it; step ids may have several digits
  final_step = r'int[0-9]+: ' + re.escape(ex['hypothesis'])
  ex['proof'] = re.sub(final_step, 'hypothesis', ex.pop('linearized_output'))

  ex['meta'] = {
    'triples': _gsm8k_sentences(ex['context']),
    'distractors': '',
    'intermediate_conclusions': '',
  }
  ex['question'] = ex['question'].replace('\n', '')
  ex['answer'] = ex['hypothesis']
  ex['meta']['intermediate_conclusions'] = _gsm8k_intermediates(ex['proof'])

  # remove invalid examples
  if 'hypothesis' not in ex['proof'] or 'sent0' in ex['proof']:
    return None
  return ex


def _gsm8k_sentences(context):
  """Map each 'sentN' label of a linearized context to its sentence text."""
  # split only at real labels so the word "sent" or a ": " inside a sentence
  # does not break the sentence apart
  marks = list(_GSM8K_SENT_LABEL.finditer(context))
  ends = [m.start() for m in marks[1:]] + [len(context)]
  return {m.group(1): context[m.end():end].strip() for m, end in zip(marks, ends)}


def _gsm8k_intermediates(proof):
  """Map each 'intN' label concluded by a proof step to its conclusion text."""
  found = {}
  for step in proof.split(";"):
    conclusion = step.split(" -> ")[-1]
    # partition keeps conclusion text that itself contains ": "
    label, sep, text = conclusion.partition(": ")
    label = label.strip()
    if not sep or 'hypothesis' in text:
      continue
    if not _GSM8K_INT_LABEL.fullmatch(label):
      continue
    found[label] = text.strip()
  return found

In [None]:
# convert the raw gsm8k train / dev / test splits into interim files
gsm8kFormatter(
  input_path='/content/sample_data/raw_gsm8k_train.jsonl',
  output_path='train_gsm8k_interim.jsonl',
)
gsm8kFormatter(
  input_path='/content/sample_data/raw_gsm8k_dev.jsonl',
  output_path='dev_gsm8k_interim.jsonl',
)
gsm8kFormatter(
  input_path='/content/sample_data/raw_gsm8k_test.jsonl',
  output_path='test_gsm8k_interim.jsonl',
)

In [None]:
# reduces gsm8k dataset to desired size
def gsm8kCutter(input_path: str, output_path: str, samples: int):
  """Copy the first ``samples`` examples of a JSONL file into a new file.

  Each kept line is parsed and re-serialized with ``json.dump``, one example
  per output line. Blank lines are ignored and do not count as examples.

  Args:
    input_path: path of the JSONL file to read.
    output_path: path of the JSONL file to (over)write.
    samples: maximum number of examples to keep; values <= 0 keep none.
  """
  # both handles are closed on exit, including when a line fails to parse
  with open(input_path) as infile, open(output_path, "w") as outfile:
    kept = 0
    for line in infile:
      # stop before touching lines beyond the requested sample count
      if kept >= samples:
        break
      if not line.strip():
        continue

      json.dump(json.loads(line), outfile)
      outfile.write('\n')
      kept += 1

In [None]:
# trim each interim gsm8k split to its target number of examples
gsm8kCutter(input_path='train_gsm8k_interim.jsonl', output_path='train_gsm8k.jsonl', samples=250)
gsm8kCutter(input_path='dev_gsm8k_interim.jsonl', output_path='dev_gsm8k.jsonl', samples=50)
gsm8kCutter(input_path='test_gsm8k_interim.jsonl', output_path='test_gsm8k.jsonl', samples=75)

In [None]:
# helper function for reduced_proof function
# used to find relevant proof steps for desired portion of hypothesis
def recursion(steps, reverse_implication, consequent_step, antecedents):
  """Append to ``steps`` every proof step the given antecedents depend on.

  Starting from the labels in ``antecedents`` that contain "int", follows
  ``reverse_implication`` backwards and appends the stripped text of each
  reached step to ``steps`` (depth-first, in antecedent order). Labels without
  "int" are leaves and are ignored.

  Each label is expanded at most once per call, so an intermediate shared by
  several steps is not appended repeatedly, and a cyclic or very deep proof
  terminates instead of exhausting the call stack.

  Args:
    steps: list that is extended in place with the step texts found.
    reverse_implication: maps a consequent label to its antecedent labels.
    consequent_step: maps a consequent label to the full text of its step.
    antecedents: labels to start the search from.

  Raises:
    KeyError: if an "int" label has no entry in the two dictionaries.
  """
  visited = set()
  # explicit stack; reversed so labels are popped in their original order
  pending = [label for label in reversed(antecedents) if "int" in label]

  while pending:
    label = pending.pop()
    if label in visited:
      continue
    visited.add(label)

    steps.append(consequent_step[label].strip())
    pending.extend(
      parent for parent in reversed(reverse_implication[label]) if "int" in parent
    )

In [None]:
# used to find relevant proof steps for desired portion of hypothesis
# type is alchemy, scene, or tangram
def reduced_proof(text, problem_type):
  """Reduce a linearized proof to the steps about the final step's subject.

  ``text`` is a ';'-separated list of steps of the form
  'a & b -> intN: conclusion'. The subject of the last step is taken as the
  key; every step whose conclusion has the same subject is kept, together with
  all steps its antecedents depend on (collected with ``recursion``).

  How the subject is read from a conclusion depends on ``problem_type``:

  * 'alchemy': the first space-separated token after the 'intN: ' label;
  * 'scene' / 'tangram': the text after the last ':' and before 'has',
    stripped, followed by one space.

  Args:
    text: the linearized proof.
    problem_type: 'alchemy', 'scene' or 'tangram'.

  Returns:
    A tuple ``(key, output)``: the subject of the final step, and the kept
    steps in their original order, each followed by '; '.

  Raises:
    ValueError: if ``problem_type`` is not one of the three known types or
      ``text`` contains no 'antecedents -> conclusion' step.
  """
  # NOTE: a bare `problem_type == 'scene' or 'tangram'` is always true and
  # would silently apply the scene rule to alchemy proofs as well
  if problem_type not in ('alchemy', 'scene', 'tangram'):
    raise ValueError("unknown problem type: %r" % (problem_type,))

  def subject_of(conclusion):
    # the same rule is used for the key and for every step, so the final step
    # always matches its own key
    if problem_type == 'alchemy':
      return conclusion.partition(": ")[2].split(" ")[0]
    return conclusion.split(":")[-1].split('has')[0].strip() + ' '

  # parse every step once: (step text, antecedent labels, consequent label, subject)
  parsed = []
  for raw in text.split(";"):
    lhs, arrow, conclusion = raw.partition(" -> ")
    if not arrow:
      continue  # empty or malformed segment, e.g. after the trailing ';'
    parsed.append((
      raw.strip(),
      lhs.strip().split(" & "),
      conclusion.split(": ")[0].strip(),
      subject_of(conclusion),
    ))

  if not parsed:
    raise ValueError("proof contains no 'antecedents -> conclusion' step")

  # the subject of the last real step, whether or not text ends with ';'
  key = parsed[-1][3]

  reverse_implication = {} # key: consequent; value: antecedent labels
  consequent_step = {} # key: consequent; value: entire implication step
  # walk backwards so that, for a repeated label, the earliest step wins
  for step_text, antecedents, consequent, _ in reversed(parsed):
    reverse_implication[consequent] = antecedents
    consequent_step[consequent] = step_text

  # collect every step about the key plus everything those steps rely on
  steps = []
  for step_text, _, consequent, subject in reversed(parsed):
    if subject == key:
      steps.append(step_text)
      recursion(steps, reverse_implication, consequent_step, reverse_implication[consequent])

  # emit the kept steps in their original order
  relevant = set(steps)
  output = "".join(
    step_text + "; " for step_text, _, _, _ in parsed if step_text in relevant
  )

  return key, output

In [21]:
# preprocesses scone dataset to apply EntailmentBank functions
def sconeFormatter(input_path: str, output_path: str):
  """Rewrite a raw SCONE JSONL file into the layout used by the evaluation code.

  Every non-blank input line is parsed as one JSON example. Examples whose
  'id' contains none of 'ALCHEMY', 'SCENE', 'TANGRAM' are skipped. For the rest:

  * the keys 'context', 'options', 'rationale', 'textual_logical_units',
    'metadata' and 'reasoning_graph_edges' are dropped;
  * 'linearized_input' is renamed to 'context';
  * 'linearized_output' is reduced with ``reduced_proof`` and stored as 'proof';
  * 'hypothesis' (and, finally, 'answer') become the ';'-separated clause of
    the original answer that contains the key returned by ``reduced_proof``;
  * the step 'intN: <hypothesis>' in the proof is replaced by 'hypothesis';
  * 'meta' gets the context sentences ('triples'), the intermediate
    conclusions of the proof, and the sentence labels that the proof never
    uses ('distractors').

  Examples are dropped when no answer clause contains the key, or when
  'hypothesis' occurs more than once in the rewritten proof.

  Args:
    input_path: path of the raw JSONL file to read.
    output_path: path of the JSONL file to (over)write.
  """
  # both handles are closed even if a line fails to parse
  with open(input_path) as infile, open(output_path, "w") as outfile:
    for line in infile:
      if not line.strip():
        continue  # tolerate blank lines such as a trailing newline
      ex = json.loads(line)

      # casework by type of problem; erroneous ids are skipped
      problem_type = next(
        (name.lower() for name in ('ALCHEMY', 'SCENE', 'TANGRAM') if name in ex['id']),
        None,
      )
      if problem_type is None:
        continue

      # delete irrelevant keys
      for k in ('context', 'options', 'rationale', 'textual_logical_units',
                'metadata', 'reasoning_graph_edges'):
        ex.pop(k, None)

      # modify context
      ex['context'] = ex.pop('linearized_input')

      # modify proof, extract last beaker or position
      key, ex['proof'] = reduced_proof(ex.pop('linearized_output'), problem_type)

      # hypothesis is the answer clause about that beaker or position;
      # an example without such a clause is faulty and is removed
      if 'answer' not in ex:
        continue
      matching = [h.strip() for h in ex['answer'].split(";") if key in h]
      if not matching:
        continue
      ex['hypothesis'] = matching[-1]
      del ex['answer']

      # modify proof so that last step is hypothesis; the hypothesis is
      # literal text and must not be interpreted as a regular expression
      final_step = r'int[0-9]+: ' + re.escape(ex['hypothesis'])
      ex['proof'] = re.sub(final_step, 'hypothesis', ex['proof'])

      # remove faulty example when hypothesis is intermediary step
      if ex['proof'].count('hypothesis') > 1:
        continue

      # modify meta and answer for evaluation
      sentences = _scone_sentences(ex['context'])
      ex['meta'] = {'triples': sentences, 'distractors': '', 'intermediate_conclusions': ''}

      ex['question'] = ex['question'].replace('\n', '')
      ex['answer'] = ex['hypothesis']

      ex['meta']['intermediate_conclusions'] = _scone_intermediates(ex['proof'])

      # distractors are context sentences the proof never refers to
      ex['meta']['distractors'] = [
        label for label in sentences if label + " " not in ex['proof']
      ]

      json.dump(ex, outfile)
      outfile.write('\n')


# label that opens each sentence of a linearized context, e.g. "sent12: "
_SCONE_SENT_LABEL = re.compile(r'\b(sent[0-9]+): ')
# label of an intermediate conclusion, e.g. "int3"
_SCONE_INT_LABEL = re.compile(r'int[0-9]+')


def _scone_sentences(context):
  """Map each 'sentN' label of a linearized context to its sentence text."""
  # split only at real labels so the word "sent" or a ": " inside a sentence
  # does not break the sentence apart
  marks = list(_SCONE_SENT_LABEL.finditer(context))
  ends = [m.start() for m in marks[1:]] + [len(context)]
  return {m.group(1): context[m.end():end].strip() for m, end in zip(marks, ends)}


def _scone_intermediates(proof):
  """Map each 'intN' label concluded by a proof step to its conclusion text."""
  found = {}
  for step in proof.split(";"):
    conclusion = step.split(" -> ")[-1]
    # partition keeps conclusion text that itself contains ": "
    label, sep, text = conclusion.partition(": ")
    label = label.strip()
    if not sep or 'hypothesis' in text:
      continue
    if not _SCONE_INT_LABEL.fullmatch(label):
      continue
    found[label] = text.strip()
  return found

In [22]:
# convert the raw scone train / dev / test splits into interim files
sconeFormatter(
  input_path='/content/sample_data/raw_scone_train.jsonl',
  output_path='train_scone_interim.jsonl',
)
sconeFormatter(
  input_path='/content/sample_data/raw_scone_dev.jsonl',
  output_path='dev_scone_interim.jsonl',
)
sconeFormatter(
  input_path='/content/sample_data/raw_scone_test.jsonl',
  output_path='test_scone_interim.jsonl',
)

In [23]:
# reduces scone dataset to desired size
def sconeCutter(input_path: str, output_path: str, samples):
  """Copy a SCONE JSONL file, keeping at most ``samples`` examples per type.

  An example's type is the first of 'ALCHEMY', 'SCENE', 'TANGRAM' found in
  its 'id'. The first ``samples`` examples of each type are re-serialized to
  the output in input order; later ones are dropped. Examples matching none
  of the three types are always copied. Blank lines are ignored.

  Args:
    input_path: path of the JSONL file to read.
    output_path: path of the JSONL file to (over)write.
    samples: maximum number of examples to keep for each problem type.
  """
  # running count of examples seen per problem type
  seen = {'ALCHEMY': 0, 'SCENE': 0, 'TANGRAM': 0}

  # both handles are closed on exit, including when a line fails to parse
  with open(input_path) as infile, open(output_path, "w") as outfile:
    for line in infile:
      if not line.strip():
        continue
      ex = json.loads(line)

      kind = next((name for name in seen if name in ex['id']), None)
      if kind is not None:
        seen[kind] += 1
        if seen[kind] > samples:
          continue

      json.dump(ex, outfile)
      outfile.write('\n')

In [24]:
# trim each interim scone split to its target number of examples per problem type
# the interim files are read from the same relative paths they were written to,
# so this cell does not depend on the working directory being /content
sconeCutter('train_scone_interim.jsonl', 'train_scone.jsonl', 250)
sconeCutter('dev_scone_interim.jsonl', 'dev_scone.jsonl', 50)
sconeCutter('test_scone_interim.jsonl', 'test_scone.jsonl', 75)