In [1]:
import json
from pathlib import Path
from pprint import pprint

In [2]:
from collabmem.constants import REPO_ROOT

In [3]:
# Path of the lic evaluation data file, resolved relative to the repository root.
lic_data_path = REPO_ROOT.joinpath(
    "src", "lic", "data", "sharded_instructions_600.json"
)

In [4]:
# Read the whole evaluation set into memory. The encoding is pinned to UTF-8 so
# the result does not depend on the platform's locale default.
with open(lic_data_path, "r", encoding="utf-8") as f:
    lic_data = json.load(f)

In [5]:
# Number of records loaded (the saved output shows 627, although the filename says 600).
len(lic_data)

627

In [6]:
# Field names of the first record; records of other kinds carry different fields
# (compare the livecodebench example's keys further down).
lic_data[0].keys()

dict_keys(['task_id', 'prompt', 'test', 'public_test_cases', 'metadata', 'source', 'shards', 'task'])

In [7]:
# Print the first record's prompt as plain text so its newlines are rendered.
print(lic_data[0]["prompt"])


def by_length(arr):
    """
    Given an array of integers, sort the integers that are between 1 and 9 inclusive,
    reverse the resulting array, and then replace each digit by its corresponding name from
    "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine".

    For example:
      arr = [2, 1, 1, 4, 5, 8, 2, 3]   
            -> sort arr -> [1, 1, 2, 2, 3, 4, 5, 8] 
            -> reverse arr -> [8, 5, 4, 3, 2, 2, 1, 1]
      return ["Eight", "Five", "Four", "Three", "Two", "Two", "One", "One"]
    
      If the array is empty, return an empty array:
      arr = []
      return []
    
      If the array has any strange number ignore it:
      arr = [1, -1 , 55] 
            -> sort arr -> [-1, 1, 55]
            -> reverse arr -> [55, 1, -1]
      return = ['One']
    """



In [8]:
# Collect the distinct task labels present in the dataset.
# A set comprehension already yields a set, so no extra set() call is needed.
unique_tasks = {item["task"] for item in lic_data}
print(unique_tasks)

{'math', 'data2text', 'code', 'database', 'actions', 'summary'}


In [9]:
def get_example_tasks(
    task: str, skip: int = 0, data: "list[dict] | None" = None
) -> "dict | None":
    """Return the first record whose ``"task"`` field equals ``task``.

    Args:
        task: Task label to look for. The labels seen in this notebook's data
            are 'code', 'summary', 'data2text', 'database', 'math' and 'actions'.
        skip: Number of leading matches to pass over before returning one.
            Values <= 0 return the first match.
        data: Records to search. When omitted, the notebook-level ``lic_data``
            list is used.

    Returns:
        The matching record, or ``None`` when fewer than ``skip + 1`` records
        have the requested task.
    """
    # Resolve the global lazily so callers passing `data` do not need it defined.
    records = lic_data if data is None else data
    remaining = skip
    for record in records:
        if record["task"] != task:
            continue
        if remaining > 0:
            # Still consuming the matches the caller asked to skip.
            remaining -= 1
            continue
        return record
    # Explicit so the "no match" outcome is visible to readers and callers.
    return None

In [10]:
# First 'math' record: it carries question / answer / shards fields.
pprint(get_example_tasks("math"))

{'answer': 'Forty games every year translates to 40*4 = <<40*4=160>>160 games '
           'in four years.\n'
           'If Ara scores 21 points in each game, she has scored 160*21 = '
           '<<21*160=3360>>3360 points\n'
           '#### 3360',
 'question': 'Ara joined the school basketball team four years ago. She has '
             'been playing 40 games every year. If her score for every game is '
             '21 points, calculate the total number of points she has scored '
             'in the four years.',
 'shards': [{'shard': "what's Ara's total basketball score over four years?",
             'shard_id': 1},
            {'shard': 'Ara has been on the school basketball team for four '
                      'years now',
             'shard_id': 2},
            {'shard': 'Ara plays 40 games per year on average', 'shard_id': 3},
            {'shard': 'she scores 21 points in each game', 'shard_id': 4}],
 'task': 'math',
 'task_id': 'sharded-GSM8K/1246'}


In [11]:
# First 'code' record: its prompt is the by_length function stub.
pprint(get_example_tasks("code"))

{'metadata': {'func_name': 'by_length'},
 'prompt': '\n'
           'def by_length(arr):\n'
           '    """\n'
           '    Given an array of integers, sort the integers that are between '
           '1 and 9 inclusive,\n'
           '    reverse the resulting array, and then replace each digit by '
           'its corresponding name from\n'
           '    "One", "Two", "Three", "Four", "Five", "Six", "Seven", '
           '"Eight", "Nine".\n'
           '\n'
           '    For example:\n'
           '      arr = [2, 1, 1, 4, 5, 8, 2, 3]   \n'
           '            -> sort arr -> [1, 1, 2, 2, 3, 4, 5, 8] \n'
           '            -> reverse arr -> [8, 5, 4, 3, 2, 2, 1, 1]\n'
           '      return ["Eight", "Five", "Four", "Three", "Two", "Two", '
           '"One", "One"]\n'
           '    \n'
           '      If the array is empty, return an empty array:\n'
           '      arr = []\n'
           '      return []\n'
           '    \n'
           '      If the array

In [12]:
# First 'actions' record: a chat-style fully_specified_question plus a list of function descriptions.
pprint(get_example_tasks("actions"))

{'fully_specified_question': [[{'content': 'You are given two sets of data, '
                                           'the first set is [12, 15, 11, 14, '
                                           '18, 19, 13, 14, 15, 16, 17, 18, '
                                           '19, 20, 21, 22, 23, 24, 25, 26] '
                                           'and the second set is [32, 35, 31, '
                                           '34, 38, 39, 33, 34, 35, 36, 37, '
                                           '38, 39, 40, 41, 42, 43, 44, 45, '
                                           '46]. Can you create two histograms '
                                           "using the 'create_histogram' "
                                           'function, one for each data set, '
                                           'with 5 bins each?',
                                'role': 'user'}]],
 'function': [{'description': 'Create a histogram based on provided data.',
               'name': 

In [None]:
# First 'database' record: question text, reference_sql, db_id and schema_sql.
pprint(get_example_tasks("database"))

{'db_id': 'tvshow',
 'fully_specified_question': "which countries' tv channels are playing some "
                             'cartoon written by Todd Casey?',
 'reference_sql': 'SELECT T1.country FROM TV_Channel AS T1 JOIN cartoon AS T2 '
                  "ON T1.id = T2.Channel WHERE T2.written_by  =  'Todd Casey'",
 'schema_sql': 'CREATE TABLE TV_Channel (\n'
               '    id TEXT   PRIMARY KEY,\n'
               '    series_name TEXT,\n'
               '    Country TEXT,\n'
               '    Language TEXT,\n'
               '    Content TEXT,\n'
               '    Pixel_aspect_ratio_PAR TEXT,\n'
               '    Hight_definition_TV TEXT,\n'
               '    Pay_per_view_PPV TEXT,\n'
               '    Package_Option TEXT,\n'
               '    UNIQUE (id)\n'
               ');\n'
               '\n'
               'CREATE TABLE TV_series (\n'
               '    id REAL   PRIMARY KEY,\n'
               '    Episode TEXT,\n'
               '    Air_Date TEXT,\n'
  

In [14]:
# First 'data2text' record (the saved output begins with its fewshot_descriptions field).
pprint(get_example_tasks("data2text"))

{'fewshot_descriptions': "Hagino's fellow countryman Daiya Seto captured the "
                         'bronze in 4:09.71, to give Japan two swimmers on the '
                         'same Olympic podium.\n'
                         'In 2017, Stamenković appeared in the Kazakhstan '
                         'Premier League for Irtysh Pavlodar.\n'
                         'Mr S.C. Behar was the Director General of RCVP '
                         'Noronha Academy of Administration from 01.04.1997 '
                         'until 31.01.1999.\n'
                         'In 2014 United States Senate election in '
                         'Mississippi, Cochran and McDaniel received 49.0% and '
                         '49.5% of the votes.\n'
                         'Óengus mac Nad Froích was the King of Munster until '
                         '489.\n'
                         'They reached a first innings score of 195, a total '
                         'propped up by a score of 54 by 

In [14]:
# Distinct task_id prefixes (the text before the first "/") among the code tasks.
all_code_task_id_prefixes = {
    record["task_id"].partition("/")[0]
    for record in lic_data
    if record["task"] == "code"
}
print(all_code_task_id_prefixes)

{'sharded-HumanEval', 'sharded-livecodebench'}


In [19]:
# First code task whose task_id carries the livecodebench prefix (None if there is none).
lcb_example = next(
    (
        record
        for record in lic_data
        if record["task"] == "code"
        and record["task_id"].startswith("sharded-livecodebench/")
    ),
    None,
)

In [22]:
# Fields of the livecodebench record. Note there is no 'prompt' key here;
# the problem statement is stored under 'question_content'.
lcb_example.keys()

dict_keys(['question_title', 'question_content', 'platform', 'question_id', 'contest_id', 'contest_date', 'starter_code', 'difficulty', 'public_test_cases', 'private_test_cases', 'metadata', 'task_id', 'source', 'shards', 'task'])

In [28]:
# Print the livecodebench example's problem statement as plain text.
print(lcb_example["question_content"])

You are given a 0-indexed array of strings details. Each element of details provides information about a given passenger compressed into a string of length 15. The system is such that:

The first ten characters consist of the phone number of passengers.
The next character denotes the gender of the person.
The following two characters are used to indicate the age of the person.
The last two characters determine the seat allotted to that person.

Return the number of passengers who are strictly more than 60 years old.
 
Example 1:

Input: details = ["7868190130M7522","5303914400F9211","9273338290F4010"]
Output: 2
Explanation: The passengers at indices 0, 1, and 2 have ages 75, 92, and 40. Thus, there are 2 people who are over 60 years old.

Example 2:

Input: details = ["1313579440F2036","2921522980M5644"]
Output: 0
Explanation: None of the passengers are older than 60.

 
Constraints:

1 <= details.length <= 100
details[i].length == 15
details[i] consists of digits from '0' to '9'.
deta

In [25]:
# Take the first 'code' record and print its prompt as plain text.
# NOTE(review): this prompt matches the HumanEval/105 prompt displayed further
# down, so this is presumably a HumanEval-derived item — confirm via its task_id.
he_example = get_example_tasks("code")
print(he_example["prompt"])


def by_length(arr):
    """
    Given an array of integers, sort the integers that are between 1 and 9 inclusive,
    reverse the resulting array, and then replace each digit by its corresponding name from
    "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine".

    For example:
      arr = [2, 1, 1, 4, 5, 8, 2, 3]   
            -> sort arr -> [1, 1, 2, 2, 3, 4, 5, 8] 
            -> reverse arr -> [8, 5, 4, 3, 2, 2, 1, 1]
      return ["Eight", "Five", "Four", "Three", "Two", "Two", "One", "One"]
    
      If the array is empty, return an empty array:
      arr = []
      return []
    
      If the array has any strange number ignore it:
      arr = [1, -1 , 55] 
            -> sort arr -> [-1, 1, 55]
            -> reverse arr -> [55, 1, -1]
      return = ['One']
    """



In [27]:
# Full record for the same example, including its metadata.
pprint(he_example)

{'metadata': {'func_name': 'by_length'},
 'prompt': '\n'
           'def by_length(arr):\n'
           '    """\n'
           '    Given an array of integers, sort the integers that are between '
           '1 and 9 inclusive,\n'
           '    reverse the resulting array, and then replace each digit by '
           'its corresponding name from\n'
           '    "One", "Two", "Three", "Four", "Five", "Six", "Seven", '
           '"Eight", "Nine".\n'
           '\n'
           '    For example:\n'
           '      arr = [2, 1, 1, 4, 5, 8, 2, 3]   \n'
           '            -> sort arr -> [1, 1, 2, 2, 3, 4, 5, 8] \n'
           '            -> reverse arr -> [8, 5, 4, 3, 2, 2, 1, 1]\n'
           '      return ["Eight", "Five", "Four", "Three", "Two", "Two", '
           '"One", "One"]\n'
           '    \n'
           '      If the array is empty, return an empty array:\n'
           '      arr = []\n'
           '      return []\n'
           '    \n'
           '      If the array

In [39]:
# Re-inspect the first 'actions' record (same call as an earlier cell).
pprint(get_example_tasks("actions"))

{'fully_specified_question': [[{'content': 'You are given two sets of data, '
                                           'the first set is [12, 15, 11, 14, '
                                           '18, 19, 13, 14, 15, 16, 17, 18, '
                                           '19, 20, 21, 22, 23, 24, 25, 26] '
                                           'and the second set is [32, 35, 31, '
                                           '34, 38, 39, 33, 34, 35, 36, 37, '
                                           '38, 39, 40, 41, 42, 43, 44, 45, '
                                           '46]. Can you create two histograms '
                                           "using the 'create_histogram' "
                                           'function, one for each data set, '
                                           'with 5 bins each?',
                                'role': 'user'}]],
 'function': [{'description': 'Create a histogram based on provided data.',
               'name': 

In [40]:
# Keep the first 'actions' record in a variable for closer inspection.
actions_example = get_example_tasks("actions")

In [44]:
# Bare expression: displays the record with its keys in stored order
# (the pprint output above lists them alphabetically).
actions_example

{'task_id': 'sharded-BFCL/parallel_129',
 'function': [{'name': 'create_histogram',
   'description': 'Create a histogram based on provided data.',
   'parameters': {'type': 'dict',
    'properties': {'data': {'type': 'array',
      'items': {'type': 'integer'},
      'description': 'The data for which histogram needs to be plotted.'},
     'bins': {'type': 'integer',
      'description': 'The number of equal-width bins in the range. Default is 10.'}},
    'required': ['data', 'bins']}}],
 'fully_specified_question': [[{'role': 'user',
    'content': "You are given two sets of data, the first set is [12, 15, 11, 14, 18, 19, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26] and the second set is [32, 35, 31, 34, 38, 39, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]. Can you create two histograms using the 'create_histogram' function, one for each data set, with 5 bins each?"}]],
 'language': 'Python',
 'test_category': 'parallel',
 'shards': [{'shard_id': 1, 'shard': 'Cr

In [46]:
# Keep the first 'database' record in a variable for closer inspection.
database_example = get_example_tasks("database")

In [47]:
# Bare expression: displays the record, including task_id and spider_difficulty.
database_example

{'task_id': 'sharded-spider-val-633-medium',
 'spider_difficulty': 'medium',
 'fully_specified_question': "which countries' tv channels are playing some cartoon written by Todd Casey?",
 'reference_sql': "SELECT T1.country FROM TV_Channel AS T1 JOIN cartoon AS T2 ON T1.id = T2.Channel WHERE T2.written_by  =  'Todd Casey'",
 'db_id': 'tvshow',
 'schema_sql': 'CREATE TABLE TV_Channel (\n    id TEXT   PRIMARY KEY,\n    series_name TEXT,\n    Country TEXT,\n    Language TEXT,\n    Content TEXT,\n    Pixel_aspect_ratio_PAR TEXT,\n    Hight_definition_TV TEXT,\n    Pay_per_view_PPV TEXT,\n    Package_Option TEXT,\n    UNIQUE (id)\n);\n\nCREATE TABLE TV_series (\n    id REAL   PRIMARY KEY,\n    Episode TEXT,\n    Air_Date TEXT,\n    Rating TEXT,\n    Share REAL,\n    18_49_Rating_Share TEXT,\n    Viewers_m TEXT,\n    Weekly_Rank REAL,\n    Channel TEXT,\n    UNIQUE (id),\n    FOREIGN KEY (Channel) REFERENCES TV_Channel(id)\n);\n\nCREATE TABLE Cartoon (\n    id REAL   PRIMARY KEY,\n    Title T

In [None]:
# preprocess gsm8k

# first load in file
def preprocess_gsm8k(json_file: Path) -> list[dict]:
    """Extract the math records from a sharded-instructions JSON file.

    Args:
        json_file: Path to a JSON file containing a list of task records.

    Returns:
        One dict per record whose ``"task"`` is ``"math"``, in file order, with
        the keys:

        * ``prompt`` - the record's ``question``
        * ``completion`` - the record's ``answer``
        * ``source_task_id`` - the record's ``task_id`` with ``"sharded-GSM8K"``
          replaced by ``"gsm8k"`` (``"sharded-GSM8K/{number}"`` becomes
          ``"gsm8k/{number}"``)

    Raises:
        KeyError: If a record lacks ``task``, or a math record lacks
            ``question``, ``answer`` or ``task_id``.
    """
    # Pin the encoding so parsing does not depend on the platform's locale default.
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    return [
        {
            "prompt": item["question"],
            "completion": item["answer"],
            "source_task_id": item["task_id"].replace("sharded-GSM8K", "gsm8k"),
        }
        for item in data
        if item["task"] == "math"
    ]

In [27]:
# Re-inspect the first 'database' record (same call as an earlier cell).
pprint(get_example_tasks("database"))

{'db_id': 'tvshow',
 'fully_specified_question': "which countries' tv channels are playing some "
                             'cartoon written by Todd Casey?',
 'reference_sql': 'SELECT T1.country FROM TV_Channel AS T1 JOIN cartoon AS T2 '
                  "ON T1.id = T2.Channel WHERE T2.written_by  =  'Todd Casey'",
 'schema_sql': 'CREATE TABLE TV_Channel (\n'
               '    id TEXT   PRIMARY KEY,\n'
               '    series_name TEXT,\n'
               '    Country TEXT,\n'
               '    Language TEXT,\n'
               '    Content TEXT,\n'
               '    Pixel_aspect_ratio_PAR TEXT,\n'
               '    Hight_definition_TV TEXT,\n'
               '    Pay_per_view_PPV TEXT,\n'
               '    Package_Option TEXT,\n'
               '    UNIQUE (id)\n'
               ');\n'
               '\n'
               'CREATE TABLE TV_series (\n'
               '    id REAL   PRIMARY KEY,\n'
               '    Episode TEXT,\n'
               '    Air_Date TEXT,\n'
  

In [15]:
from collabmem.constants import LIC_MINI_EVAL_PATH

In [16]:
# Load the mini evaluation subset. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's locale default.
with open(LIC_MINI_EVAL_PATH, "r", encoding="utf-8") as f:
    mini_eval_data = json.load(f)

In [17]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Load the HumanEval dataset by its hub id via `datasets.load_dataset`
# (the saved output shows a 164-example test split being generated).
he_dataset = load_dataset("openai/openai_humaneval")

Generating test split: 100%|██████████| 164/164 [00:00<00:00, 3316.76 examples/s]


In [None]:
# Row 105 of the test split; in the saved output its task_id is 'HumanEval/105',
# i.e. for this row the index equals the problem number.
he_dataset["test"][105]

{'task_id': 'HumanEval/105',
 'prompt': '\ndef by_length(arr):\n    """\n    Given an array of integers, sort the integers that are between 1 and 9 inclusive,\n    reverse the resulting array, and then replace each digit by its corresponding name from\n    "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine".\n\n    For example:\n      arr = [2, 1, 1, 4, 5, 8, 2, 3]   \n            -> sort arr -> [1, 1, 2, 2, 3, 4, 5, 8] \n            -> reverse arr -> [8, 5, 4, 3, 2, 2, 1, 1]\n      return ["Eight", "Five", "Four", "Three", "Two", "Two", "One", "One"]\n    \n      If the array is empty, return an empty array:\n      arr = []\n      return []\n    \n      If the array has any strange number ignore it:\n      arr = [1, -1 , 55] \n            -> sort arr -> [-1, 1, 55]\n            -> reverse arr -> [55, 1, -1]\n      return = [\'One\']\n    """\n',
 'canonical_solution': '    dic = {\n        1: "One",\n        2: "Two",\n        3: "Three",\n        4: "Four",\n      

In [22]:
# List the task_id of every code task in the mini eval set, one per line.
for record in mini_eval_data:
    if record["task"] != "code":
        continue
    print(record["task_id"])

sharded-HumanEval/105
sharded-HumanEval/109
sharded-HumanEval/111
sharded-HumanEval/113
sharded-HumanEval/114
sharded-HumanEval/118
sharded-HumanEval/128
sharded-HumanEval/138
sharded-HumanEval/139
sharded-HumanEval/150
sharded-HumanEval/153
sharded-HumanEval/159
sharded-HumanEval/17
sharded-HumanEval/26
sharded-HumanEval/39
sharded-HumanEval/43
sharded-HumanEval/5
sharded-HumanEval/59
sharded-HumanEval/62
sharded-HumanEval/7
sharded-HumanEval/76
sharded-HumanEval/86
sharded-HumanEval/98
sharded-livecodebench/2728


In [26]:
def add_full_spec_qa(data: list, humaneval_split=None) -> None:
    """Attach a fully specified question and a reference answer to each record.

    Every item in ``data`` is modified in place and gains two keys,
    ``full_spec_q`` and ``ground_truth_a``, filled according to its ``task``:

    * ``math``: ``question`` / ``answer``
    * ``code``: ``prompt`` (or ``question_content`` when there is no prompt) /
      the HumanEval ``canonical_solution`` for ``sharded-HumanEval/{number}``
      ids, otherwise ``None``
    * ``actions``: content of the first message of the first conversation in
      ``fully_specified_question`` / ``reference_answer``
    * ``database``: ``fully_specified_question`` / ``reference_sql``
    * anything else: both set to ``None`` and a "skipping" line is printed

    Args:
        data: List of task record dicts; mutated in place.
        humaneval_split: Optional sequence of HumanEval rows indexable by
            problem number, each with a ``canonical_solution`` field. When
            omitted, the notebook-level ``he_dataset["test"]`` is used.

    Returns:
        None.
    """
    for item in data:
        task = item["task"]
        if task == "math":
            item["full_spec_q"] = item["question"]
            item["ground_truth_a"] = item["answer"]
        elif task == "code":
            # Records with a "prompt" field use it as the question. The
            # livecodebench records have no "prompt" key and store the problem
            # statement under "question_content", so fall back to that instead
            # of silently recording None.
            full_spec_q = item.get("prompt")
            if full_spec_q is None:
                full_spec_q = item.get("question_content")
            item["full_spec_q"] = full_spec_q
            # task id: sharded-HumanEval/{number}
            # or sharded-livecodebench/{number}
            # if it's humaneval we can get the ground truth from the dataset
            if item["task_id"].startswith("sharded-HumanEval/"):
                number = int(item["task_id"].split("/")[1])
                # Resolved lazily so the global is only needed for HumanEval items.
                # NOTE(review): assumes row index == HumanEval problem number
                # (holds for row 105 in this notebook) — confirm for the others.
                rows = he_dataset["test"] if humaneval_split is None else humaneval_split
                item["ground_truth_a"] = rows[number]["canonical_solution"]
            else:
                print(
                    f"livecodebench task id: {item['task_id']}, no ground truth available"
                )
                item["ground_truth_a"] = None
        elif task == "actions":
            item["full_spec_q"] = item["fully_specified_question"][0][0]["content"]
            item["ground_truth_a"] = item["reference_answer"]
        elif task == "database":
            item["full_spec_q"] = item["fully_specified_question"]
            item["ground_truth_a"] = item["reference_sql"]
        else:
            print(f"skipping {item['task_id']}")
            item["full_spec_q"] = None
            item["ground_truth_a"] = None

In [27]:
# Annotate the mini eval records in place (adds full_spec_q / ground_truth_a to each item).
add_full_spec_qa(mini_eval_data)

livecodebench task id: sharded-livecodebench/2728, no ground truth available
skipping sharded-totto_7401674541652223105
skipping sharded-totto_-8257732455267918872
skipping sharded-totto_-1368446073372924912
skipping sharded-totto_1805808831193641472
skipping sharded-totto_2466054609666099920
skipping sharded-totto_8581319284013960198
skipping sharded-totto_-4157060416941050999
skipping sharded-totto_5379601793177459923
skipping sharded-totto_1798717168393840878
skipping sharded-totto_6848965724662627070
skipping sharded-totto_-4987169707523815662
skipping sharded-totto_-8347284934406871193
skipping sharded-totto_-1554888779901746112
skipping sharded-totto_-7720423331010486388
skipping sharded-totto_-7477353734244246187
skipping sharded-totto_-1200443185142546807
skipping sharded-totto_-7587628134860225807
skipping sharded-totto_-1437560876204824116
skipping sharded-totto_7799429181150617627
skipping sharded-totto_-3487325775237851124
skipping sharded-totto_-6960102638412075927
skippin

In [28]:
# Rewrite the updated data to the file it was loaded from. Serialize first:
# opening with "w" truncates the file immediately, so a serialization error
# after that point would destroy the notebook's own input.
mini_eval_json = json.dumps(mini_eval_data, indent=2)
with open(LIC_MINI_EVAL_PATH, "w", encoding="utf-8") as f:
    f.write(mini_eval_json)