## Searching for Prompts Asking for Unit Tests!

In [1]:
import pyarrow.parquet as pq
import pyarrow as pa

# Load the source Parquet file into an in-memory Arrow table.
DATASET_PATH = "train-00000-of-00001-d9b93805488c263e.parquet"
code_table = pq.read_table(DATASET_PATH)

In [2]:
# Number of rows in the loaded table.
code_table.num_rows

121959

In [3]:
# Column names, in schema order.
code_table.schema.names

['instruction', 'input', 'output', 'prompt']

In [12]:
# Peek at one fully rendered prompt: the 'prompt' column (column index 3), row 1.
code_table.column('prompt')[1]

<pyarrow.StringScalar: 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDevelop a function that will add two strings\n\n### Input:\nstr1 = "Hello "\nstr2 = "world"\n\n### Response:\ndef add_strings(str1, str2):\n    """This function takes two strings and returns the sum of them."""\n    return str1 + str2\n\nstr1 = "Hello "\nstr2 = "world"\nsum_of_strings = add_strings(str1, str2)'>

In [13]:
import re
import pyarrow as pa
import pyarrow.parquet as pq

# Regex for the phrase "unit test" as whole words; compiled once, case-insensitive.
pattern = r"\bunit test\b"
unit_test_regex = re.compile(pattern, re.IGNORECASE)

# One boolean per row of `code_table`: True when the instruction mentions "unit test".
# Null instructions (None after conversion) are treated as non-matching instead of
# making the regex search raise a TypeError.
instruction_mask = [
    text is not None and unit_test_regex.search(text) is not None
    for text in code_table.column('instruction').to_pylist()
]

# Named `match_count` rather than `sum` so the builtin `sum` stays usable in later cells.
match_count = instruction_mask.count(True)
print("Number of examples with 'unit test':", match_count)

# Select all matching rows in a single pass. Unlike pa.concat_tables on an empty
# list, this also works when nothing matches (empty table, same schema).
unit_test_examples = code_table.filter(pa.array(instruction_mask, type=pa.bool_()))
print("Number of rows in unit_test_examples:", len(unit_test_examples))

# Save the matching examples to a Parquet file
pq.write_table(unit_test_examples, 'unit_test_examples.parquet')
print("Examples saved to unit_test_examples.parquet")

Number of examples with 'unit test': 22
Number of rows in unit_test_examples: 22
Examples saved to unit_test_examples.parquet


In [14]:
# Read the saved matches back from disk.
# NOTE(review): `unit_test_samples` is not read again in the visible cells; the
# inspections that follow use the in-memory `unit_test_examples` instead. Confirm
# which of the two was meant to be inspected.
unit_test_samples = pq.read_table("unit_test_examples.parquet")

In [17]:
# Look at one matched instruction: the 'instruction' column (column index 0), row 2.
unit_test_examples.column('instruction')[2]

<pyarrow.StringScalar: 'Create a unit test to check if two numbers are equal to each other.'>

In [19]:
# Show the column names/types of the filtered table along with its schema-level metadata.
unit_test_examples.schema

instruction: string
input: string
output: string
prompt: string
-- schema metadata --
huggingface: '{"info": {"features": {"instruction": {"dtype": "string", "' + 165

In [20]:
import pyarrow.parquet as pq
import json

# Input Parquet file (written by an earlier cell) and the JSON file to create from it.
parquet_file = 'unit_test_examples.parquet'
json_file = 'unit_test_examples.json'

# Read the table and convert it to a dict mapping column name -> list of values.
table = pq.read_table(parquet_file)
data = table.to_pydict()

# Pivot the column-oriented dict into one record (dict) per row, with the four
# fields always in the same order.
field_names = ("instruction", "input", "output", "prompt")
reformatted_data = [
    dict(zip(field_names, row_values))
    for row_values in zip(*(data[name] for name in field_names))
]

# Write the records out as JSON indented by four spaces.
with open(json_file, 'w') as f:
    json.dump(reformatted_data, f, indent=4)

print(f"JSON file saved as {json_file}")


JSON file saved as unit_test_examples.json


## Searching for Outputs with Unit Tests!

In [21]:
import re
import pyarrow as pa
import pyarrow.parquet as pq

# Regex for the keyword "assert" as a whole word. The search is case-insensitive,
# so "Assert" / "ASSERT" in an output also count as matches.
pattern = r"\bassert\b"
assert_regex = re.compile(pattern, re.IGNORECASE)

# One boolean per row of `code_table`: True when the output text contains "assert".
# Null outputs (None after conversion) are treated as non-matching instead of
# making the regex search raise a TypeError.
output_mask = [
    text is not None and assert_regex.search(text) is not None
    for text in code_table.column('output').to_pylist()
]

# Named `match_count` rather than `sum` so the builtin `sum` stays usable in later cells.
match_count = output_mask.count(True)
print("Number of examples with 'assert':", match_count)

# Select all matching rows in a single pass. Unlike pa.concat_tables on an empty
# list, this also works when nothing matches (empty table, same schema).
unit_test_assert_examples = code_table.filter(pa.array(output_mask, type=pa.bool_()))
print("Number of rows in unit_test_assert_examples:", len(unit_test_assert_examples))

# Save the matching examples to a Parquet file
pq.write_table(unit_test_assert_examples, 'unit_test_assert_examples.parquet')
print("Examples saved to unit_test_assert_examples.parquet")

Number of examples with 'assert': 155
Number of rows in unit_test_assert_examples: 155
Examples saved to unit_test_assert_examples.parquet


In [22]:
import pyarrow.parquet as pq
import json

# Input Parquet file (written by an earlier cell) and the JSON file to create from it.
parquet_file = 'unit_test_assert_examples.parquet'
json_file = 'unit_test_assert_examples.json'

# Read the table and convert it to a dict mapping column name -> list of values.
table = pq.read_table(parquet_file)
data = table.to_pydict()

# Pivot the column-oriented dict into one record (dict) per row, with the four
# fields always in the same order.
field_names = ("instruction", "input", "output", "prompt")
reformatted_data = [
    dict(zip(field_names, row_values))
    for row_values in zip(*(data[name] for name in field_names))
]

# Write the records out as JSON indented by four spaces.
with open(json_file, 'w') as f:
    json.dump(reformatted_data, f, indent=4)

print(f"JSON file saved as {json_file}")

JSON file saved as unit_test_assert_examples.json
