# In [14]:
import json
import random
import csv
import io
import string

# Prompt templates keyed by a short variant name. Every template contains the
# "{x}" and "{y}" placeholders, which generate_csv_eval_jsonl fills in with the
# row and column counts via str.format. The key is also used to build the
# output file name in the loop at the bottom of this cell.
# NOTE(review): the triple-quoted templates keep their line breaks and the
# leading indentation of the continuation lines as part of the prompt text --
# confirm this whitespace is intended before reformatting these literals.
prompt_dict = {
    # Baseline: a single-sentence instruction.
    "generate": "You are a language model that only generates CSV strings with {x} rows and {y} columns with random text in every cell.",
    # Baseline plus an explicit "fill every cell" instruction.
    "generate_fill": '''You are a language model that only generates CSV strings with {x} rows and {y} columns with random text in every cell.
                        For every cell in the {x} rows and {y} columns of the CSV string, fill with random text.''',
    # As above, plus a description of the CSV layout (separators, row terminator, quoting).
    "generate_fill_define": '''You are a language model that only generates CSV strings with {x} rows and {y} columns with random text in every cell.
                        For every cell in the {x} rows and {y} columns of the CSV string, fill with random text.
                        Columns are separated by commas, rows are terminated by a new line, and each cell contains random text within quotation marks.'''
}

def generate_csv_eval_jsonl(num_prompts=50,
                            num_rows=15,
                            num_columns=15,
                            prompt_text="Generate a CSV with {x} rows and {y} columns.",
                            output_file="csv_eval.jsonl",
                            seed=None):
    """Write a JSONL evaluation set of random-CSV prompts and expected outputs.

    Each line of ``output_file`` is a JSON object with two keys:
    ``"prompt"`` (``prompt_text`` with ``{x}`` / ``{y}`` replaced by the row
    and column counts drawn for that example) and ``"expected_output"`` (a
    CSV string of that size, wrapped in triple backticks, whose cells are
    double-quoted runs of 5 to 15 random ASCII letters).

    Args:
        num_prompts: Number of examples (lines) to write.
        num_rows: Inclusive upper bound on the rows per CSV; the row count of
            each example is drawn uniformly from ``2..num_rows``.
        num_columns: Inclusive upper bound on the columns per CSV; drawn
            uniformly from ``2..num_columns``.
        prompt_text: ``str.format`` template using the ``{x}`` and ``{y}``
            fields.
        output_file: Path of the JSONL file to write; overwritten if present.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same arguments reproduce the same file. When ``None``
            (the default) the module-level ``random`` generator is used.

    Raises:
        ValueError: If ``num_rows`` or ``num_columns`` is smaller than 2.
    """
    # Validate before opening the file: otherwise a bad bound would only
    # surface inside randint, after output_file had already been truncated.
    if num_rows < 2 or num_columns < 2:
        raise ValueError(
            "num_rows and num_columns must both be at least 2, "
            f"got num_rows={num_rows!r}, num_columns={num_columns!r}"
        )

    # The random module exposes the same randint/choice API as a Random
    # instance, so either can serve as the source of randomness.
    rng = random if seed is None else random.Random(seed)

    def random_string(length):
        """Return ``length`` random ASCII letters."""
        return ''.join(rng.choice(string.ascii_letters) for _ in range(length))

    def generate_csv_string(rows, columns):
        """Return a ``rows`` x ``columns`` CSV wrapped in triple backticks."""
        lines = [
            ','.join(
                f'"{random_string(rng.randint(5, 15))}"' for _ in range(columns)
            )
            for _ in range(rows)
        ]
        return '```' + '\n'.join(lines) + '```'

    with open(output_file, 'w', encoding='utf-8') as f:
        for _ in range(num_prompts):
            rows = rng.randint(2, num_rows)
            columns = rng.randint(2, num_columns)

            record = {
                "prompt": prompt_text.format(x=rows, y=columns),
                "expected_output": generate_csv_string(rows, columns),
            }
            f.write(json.dumps(record) + '\n')


# Produce one 2500-example JSONL dataset per prompt template; each file is
# named after the template's key in prompt_dict.
for prompt_name in prompt_dict:
    prompt_text = prompt_dict[prompt_name]
    jsonl_filename = f"csv_{prompt_name}_eval.jsonl"
    generate_csv_eval_jsonl(
        num_prompts=2500,
        num_rows=15,
        num_columns=15,
        prompt_text=prompt_text,
        output_file=jsonl_filename,
    )

