### Reformatting All CSVs

In [14]:
import pandas as pd
import re

def reformat(file):
    # Step 1: Load the raw contents from the file
    with open(file, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Step 2: Split on the marker for a new code example
    # We'll use a regex lookahead to keep the split marker (//Source method) with each example
    examples = re.split(r'(?="\/\/Source method)', raw_text)

    # Step 3: Clean up examples
    cleaned_examples = [ex.strip().strip('"') for ex in examples if ex.strip()]

    # Step 4: Save to a DataFrame
    df = pd.DataFrame(cleaned_examples, columns=['code'])

    # Optional: save it back to a safe CSV format with full quoting
    df.to_csv(f'{file[:-13]}responses_reformatted.csv', index=False, quoting=1)  # quoting=1 means quote all

In [15]:
files = []

import os

for file in os.listdir("/media/mujtaba/DATA/nick/UnitTestExamples/data/results"):
    files.append('/' + file)

files = ["/media/mujtaba/DATA/nick/UnitTestExamples/data/results" + file + '/responses.csv' for file in files]
print(files)
for file in files:
    reformat(file)

['/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5-base__zero_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__zero_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/CodeLlama-7b-Instruct-hf__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/starcoder2-3b__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/CodeLlama-7b-Instruct-hf__zero_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/starcoder2-3b__zero_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5-base__few_shot_first_prompts/responses.csv']


### Checking Loading

In [23]:
df = pd.read_csv('/media/mujtaba/DATA/nick/UnitTestExamples/data/results/starcoder2-3b__zero_shot_first_prompts/responses_reformatted.csv', quoting=1)
df.iloc[0]

code    0
Name: 0, dtype: object

### Altering for CodeT5p

In [25]:
import pandas as pd
import re

def reformat_codet5p(file):
    # Load raw text from your file
    with open('raw_code_file.csv', 'r', encoding='utf-8') as f:
        raw_text = f.read()
    
    # Use regex to split between closing quote + newline + opening quote
    # This pattern: quote, newline(s), then quote
    examples = re.split(r'"\s*\n\s*"', raw_text)
    
    # Clean leading/trailing quotes and whitespace
    cleaned_examples = [ex.strip().strip('"') for ex in examples if ex.strip()]
    
    # Step 4: Save to a DataFrame
    df = pd.DataFrame(cleaned_examples, columns=['code'])

    # Optional: save it back to a safe CSV format with full quoting
    df.to_csv(f'{file[:-13]}responses_reformatted.csv', index=False, quoting=1)


In [26]:
files = []

import os

for file in os.listdir("/media/mujtaba/DATA/nick/UnitTestExamples/data/results"):
    if "codet5p" in file:
        files.append('/' + file)

files = ["/media/mujtaba/DATA/nick/UnitTestExamples/data/results" + file + '/responses.csv' for file in files]
print(files)
for file in files:
    reformat(file)

['/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__zero_shot_first_prompts/responses.csv']


In [None]:
df = pd.read_csv("/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/responses_reformatted.csv", quoting=1)

1