### Reformatting All CSVs

In [1]:
import pandas as pd
import re

def reformat(file):
    # Step 1: Load the raw contents from the file
    with open(file, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Step 2: Split on the marker for a new code example
    # We'll use a regex lookahead to keep the split marker (//Source method) with each example
    examples = re.split(r'(?="\/\/Source method)', raw_text)

    # Step 3: Clean up examples
    cleaned_examples = [ex.strip().strip('"') for ex in examples if ex.strip()]

    # Step 4: Save to a DataFrame
    df = pd.DataFrame(cleaned_examples, columns=['code'])

    # Optional: save it back to a safe CSV format with full quoting
    df.to_csv(f'{file[:-13]}responses_reformatted.csv', index=False, quoting=1)  # quoting=1 means quote all

In [3]:
files = []

import os

for file in os.listdir("/media/mujtaba/DATA/nick/UnitTestExamples/data/results"):
    if "CodeLlama" in file or "starcoder" in file or "DeepSeek" in file:
        files.append('/' + file)

files = ["/media/mujtaba/DATA/nick/UnitTestExamples/data/results" + file + '/responses.csv' for file in files]
print(files)
for file in files:
    reformat(file)

['/media/mujtaba/DATA/nick/UnitTestExamples/data/results/DeepSeek-Coder-V2-Lite-Base__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/DeepSeek-Coder-V2-Lite-Base__zero_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/CodeLlama-7b-Instruct-hf__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/starcoder2-3b__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/CodeLlama-7b-Instruct-hf__zero_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/starcoder2-3b__zero_shot_first_prompts/responses.csv']


### Checking Loading

In [5]:
df = pd.read_csv('/media/mujtaba/DATA/nick/UnitTestExamples/data/results/DeepSeek-Coder-V2-Lite-Base__zero_shot_first_prompts/responses_reformatted.csv', quoting=1)
df.iloc[0]

code    0
Name: 0, dtype: object

### Altering for CodeT5p

In [25]:
import pandas as pd
import re

def reformat_codet5p(file):
    # Load raw text from your file
    with open('raw_code_file.csv', 'r', encoding='utf-8') as f:
        raw_text = f.read()
    
    # Use regex to split between closing quote + newline + opening quote
    # This pattern: quote, newline(s), then quote
    examples = re.split(r'"\s*\n\s*"', raw_text)
    
    # Clean leading/trailing quotes and whitespace
    cleaned_examples = [ex.strip().strip('"') for ex in examples if ex.strip()]
    
    # Step 4: Save to a DataFrame
    df = pd.DataFrame(cleaned_examples, columns=['code'])

    # Optional: save it back to a safe CSV format with full quoting
    df.to_csv(f'{file[:-13]}responses_reformatted.csv', index=False, quoting=1)


In [26]:
files = []

import os

for file in os.listdir("/media/mujtaba/DATA/nick/UnitTestExamples/data/results"):
    if "codet5p" in file:
        files.append('/' + file)

files = ["/media/mujtaba/DATA/nick/UnitTestExamples/data/results" + file + '/responses.csv' for file in files]
print(files)
for file in files:
    reformat(file)

['/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/responses.csv', '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__zero_shot_first_prompts/responses.csv']


In [10]:
df = pd.read_csv("/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/responses.csv", quoting=1)

In [17]:
df['0']

0      \n        Solution s = new Solution();\n      ...
1      \n        Solution s = new Solution();\n      ...
2      \n        Solution s = new Solution();\n      ...
3      \n        Solution s = new Solution();\n      ...
4      \n        Solution s = new Solution();\n      ...
                             ...                        
159    \n        Solution s = new Solution();\n      ...
160    \n        Solution s = new Solution();\n      ...
161    \n        Solution s = new Solution();\n      ...
162    \n        Solution s = new Solution();\n      ...
163    \n        Solution s = new Solution();\n      ...
Name: 0, Length: 164, dtype: object

In [20]:
import os
import pandas as pd

# === Define directories ===
input_dir = "/media/mujtaba/DATA/nick/UnitTestExamples/data/results"
output_dir = "/media/mujtaba/DATA/nick/UnitTestExamples/data/results"
file_names = ["codet5p-2b__few_shot_first_prompts/responses.csv", "codet5p-2b__zero_shot_first_prompts/responses.csv"]  # Define the CSV filenames manually

# === Function to trim when there are 2 extra closing brackets than opening ===
def trim_code(code: str) -> str:
    open_count = 0
    close_count = 0

    for i, char in enumerate(code):
        if char == '{':
            open_count += 1
        elif char == '}':
            close_count += 1

        if close_count - open_count == 2:
            return code[:i + 1].strip()

    return code.strip()

# === Ensure output directory exists ===
os.makedirs(output_dir, exist_ok=True)

# === Process each file ===
for file_name in file_names:
    input_path = os.path.join(input_dir, file_name)
    output_path = os.path.join(output_dir, file_name[:-4] + "_reformatted.csv")

    df = pd.read_csv(input_path, quoting=1)
    df["code"] = df["0"].astype(str).apply(trim_code)
    df[["code"]].to_csv(output_path, index=False)
    
    print(f"✅ Trimmed code written to '{output_path}'")


✅ Trimmed code written to '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__few_shot_first_prompts/responses_reformatted.csv'
✅ Trimmed code written to '/media/mujtaba/DATA/nick/UnitTestExamples/data/results/codet5p-2b__zero_shot_first_prompts/responses_reformatted.csv'
