In [1]:
import os
import pandas as pd
from pathlib import Path
import json

In [2]:

# Root directory that holds one `source_code_XXX` sub-folder per coding problem.
# Kept as a plain string because later cells build paths with Path(base_folder).
base_folder = '../../Dataset/dataset-source-codes'

# Sanity check: list the top-level entries so that a wrong relative path is
# noticed before the extraction loop runs.
if not os.path.isdir(base_folder):
    # isdir() (rather than exists()) also guards against the path being a file,
    # which would make os.listdir() raise.
    print(f"The folder '{base_folder}' does not exist or is not a directory")
else:
    print(f"Content in base folder '{base_folder}': ")
    # os.listdir() returns entries in arbitrary order; sort for a stable listing.
    for item in sorted(os.listdir(base_folder)):
        item_path = Path(base_folder) / item
        if item_path.is_dir():
            print(f"Folder: {item}")
        else:
            print(f"File: {item}")

Content in base folder '../../Dataset/dataset-source-codes': 
Folder: source_code_000
Folder: source_code_001
Folder: source_code_002
Folder: source_code_003
Folder: source_code_004
Folder: source_code_005
Folder: source_code_006
Folder: source_code_007
Folder: source_code_008
Folder: source_code_009
Folder: source_code_010
Folder: source_code_011
Folder: source_code_012
Folder: source_code_013
Folder: source_code_014
Folder: source_code_015
Folder: source_code_016
Folder: source_code_017
Folder: source_code_018
Folder: source_code_019
Folder: source_code_020
Folder: source_code_021
Folder: source_code_022
Folder: source_code_023
Folder: source_code_024
Folder: source_code_025
Folder: source_code_026
Folder: source_code_027
Folder: source_code_028
Folder: source_code_029
Folder: source_code_030
Folder: source_code_031
Folder: source_code_032
Folder: source_code_033
Folder: source_code_034
Folder: source_code_035
Folder: source_code_036
Folder: source_code_037
Folder: source_code_038
Fo

In [3]:
# Accumulator for the extracted records: the extraction loop appends one dict
# per AI-generated answer, and the list is later turned into a DataFrame.
data_rows = list()

#### Candidate and AI-Generated Code Extraction

In [4]:
# Start from an empty list so that re-running this cell does not append
# duplicate rows to the ones collected by a previous run.
data_rows = []

for folder_id in range(63):  # Folders source_code_000 .. source_code_062
    source_code_id = f"source_code_{folder_id:03d}"
    folder_path = Path(base_folder) / source_code_id
    json_file = folder_path / f"{source_code_id}.json"

    # Skip folders that are missing or that have no metadata file.
    if not folder_path.exists() or not json_file.exists():
        print(f"Folder or metadata missing {source_code_id}")
        continue

    # Load metadata (explicit UTF-8 so the result does not depend on the
    # platform's default encoding).
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        question = metadata.get("question", "")
        example = metadata.get("examples", "")
        programming_language = metadata.get("programming_language", "unknown")
        response_time = metadata.get("response_time", None)
    except Exception as e:
        # Best effort: report the problem and move on to the next folder.
        print(f"Error reading metadata for {source_code_id}: {e}")
        continue

    # Locate the candidate file "<source_code_id>.<ext>".
    # The metadata file "<source_code_id>.json" matches the same glob pattern,
    # so it must be excluded explicitly; otherwise the JSON could be picked up
    # as the candidate answer. Sorting makes the choice deterministic because
    # glob() yields entries in arbitrary order.
    candidate_file = next(
        (
            path
            for path in sorted(folder_path.glob(f"{source_code_id}.*"))
            if path.name != json_file.name
        ),
        None,
    )
    if candidate_file is None:
        print(f"No candidate file found in the {source_code_id}")
        continue

    # Read candidate file
    with open(candidate_file, 'r', encoding='utf-8') as f:
        candidate_code = f.read()

    # Iterate through all AI-generated files in a stable (sorted) order.
    for ai_file in sorted(folder_path.glob(f"{source_code_id}_gpt-*.*")):
        with open(ai_file, 'r', encoding='utf-8') as f:
            ai_code = f.read()

        # Extract the AI model/answer id: the part of the file stem after
        # "_gpt-" (maxsplit=1 keeps any later "_gpt-" occurrence intact).
        ai_model = ai_file.stem.split("_gpt-", 1)[1]

        # One row per (coding problem, AI-generated answer) pair.
        data_rows.append({
            "coding_problem_id": source_code_id,  # Align with labeling file
            "question": question,
            "example": example,
            "programming_language": programming_language,
            "response_time": response_time,
            "llm_answer_id": ai_model,  # Align with labeling file
            "candidate_answer": candidate_code,
            "ai_generated_answer": ai_code
        })

In [5]:
# Build a DataFrame with one row per record collected in `data_rows`.
preprocessed_data = pd.DataFrame(data_rows)

# Save to CSV
output_csv = "../results/preprocessed_All_code_metadata.csv"
# DataFrame.to_csv() does not create missing directories, so make sure the
# target folder exists before writing.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
preprocessed_data.to_csv(output_csv, index=False)
print(f"Preprocessed data saved to {output_csv}")

Preprocessed data saved to ../results/preprocessed_All_code_metadata.csv


In [6]:
# Preview the first five rows to check that the columns were populated.
preprocessed_data.head(n=5)

Unnamed: 0,coding_problem_id,question,example,programming_language,response_time,llm_answer_id,candidate_answer,ai_generated_answer
0,source_code_000,Write a program to find the largest element in...,"Input: [1, 4, 2, 9, 5]\nOutput: 9",Java,405,3.5-turbo_00,fun findLargestElement(array: IntArray) : Int ...,public class LargestElementFinder {\n publi...
1,source_code_000,Write a program to find the largest element in...,"Input: [1, 4, 2, 9, 5]\nOutput: 9",Java,405,3.5-turbo_01,fun findLargestElement(array: IntArray) : Int ...,public class Main {\n public static void ma...
2,source_code_000,Write a program to find the largest element in...,"Input: [1, 4, 2, 9, 5]\nOutput: 9",Java,405,4-turbo_00,fun findLargestElement(array: IntArray) : Int ...,public class Main {\n public static void ma...
3,source_code_000,Write a program to find the largest element in...,"Input: [1, 4, 2, 9, 5]\nOutput: 9",Java,405,4-turbo_01,fun findLargestElement(array: IntArray) : Int ...,public class LargestElement {\n public stat...
4,source_code_000,Write a program to find the largest element in...,"Input: [1, 4, 2, 9, 5]\nOutput: 9",Java,405,4_00,fun findLargestElement(array: IntArray) : Int ...,public class Main {\n public static void main...
