In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer


In [2]:
# Relative location of the preprocessed metadata CSV
file_path = "../results/preprocessed_All_code_metadata.csv"

# Load the whole file into a DataFrame
df_preproccess = pd.read_csv(filepath_or_buffer=file_path)

# Sanity check: heading line followed by the first five rows
print("Dataset Sample:", df_preproccess.head(), sep="\n")


Dataset Sample:
  coding_problem_id                                           question  \
0   source_code_000  Write a program to find the largest element in...   
1   source_code_000  Write a program to find the largest element in...   
2   source_code_000  Write a program to find the largest element in...   
3   source_code_000  Write a program to find the largest element in...   
4   source_code_000  Write a program to find the largest element in...   

                             example programming_language  response_time  \
0  Input: [1, 4, 2, 9, 5]\nOutput: 9                 Java            405   
1  Input: [1, 4, 2, 9, 5]\nOutput: 9                 Java            405   
2  Input: [1, 4, 2, 9, 5]\nOutput: 9                 Java            405   
3  Input: [1, 4, 2, 9, 5]\nOutput: 9                 Java            405   
4  Input: [1, 4, 2, 9, 5]\nOutput: 9                 Java            405   

  llm_answer_id                                   candidate_answer  \
0  3.5-turbo

In [3]:
# Create the Keras tokenizer with an explicit out-of-vocabulary (OOV) token
tokenizer = Tokenizer(oov_token="<OOV>")

# Build one document per row: the candidate answer and the AI-generated
# answer joined by a single space, with missing values treated as ''
candidate_text = df_preproccess['candidate_answer'].fillna('')
ai_generated_text = df_preproccess['ai_generated_answer'].fillna('')
all_text = candidate_text + ' ' + ai_generated_text

# Learn the vocabulary from the combined documents
tokenizer.fit_on_texts(all_text)

# Report how many entries ended up in the word index
print(f"Vocabulary size: {len(tokenizer.word_index)}")

Vocabulary size: 3053


In [4]:
# Build one feature record per row of df_preproccess: the token-id sequences
# of the candidate and AI-generated answers, their lengths, and the number of
# distinct token ids the two answers have in common.

# Missing answers are replaced with '' exactly as was done when fitting the
# tokenizer; passing a NaN (float) to texts_to_sequences would raise instead
# of producing an empty sequence.
candidate_texts = df_preproccess['candidate_answer'].fillna('').tolist()
ai_texts = df_preproccess['ai_generated_answer'].fillna('').tolist()

# Tokenize each column in a single batched call rather than once per row
candidate_seqs = tokenizer.texts_to_sequences(candidate_texts)
ai_seqs = tokenizer.texts_to_sequences(ai_texts)

# List to store tokenized rows
tokenized_rows = []

for problem_id, answer_id, candidate_seq, ai_seq in zip(
    df_preproccess['coding_problem_id'],
    df_preproccess['llm_answer_id'],
    candidate_seqs,
    ai_seqs,
):
    # Overlap is counted on distinct token ids, not on occurrences
    shared_token_count = len(set(candidate_seq).intersection(ai_seq))

    # Append tokenized features for this row
    tokenized_rows.append({
        "coding_problem_id": problem_id,
        "llm_answer_id": answer_id,
        "candidate_word_count": len(candidate_seq),
        "ai_word_count": len(ai_seq),
        "shared_tokens": shared_token_count,
        "candidate_seq": candidate_seq,
        "ai_generated_seq": ai_seq
    })

In [5]:
# Turn the list of per-row feature dicts into a DataFrame (one column per key)
df_tokenized = pd.DataFrame(tokenized_rows)

# Print a heading followed by the first five rows as a quick check
print("Tokenized DataFrame Sample:", df_tokenized.head(), sep="\n")

Tokenized DataFrame Sample:
  coding_problem_id llm_answer_id  candidate_word_count  ai_word_count  \
0   source_code_000  3.5-turbo_00                    25             49   
1   source_code_000  3.5-turbo_01                    25             49   
2   source_code_000    4-turbo_00                    25            112   
3   source_code_000    4-turbo_01                    25             92   
4   source_code_000          4_00                    25             68   

   shared_tokens                                      candidate_seq  \
0              6  [759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...   
1              5  [759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...   
2              9  [759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...   
3              8  [759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...   
4              6  [759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...   

                                    ai_generated_seq  
0  [79, 41, 2594, 79, 125, 113, 83, 15, 269, 

In [None]:
# Destination for the tokenized feature table
output_csv = "../results/tokenizer_features.csv"

# Write the DataFrame as CSV, leaving out the row index column
df_tokenized.to_csv(path_or_buf=output_csv, index=False)

# Confirm where the file was written
print(f"Feature matrix with TensorFlow tokenization saved to {output_csv}")

Feature matrix with TensorFlow tokenization saved to ../results/tokenizers_features.csv


In [7]:
# Show the tokenized feature table as the cell's output (a bare last
# expression is rendered by the notebook; long frames are truncated).
df_tokenized

Unnamed: 0,coding_problem_id,llm_answer_id,candidate_word_count,ai_word_count,shared_tokens,candidate_seq,ai_generated_seq
0,source_code_000,3.5-turbo_00,25,49,6,"[759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...","[79, 41, 2594, 79, 125, 113, 83, 15, 269, 19, ..."
1,source_code_000,3.5-turbo_01,25,49,5,"[759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...","[79, 41, 83, 79, 125, 113, 83, 15, 269, 19, 84..."
2,source_code_000,4-turbo_00,25,112,9,"[759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...","[79, 41, 83, 79, 125, 113, 83, 15, 269, 19, 31..."
3,source_code_000,4-turbo_01,25,92,8,"[759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...","[79, 41, 1242, 79, 125, 113, 83, 15, 269, 19, ..."
4,source_code_000,4_00,25,68,6,"[759, 1241, 31, 1192, 19, 6, 31, 500, 4, 5, 17...","[79, 41, 83, 79, 125, 113, 83, 15, 269, 358, 2..."
...,...,...,...,...,...,...,...
373,source_code_062,3.5-turbo_01,131,109,31,"[105, 11, 1760, 106, 11, 1761, 104, 131, 9, 26...","[260, 18, 2586, 17, 17, 1767, 17, 422, 17, 15,..."
374,source_code_062,4-turbo_00,131,103,27,"[105, 11, 1760, 106, 11, 1761, 104, 131, 9, 26...","[260, 18, 2015, 40, 1327, 2, 40, 1198, 247, 20..."
375,source_code_062,4-turbo_01,131,142,30,"[105, 11, 1760, 106, 11, 1761, 104, 131, 9, 26...","[260, 18, 13, 77, 2, 480, 1077, 1122, 591, 114..."
376,source_code_062,4_00,131,113,25,"[105, 11, 1760, 106, 11, 1761, 104, 131, 9, 26...","[18, 2592, 17, 258, 632, 20, 422, 15, 13, 825,..."
