In [73]:
import csv
import ast
import json

In [93]:
def validate_gpt_json(data):
    """Check that a parsed GPT response follows the expected tutoring schema.

    Args:
        data: The parsed GPT response. Expected to be a dict; any other type
            is rejected.

    Returns:
        bool: True only when ``data`` is a dict in which
            * "Evaluation of Student Response" is a 1-character string,
            * "Action Based on Evaluation" is a string of at most 2 characters,
            * "Subproblem State" is a 1-character string,
            * "Subproblem" and "Tutorbot" are strings (any length).
        False in every other case, including missing keys.
    """
    # A parsed literal can just as well be a list, tuple, number or None, none
    # of which has .get(); reject them here instead of raising AttributeError.
    if not isinstance(data, dict):
        return False

    evaluation_of_student_response = data.get("Evaluation of Student Response")
    action_based_on_evaluation = data.get("Action Based on Evaluation")
    subproblem_state = data.get("Subproblem State")
    subproblem = data.get("Subproblem")
    tutorbot = data.get("Tutorbot")

    # Missing keys come back as None from .get() and fail the isinstance checks.
    return (
        isinstance(evaluation_of_student_response, str)
        and len(evaluation_of_student_response) == 1
        and isinstance(action_based_on_evaluation, str)
        and len(action_based_on_evaluation) <= 2
        and isinstance(subproblem_state, str)
        and len(subproblem_state) == 1
        and isinstance(subproblem, str)
        and isinstance(tutorbot, str)
    )

def validate_spock_json(data):
    """Check that a parsed tutorbot response has all expected fields.

    Args:
        data: The parsed tutorbot response. Expected to be a dict; any other
            type is rejected.

    Returns:
        bool: False when ``data`` is not a dict, when any of the five expected
        keys is missing, or when all five values are the empty string.
        True otherwise. A response in which only some values are empty is
        accepted.
    """
    required_keys = (
        "Action Based on Evaluation",
        "Evaluation of Student Response",
        "Subproblem State",
        "Subproblem",
        "Tutorbot",
    )

    # Explicit type check instead of a bare ``except:`` so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(data, dict):
        return False

    if any(key not in data for key in required_keys):
        return False

    # Only a response whose fields are ALL empty is treated as invalid.
    if all(data[key] == "" for key in required_keys):
        return False

    return True
    
def format_dpo_dataset(response_file):
    """Build a DPO preference dataset from a CSV of tutorbot and GPT responses.

    The CSV is read with a header row followed by data rows whose first three
    columns are ``prompt``, ``response`` (the tutorbot output) and ``gpt``.
    The last two are parsed with ``ast.literal_eval``.

    A row is kept, with the GPT response as "chosen" and the tutorbot response
    as "rejected", when the GPT response parses and passes
    ``validate_gpt_json`` and the tutorbot response either
      * cannot be parsed (the raw string is stored as "rejected"),
      * fails ``validate_spock_json``, or
      * differs from GPT on "Evaluation of Student Response",
        "Action Based on Evaluation" or "Subproblem State" (strings are
        compared after stripping surrounding whitespace).
    Rows with an unusable GPT response, and rows that agree on all three
    fields, are dropped. Rows with fewer than three columns (e.g. blank
    lines) are ignored and not counted.

    Prints the total and retained row counts.

    Args:
        response_file: Path to the responses CSV.

    Returns:
        dict: Parallel lists of strings under "prompt", "chosen", "rejected".
    """
    # Exceptions ast.literal_eval is documented to raise on malformed input.
    literal_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)
    compared_fields = (
        "Evaluation of Student Response",
        "Action Based on Evaluation",
        "Subproblem State",
    )

    dpo_dataset = {
        "prompt": [],
        "chosen": [],
        "rejected": [],
    }

    retained = 0
    total = 0

    def add_pair(prompt, chosen, rejected):
        """Append one preference pair to the dataset."""
        dpo_dataset["prompt"].append(prompt)
        dpo_dataset["chosen"].append(chosen)
        dpo_dataset["rejected"].append(rejected)

    def normalized(value):
        """Strip strings; pass other values through unchanged.

        GPT fields are validated as strings, so a non-string tutorbot value
        can never compare equal to them and counts as a disagreement instead
        of crashing on ``.strip()``.
        """
        return value.strip() if isinstance(value, str) else value

    # newline='' is what the csv module requires so that newlines embedded in
    # quoted fields are handled by the reader itself.
    with open(response_file, 'r', newline='') as file:
        reader = csv.reader(file)  # ['prompt', 'response', 'gpt']
        next(reader, None)  # skip the header; tolerate a completely empty file

        for row in reader:
            if len(row) < 3:
                # Blank or truncated line: not a data point.
                continue

            total += 1
            prompt, tutorbot_raw, gpt_raw = row[0], row[1], row[2]

            # Choose good gpt responses; one malformed row must not abort the run.
            try:
                gpt = ast.literal_eval(gpt_raw)
            except literal_errors:
                continue
            if not isinstance(gpt, dict) or not validate_gpt_json(gpt):
                continue

            # Corrupted tutorbot responses are kept as rejected examples (raw text).
            try:
                tutorbot = ast.literal_eval(tutorbot_raw)
            except literal_errors:
                add_pair(prompt, str(gpt), tutorbot_raw)
                retained += 1
                continue

            # Structurally invalid tutorbot responses are kept as rejected examples too.
            if not validate_spock_json(tutorbot):
                add_pair(prompt, str(gpt), str(tutorbot))
                retained += 1
                continue

            # Well-formed tutorbot response: keep it only if it disagrees with
            # GPT on at least one of the compared fields.
            disagrees = any(
                normalized(gpt[field]) != normalized(tutorbot[field])
                for field in compared_fields
            )
            if disagrees:
                add_pair(prompt, str(gpt), str(tutorbot))
                retained += 1

    # Avoid ZeroDivisionError when the file has no data rows.
    retained_pct = retained / total * 100 if total else 0.0
    print("The total number of data points:", total)
    print("The number of retained data points:", retained, f"({retained_pct:.2f}%)")

    return dpo_dataset

def store(data, file_path):
    """Write ``data`` to ``file_path`` as JSON, replacing any existing file.

    Args:
        data: A JSON-serializable object.
        file_path: Destination path.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized. The
            destination file is left untouched in that case.
        OSError: If the destination cannot be opened or written.
    """
    # Serialize before opening: open(..., 'w') truncates immediately, so a
    # serialization failure would otherwise wipe a previously stored dataset
    # and leave partial JSON behind.
    serialized = json.dumps(data)
    with open(file_path, 'w') as file:
        file.write(serialized)


######################################
# Change these paths
# response_file: input CSV read by format_dpo_dataset
#                (header row, then prompt / response / gpt columns).
# data_file:     output JSON file written by store.
response_file = "/data/kn22/spock_bio/spock_bio_Mistral-7B-Instruct-v0.2_r1/final_checkpoint-dpo/responses.csv"
data_file = "/data/kn22/dpo_data/bio/dpo_train/Mistral-7B-Instruct-v0.2_r1_dpo_bio_uniform_batch_123_filtered_dpo-both.json"
######################################

# Build the prompt / chosen / rejected lists from the responses CSV; this call
# also prints the total and retained row counts.
dpo_dataset = format_dpo_dataset(response_file)

# Persist the dataset as JSON at data_file.
store(dpo_dataset, data_file)

The total number of data points: 4921
The number of retained data points: 3462 (70.35%)


In [55]:
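# NOTE: Commented-out variant of the filtering cell above, kept for reference.
# It does not check the "Subproblem" field, and it skips rows whose tutorbot
# response fails to parse or validate instead of keeping them as rejected examples.
# def validate_gpt_json(data):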
#     try: 
#         evaluation_of_student_response = data.get("Evaluation of Student Response")
#         action_based_on_evaluation = data.get("Action Based on Evaluation")
#         subproblem_state = data.get("Subproblem State")
#         tutorbot = data.get("Tutorbot")
        
#         if (
#             isinstance(evaluation_of_student_response, str) and len(evaluation_of_student_response) == 1 and 
#             isinstance(action_based_on_evaluation, str) and len(action_based_on_evaluation) <= 2 and 
#             isinstance(subproblem_state, str) and len(subproblem_state) == 1 and 
#             isinstance(tutorbot, str)
#         ):
#             return True
#         else:
#             return False
#     except json.JSONDecodeError:
#         return False

# def validate_spock_json(data):
#     try:
#         if "Action Based on Evaluation" not in data or \
#             "Evaluation of Student Response" not in data or \
#             "Subproblem State" not in data or \
#             "Tutorbot" not in data:
#             return False

#         if data["Action Based on Evaluation"] == "" and \
#             data["Evaluation of Student Response"] == "" and \
#             data["Subproblem State"] == "" and \
#             data["Tutorbot"] == "":
#             return False
            
#         return True
#     except:
#         return False
    
# def format_dpo_dataset(response_file):
#     dpo_dataset = {
#         "prompt":[],
#         "chosen":[],
#         "rejected":[]
#     }

#     retained = 0
#     total = 0
    
#     with open(response_file, 'r') as file:
#         reader = csv.reader(file) # ['prompt', 'response', 'gpt']
#         next(reader)

#         for row in reader:
#             total += 1
            
#             prompt = row[0]
#             tutorbot = row[1]
#             gpt = row[2]
            
#             # Skip corrupted tutorbot responses
#             try:
#                 tutorbot = ast.literal_eval(tutorbot)
#             except:
#                 continue
                
#             if validate_spock_json(tutorbot) == False:
#                 continue

#             # Choose good gpt responses
#             gpt = ast.literal_eval(gpt)
#             if validate_gpt_json(gpt) == False:
#                 continue

#             # Add the data point
#             gpt_eval = gpt["Evaluation of Student Response"].strip()
#             tutorbot_eval = tutorbot["Evaluation of Student Response"].strip()

#             if gpt_eval != tutorbot_eval:
#                 dpo_dataset["prompt"].append(prompt)
#                 dpo_dataset["chosen"].append(str(gpt))
#                 dpo_dataset["rejected"].append(str(tutorbot))
#                 retained += 1
#                 continue
            
#             gpt_action = gpt["Action Based on Evaluation"].strip()
#             tutorbot_action = tutorbot["Action Based on Evaluation"].strip()

#             if gpt_action != tutorbot_action:
#                 dpo_dataset["prompt"].append(prompt)
#                 dpo_dataset["chosen"].append(str(gpt))
#                 dpo_dataset["rejected"].append(str(tutorbot))
#                 retained += 1
#                 continue

#             gpt_sub_state = gpt["Subproblem State"].strip()
#             tutorbot_sub_state = tutorbot["Subproblem State"].strip()

#             if gpt_sub_state != tutorbot_sub_state:
#                 dpo_dataset["prompt"].append(prompt)
#                 dpo_dataset["chosen"].append(str(gpt))
#                 dpo_dataset["rejected"].append(str(tutorbot))
#                 retained += 1
#                 continue
                
#     print("The total number of data points:", total)
#     print("The number of retained data points:", retained, f"({retained / total * 100:.2f}%)")
    
#     return dpo_dataset

# def store(data, file_path):
#     with open(file_path, 'w') as file:
#         json.dump(data, file)


# ######################################
# # Change these paths
# response_file = "/data/kn22/spock_bio/spock_bio_mistral-instruct_r3/final_checkpoint-dpo/responses.csv"
# data_file = "/data/kn22/dpo_data/bio/dpo_train/mistral-instruct_dpo_bio_uniform_batch_123_filtered_dpo.json"
# ######################################

# dpo_dataset = format_dpo_dataset(response_file)

# store(dpo_dataset, data_file)

The total number of data points: 4921
The number of retained data points: 1379 (28.02%)


In [83]:
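# NOTE: Commented-out variant of the filtering cell, kept for reference.
# Here validate_spock_json accepts everything, and every row with a valid GPT
# response is kept, with the raw (unparsed) tutorbot string as the rejected example.
# def validate_gpt_json(data):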
#     try: 
#         evaluation_of_student_response = data.get("Evaluation of Student Response")
#         action_based_on_evaluation = data.get("Action Based on Evaluation")
#         subproblem_state = data.get("Subproblem State")
#         subproblem = data.get("Subproblem")
#         tutorbot = data.get("Tutorbot")
        
#         if (
#             isinstance(evaluation_of_student_response, str) and len(evaluation_of_student_response) == 1 and 
#             isinstance(action_based_on_evaluation, str) and len(action_based_on_evaluation) <= 2 and 
#             isinstance(subproblem_state, str) and len(subproblem_state) == 1 and 
#             isinstance(subproblem, str) and 
#             isinstance(tutorbot, str)
#         ):
#             return True
#         else:
#             return False
#     except json.JSONDecodeError:
#         return False

# def validate_spock_json(data):
#     return True
    
# def format_dpo_dataset(response_file):
#     dpo_dataset = {
#         "prompt":[],
#         "chosen":[],
#         "rejected":[]
#     }

#     retained = 0
#     total = 0
    
#     with open(response_file, 'r') as file:
#         reader = csv.reader(file) # ['prompt', 'response', 'gpt']
#         next(reader)

#         for row in reader:
#             total += 1
            
#             prompt = row[0]
#             tutorbot = row[1]
#             gpt = row[2]
  
#             gpt = ast.literal_eval(gpt)
#             if validate_gpt_json(gpt) == False:
#                 continue

#             gpt = str(gpt)

#             retained += 1
#             dpo_dataset["prompt"].append(prompt)
#             dpo_dataset["chosen"].append(gpt)
#             dpo_dataset["rejected"].append(tutorbot)
                
#     print("The total number of data points:", total)
#     print("The number of retained data points:", retained, f"({retained / total * 100:.2f}%)")
    
#     return dpo_dataset

# def store(data, file_path):
#     with open(file_path, 'w') as file:
#         json.dump(data, file)


# ######################################
# # Change these paths
# response_file = "/data/kn22/spock_bio/spock_bio_vicuna-7b-v1.5_r1/final_checkpoint-dpo/responses.csv"
# data_file = "/data/kn22/dpo_data/bio/dpo_train/vicuna-7b-v1.5_r1_dpo_bio_uniform_batch_123_filtered_dpo-all.json"
# ######################################

# dpo_dataset = format_dpo_dataset(response_file)

# store(dpo_dataset, data_file)

The total number of data points: 4921
The number of retained data points: 4921 (100.00%)


In [84]:
# Spot-check the last "rejected" (tutorbot) entry of the dataset.
dpo_dataset["rejected"][-1]

"{'Evaluation of Student Response': 'b', 'Action Based on Evaluation': '3', 'Subproblem State': 'z', 'Subproblem': '', 'Tutorbot': 'Great job! Your answer covers all the necessary points regarding the sequence of light traveling through the eye, the roles of each component, and the changes that occur along the way. Keep up the good work!'}"

In [88]:
# Spot-check the last "chosen" (GPT) entry of the dataset.
dpo_dataset["chosen"][-1]

'{\'Evaluation of Student Response\': \'b\', \'Action Based on Evaluation\': \'3\', \'Subproblem State\': \'z\', \'Subproblem\': \'3. Changes that occur to light as it goes through each stage.\', \'Tutorbot\': "Great job! You\'ve accurately described the changes that occur to light as it goes through each stage of the eye. You\'ve completed all the subproblems, and the main problem is now finished."}'