In [None]:
# load data with HumT scores from prism, uf, and lmsys
import pandas as pd

ultrafeedback = pd.read_csv('humt_and_sociot_scores/uf_all_1103.csv')

lmsys = pd.read_csv("humt_and_sociot_scores/lmsys_ab_1103.csv")

prism = pd.read_csv("humt_and_sociot_scores/prism_all_1112.csv")

In [None]:
# train test split for prism: first drop duplicates, then split

import pandas as pd
from sklearn.model_selection import train_test_split


unique_conversation_ids = prism['conversation_id'].unique()

# Perform 90% train, 10% test split on conversation_id
train_ids, test_ids = train_test_split(unique_conversation_ids, test_size=0.1, random_state=42)

# Assign train/test labels
prism['train'] = prism['conversation_id'].isin(train_ids)

# Split into train and test sets
train_prism = prism[prism['train']]
test_prism = prism[~prism['train']]

# Deduplicate user_prompt only in test set
test_prism = test_prism.drop_duplicates(subset='user_prompt', keep='first')

# Drop the 'train' column before saving final train and test datasets
train_prism = train_prism.drop(columns=['train'])
test_prism = test_prism.drop(columns=['train'])

# Print sizes
print("Train size:", len(train_prism))
print("Test size:", len(test_prism))







Train size: 27028
Test size: 793


In [None]:
# train test split for ultrafeedback

# group by instructions
unique_instructions = ultrafeedback['instruction'].unique()

# Step 3: Perform 90% train, 10% test split on conversation_id
train_ids, test_ids = train_test_split(unique_instructions, test_size=0.1, random_state=42)

# Step 4: Assign train/test labels
ultrafeedback['train'] = ultrafeedback['instruction'].isin(train_ids)

# Step 5: Split into train and test sets
train_uf = ultrafeedback[ultrafeedback['train']]
test_uf = ultrafeedback[~ultrafeedback['train']]

# Step 6: Deduplicate user_prompt only in test set
test_uf = test_uf.drop_duplicates(subset='instruction', keep='first')

# Drop the 'train' column before saving final train and test datasets
train_uf = train_uf.drop(columns=['train'])
test_uf = test_uf.drop(columns=['train'])

# Print sizes
print("Train size:", len(train_uf))
print("Test size:", len(test_uf))



Train size: 35087
Test size: 1000


In [None]:
# filter unsafe prompts, then train test split for lmsys
# each row is a pair so we can just split the rows

import json
lmsys= lmsys.drop_duplicates(subset='prompt', keep='first')
lmsys

# Define a function to check if a response is unsafe
def is_unsafe(row):
    openai_str = str(row['openai_moderation'])  # Ensure it's a string
    toxic_str = str(row['toxic_chat_tag'])      # Ensure it's a string

    # Check if "flagged": true is in either column
    if "'flagged': True" in openai_str or "'flagged': True" in toxic_str:
        return True
    return False

# Apply the filter
df_filtered = lmsys[~lmsys.apply(is_unsafe, axis=1)]
print(len(df_filtered))
train_lmsys, test_lmsys = train_test_split(lmsys, test_size=0.1, random_state=42)

print("train size:", len(train_lmsys))
print("test size:", len(test_lmsys))






17373
train size: 15944
test size: 1772


In [None]:
# construct preference training set 

# make sure difference is > threshold and the one with lower score is actually more preferred

# prism
import itertools

def create_comparison_dataset(dataset, threshold):
    prefs = []
    for interaction_id, group in dataset.groupby('interaction_id'):
        # Get the indices of all items in the current group
        group_indices = group.index.tolist()
        
        # Generate all possible pairs within the group
        pairs = itertools.combinations(group_indices, 2)
        
        # Iterate over each pair
        for idx_a, idx_b in pairs:
            a = group.loc[idx_a]
            b = group.loc[idx_b]
        
        
            if (a['human_ai_pronouns_ratio_gpt2_model_response_300'] - b['human_ai_pronouns_ratio_gpt2_model_response_300'] > threshold) and (b['score'] > a['score']):
                # Append the preference where 'a' is chosen over 'b'
                # apply chat template to the prompt to ensure it is in the right format
                prefs.append({
                    'prompt': a['user_prompt'],
                    'chosen': b['model_response'],
                    'rejected': a['model_response'],
                    'source': 'prism'
                })

            elif (b['human_ai_pronouns_ratio_gpt2_model_response_300'] - a['human_ai_pronouns_ratio_gpt2_model_response_300'] > threshold) and (a['score'] > b['score']):
                # Append the preference where 'a' is chosen over 'b'
                # apply chat template to the prompt to ensure it is in the right format
                prefs.append({
                    'prompt': a['user_prompt'],
                    'chosen': a['model_response'],
                    'rejected': b['model_response'],
                    'source': 'prism'
                })



    return prefs
  


# uf


def create_comparison_dataset_uf(dataset, threshold):
    prefs = []
    # for the same interaction_id (same user_prompt), compare all the pairs of chosen and rejected
    # Group the data by 'interaction_id'
    for interaction_id, group in dataset.groupby('instruction'):
        # Get the indices of all items in the current group
        group_indices = group.index.tolist()
        
        # Generate all possible pairs within the group
        pairs = itertools.combinations(group_indices, 2)
        
        # Iterate over each pair
        for idx_a, idx_b in pairs:
            a = group.loc[idx_a]
            b = group.loc[idx_b]
            
            # Check the heuristic conditions for both directions
            if (a['human_ai_pronouns_ratio_gpt2_model_response_300'] - b['human_ai_pronouns_ratio_gpt2_model_response_300'] > threshold) and (b['overall_score'] > a['overall_score']):
                # Append the preference where 'a' is chosen over 'b'
                prefs.append({
                    'prompt': a['instruction'],
                    'chosen': b['model_response'],
                    'rejected': a['model_response'],
                    'source': 'uf'
                })
            elif (b['human_ai_pronouns_ratio_gpt2_model_response_300'] - a['human_ai_pronouns_ratio_gpt2_model_response_300'] > threshold) and (a['overall_score'] > b['overall_score']):
                # Append the preference where 'b' is chosen over 'a'
                prefs.append({
                    'prompt': a['instruction'],
                    'chosen': a['model_response'],
                    'rejected': b['model_response'],
                    'source': 'uf'
                })
    return prefs
  


# lmsys

import ast
def create_comparison_dataset_lmsys(dataset, threshold):
    prefs = []
#     row['conversation']
    # for the same interaction_id (same user_prompt), compare all the pairs of chosen and rejected
    # Group the data by 'interaction_id'
    for _, row in dataset.iterrows():
#         print(row['human_ai_pronouns_ratio_gpt2_model_a_response_300'])
#         print(row['human_ai_pronouns_ratio_gpt2_model_b_response_300'])
#         print(row['conversation_a'][0])
        if (row['human_ai_pronouns_ratio_gpt2_model_a_response_300'] - row['human_ai_pronouns_ratio_gpt2_model_b_response_300'] > threshold) and (row['winner'] == 'model_b'):
            prefs.append({
                    'prompt': row["prompt"], # need to make sure the conversation columns are lists, not strings!
                    'chosen': row['model_b_response'],
                    'rejected': row['model_a_response'],
                    'source': 'lmsys'
                })
        elif (row['human_ai_pronouns_ratio_gpt2_model_b_response_300'] - row['human_ai_pronouns_ratio_gpt2_model_a_response_300'] > threshold) and (row['winner'] == 'model_a'):
            prefs.append({
                    'prompt': row["prompt"],#[0]['content'],
                   'chosen': row['model_a_response'],
                    'rejected': row['model_b_response'],
                    'source': 'lmsys'
                })
    return prefs
  


In [1]:
thr = 0 # threshold size t
size = 500  #dataset size n
PREFERENCE_DATA = create_comparison_dataset(train_prism, thr)
PREFERENCE_DATA_UF = create_comparison_dataset_uf(train_uf, thr)
PREFERENCE_DATA_LMSYS = create_comparison_dataset_lmsys(train_lmsys, thr)
training_data = list(pd.Series(PREFERENCE_DATA + PREFERENCE_DATA_UF + PREFERENCE_DATA_LMSYS).sample(500))
len(training_data)
th = str(thr*100)
with open(f"training_{th}_{size}.json", "w") as json_file:
    json.dump(training_data, json_file)


In [61]:
# B_{DPO_R} - create dataset for baseline with regular preferences, not accounting for HuMT

import itertools
# prism
def create_comparison_dataset_r(dataset, threshold):
    prefs = []
    for interaction_id, group in dataset.groupby('interaction_id'):
        # Get the indices of all items in the current group
        group_indices = group.index.tolist()
        
        # Generate all possible pairs within the group
        pairs = itertools.combinations(group_indices, 2)
        
        # Iterate over each pair
        for idx_a, idx_b in pairs:
            a = group.loc[idx_a]
            b = group.loc[idx_b]
            if (b['score'] > a['score']):
                # Append the preference where 'a' is chosen over 'b'
                # apply chat template to the prompt to ensure it is in the right format
                prefs.append({
                    'prompt': a['user_prompt'],
                    'chosen': b['model_response'],
                    'rejected': a['model_response'],
                    'source': 'prism'
                })

            elif (a['score'] > b['score']):
                # Append the preference where 'a' is chosen over 'b'
                # apply chat template to the prompt to ensure it is in the right format
                prefs.append({
                    'prompt': a['user_prompt'],
                    'chosen': a['model_response'],
                    'rejected': b['model_response'],
                    'source': 'prism'
                })



    return prefs
  


# uf


def create_comparison_dataset_uf_r(dataset, threshold):
    prefs = []
    # for the same interaction_id (same user_prompt), compare all the pairs of chosen and rejected
    # Group the data by 'interaction_id'
    for interaction_id, group in dataset.groupby('instruction'):
        # Get the indices of all items in the current group
        group_indices = group.index.tolist()
        
        # Generate all possible pairs within the group
        pairs = itertools.combinations(group_indices, 2)
        
        # Iterate over each pair
        for idx_a, idx_b in pairs:
            a = group.loc[idx_a]
            b = group.loc[idx_b]
            
            # Check the heuristic conditions for both directions
            if  (b['overall_score'] > a['overall_score']):
                # Append the preference where 'a' is chosen over 'b'
                prefs.append({
                    'prompt': a['instruction'],
                    'chosen': b['model_response'],
                    'rejected': a['model_response'],
                    'source': 'uf'
                })
            elif (a['overall_score'] > b['overall_score']):
                # Append the preference where 'b' is chosen over 'a'
                prefs.append({
                    'prompt': a['instruction'],
                    'chosen': a['model_response'],
                    'rejected': b['model_response'],
                    'source': 'uf'
                })
    return prefs
  


# lmsys

import ast
def create_comparison_dataset_lmsys_r(dataset, threshold):
    prefs = []
#     row['conversation']
    # for the same interaction_id (same user_prompt), compare all the pairs of chosen and rejected
    # Group the data by 'interaction_id'
    for _, row in dataset.iterrows():
        if (row['winner'] == 'model_b'):
            prefs.append({
                    'prompt': row["prompt"], # need to make sure the conversation columns are lists, not strings!
                    'chosen': row['model_b_response'],
                    'rejected': row['model_a_response'],
                    'source': 'lmsys'
                })
        elif (row['winner'] == 'model_a'):
            prefs.append({
                    'prompt': row["prompt"],#[0]['content'],
                   'chosen': row['model_a_response'],
                    'rejected': row['model_b_response'],
                    'source': 'lmsys'
                })
    return prefs
  


In [2]:


PREFERENCE_DATA = create_comparison_dataset_r(train_prism, thr)
PREFERENCE_DATA_UF = create_comparison_dataset_uf_r(train_uf, thr)

PREFERENCE_DATA_LMSYS = create_comparison_dataset_lmsys_r(train_lmsys, thr)
training_data = list(pd.Series(PREFERENCE_DATA + PREFERENCE_DATA_UF + PREFERENCE_DATA_LMSYS).sample(size))
len(training_data)
th = str(thr*100)
with open(f"training_random_{size}.json", "w") as json_file:
    json.dump(training_data, json_file)


In [72]:
# merge test sets and only keep the prompt and add a new column for the source

# Add a new 'source' column to each test set
test_prism['source'] = 'prism'
test_uf['source'] = 'uf'
test_lmsys['source'] = 'lmsys'

# Standardize column names to 'prompt' for merging
test_prism = test_prism.rename(columns={'user_prompt': 'prompt'})
test_uf = test_uf.rename(columns={'instruction': 'prompt'})
test_lmsys = test_lmsys[['prompt', 'source']]  # Already has the correct name

# Merge all test sets
test = pd.concat([test_prism[['prompt', 'source']], 
                  test_uf[['prompt', 'source']], 
                  test_lmsys[['prompt', 'source']]], 
                 axis=0, ignore_index=True)

test


Unnamed: 0,prompt,source
0,What should I do if I feel like I am so unhapp...,prism
1,I would like to write a poem about my white cat.,prism
2,How can I start learning how to speak Japanese...,prism
3,Should people lose their jobs over not getting...,prism
4,I feel a connection to my team at work that ma...,prism
...,...,...
3560,Hi! What is Ai Alignment?,lmsys
3561,J'ai un casse-tête que je n'arrive pas à résoudre,lmsys
3562,How big is the Eiffeltower?,lmsys
3563,in fortnite what is the different between blue...,lmsys


In [43]:
test.to_csv("test_outputs.csv", index=False)

In [76]:
test.source.value_counts()

source
lmsys    1772
uf       1000
prism     793
Name: count, dtype: int64