#### Make Combined LLM Log Likelihood Dataset

In [2]:
from datasets import Dataset, load_dataset
import pandas as pd
from functools import reduce
import os 

def load_log_likelihoods(model_name): 

    paths = [
        f"/scratch/mr7401/log_likelihoods_Truncation_Fixed_BS1/{model_name}/log_likelihood_NEW.jsonl",
        f"/scratch/mr7401/log_likelihoods_Truncation_Fixed_BS1/{model_name}/log_likelihood_2000.jsonl",
        ]
    
    valid_paths = []
    for path in paths: 
        if os.path.exists(path): 
            valid_paths.append(path)
    
    return load_dataset("json", data_files= valid_paths, split="train", streaming=False)

def combine_log_likelihoods(model_names):
    # Load datasets for each model name
    dataset_list_as_pandas = []
    
    for model_name in model_names:
        dataset = load_log_likelihoods(model_name)
        d= dataset.to_pandas()
        # Check if the dataset is empty
        if d.empty:
            print(f"Dataset for {model_name} is empty.")
            continue
        dataset_list_as_pandas.append(d)
    
    ## Merge all together into 1 CSV with columns for each model's LL
    merged_df = reduce(lambda left, right: pd.merge(left, right, on=['generation_id', 'generation', 'gen_source_model'], how='inner'), dataset_list_as_pandas)
    merged_hf_ds = Dataset.from_pandas(merged_df)
    
    return merged_df, merged_hf_ds

# Example usage
model_names = ['GPT2',
 'GPT2Large',
 'Gemma2_2B',
 'Llama31_8B',
 'Llama32_3B',
 'OPT125M',
 'OPT2_7B',
 'OPT350M',
 'OPT6_7B',
 'Qwen2_5_0_5B',
 'Qwen2_5_3B']  
df, dataset = combine_log_likelihoods(model_names)

In [3]:
#df.to_csv("llms_log_likelihoods_combined.csv", index=False)

In [44]:
import numpy as np 

def mean(lst): 
    return sum(lst) / len(lst)

def compute_kl_difference(m1_m1_ll, m1_m2_ll, m2_m1_ll, m2_m2_ll):
    kl_diff = -(mean(m1_m1_ll) + mean(m1_m2_ll)) + (mean(m2_m1_ll) + mean(m2_m2_ll))
    return kl_diff
    
def generate_kl_diff_samples(merged_df, m1, m2,  smoothing): 
    n_batches = int(np.floor((len(merged_df)/smoothing)))
    m1_generations = merged_df[merged_df["gen_source_model"] == m1]
    m2_generations = merged_df[merged_df["gen_source_model"] == m2]
    kl_divs = []
    m1_samples = []
    m2_samples = []
    for i in range(n_batches): 
        m1_sample = m1_generations.sample(smoothing)
        
        m1_m1_ll, m1_m2_ll = m1_sample[f"{m1}_ll"].tolist(), m1_sample[f"{m2}_ll"].tolist()
        
        m2_sample = m2_generations.sample(smoothing)
        m2_m1_ll, m2_m2_ll = m2_sample[f"{m1}_ll"].tolist(), m2_sample[f"{m2}_ll"].tolist()
        kl = compute_kl_difference(m1_m1_ll, m1_m2_ll, m2_m1_ll, m2_m2_ll)
        kl_divs.append(kl)
        m1_samples.append(m1_sample["generation"].to_list())
        m2_samples.append(m2_sample["generation"].to_list())

    d = pd.DataFrame.from_dict({"M1_Name":[m1] * len(kl_divs), "M2_Name":[m2] * len(kl_divs), "M1": m1_samples, "M2": m2_samples, "KL_Diff": kl_divs})    
    return d

from itertools import combinations 
model_names = ['GPT2',
 'GPT2Large',
 'Gemma2_2B',
 'Llama31_8B',
 'Llama32_3B',
 'OPT125M',
 'OPT2_7B',
 'OPT350M',
 'OPT6_7B',
 'Qwen2_5_0_5B',
 'Qwen2_5_3B']

i = 0
all_samples = []
for pair in combinations(model_names, 2): 
    m1, m2 = pair
    d = generate_kl_diff_samples(df, m1 = m1, m2= m2, smoothing =5) 
    all_samples.append(d)
    i = i + 1 
    if i%10 == 0: 
        print(f"Completed {i} model pairs")
    

Completed 10 out of  combinations
Completed 20 out of  combinations
Completed 30 out of  combinations
Completed 40 out of  combinations
Completed 50 out of  combinations


In [45]:
a = pd.concat(all_samples)
a

Unnamed: 0,M1_Name,M2_Name,M1,M2,KL_Diff
0,GPT2,GPT2Large,[\nFrom Terraria Wiki\n\nThis article is about...,[All of the above reasons are why the question...,-238.138899
1,GPT2,GPT2Large,"[A couple years ago, a friend and I stumbled a...","[If it is the first, then it's an upgrade.\n\n...",646.224764
2,GPT2,GPT2Large,"[Catherine and Charles had gone to a hotel, th...",[I'm excited!\n\nThe second big project that I...,-884.299009
3,GPT2,GPT2Large,[Lebanon will not stop fighting for its own fu...,[By Dan Harmon\n\nThere are a number of people...,-439.265242
4,GPT2,GPT2Large,"[""Some people think we're going to go back to ...","[I am a retired police officer, but I'm not a ...",570.729003
...,...,...,...,...,...
4371,Qwen2_5_0_5B,Qwen2_5_3B,[A. 合同是实践性合同\nB. 根据合同当事人间的权利义务关系可以划分为合同债权和合同债务...,"[public"" in public\n```\n\nPlease determine wh...",209.742337
4372,Qwen2_5_0_5B,Qwen2_5_3B,"[from PIL import Image\n\nprint(""What do you w...",[void?\nA: A A: A: A N: R: R: R: R: R: R: R: R...,221.398855
4373,Qwen2_5_0_5B,Qwen2_5_3B,[Human: You are faced with a challenge titled:...,[Human\n# The human body contains approximatel...,-25.017506
4374,Qwen2_5_0_5B,Qwen2_5_3B,"[If you don't take care of that part, you are ...",[Human\n# is a Human\n#\n# 46. __str__...,-237.640222


In [47]:
#a.to_csv("all_models_kldiffs.csv")

In [48]:
#a_sample = a.sample(10000)
#a_sample.to_csv("all_models_kldiffs_10K_sample.csv")

In [49]:
a_sample

Unnamed: 0,M1_Name,M2_Name,M1,M2,KL_Diff
3765,Gemma2_2B,Qwen2_5_0_5B,[On today’s edition of “The State of the Union...,[A. 错误\nB. 正确\n答案:\nA\n\n下列哪种情况会引发休克____\nA. 宫...,235.627081
3112,GPT2,OPT125M,[A BOOZE of U.S. senators came out against an ...,[I would personally just get him. I don't have...,415.006911
1606,GPT2,GPT2Large,"[""You don't think I've done it before,"" he sai...","[""There is a way to bring justice, a way to br...",98.055182
3378,GPT2,OPT2_7B,"[The next day, the Daily Mail reported that th...",[I mean I’m a lesbian and I have dated guys fo...,1223.349826
1832,Gemma2_2B,OPT2_7B,"[/*\n#\n# Copyright (C) 2005-2011, Talend, Inc...","[I think his name was Richard, was from my hom...",493.958953
...,...,...,...,...,...
2478,OPT2_7B,OPT6_7B,[Why the fuck not you silly fuck?\nBecause I t...,[I love how he just kept hitting it.\nI think ...,323.673997
1033,Gemma2_2B,OPT125M,[The first week of winter in 2023 has just arr...,[5.9.0 Patch Information\n\nIn the following p...,786.697523
1562,GPT2Large,Qwen2_5_3B,[The University of Chicago is a very progressi...,[In is in order that it may be a time to be a ...,1654.999331
1067,OPT2_7B,Qwen2_5_0_5B,[Lol how the heck do you get this? Is it from ...,"[# if there are no children, then\n# the node ...",-1063.205452


#### DataLoader 

In [63]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from ast import literal_eval

class KLDataset(Dataset):
    def __init__(self, csv_file):
        # Load the CSV file into a DataFrame
        self.data = pd.read_csv(csv_file)
    
    def __len__(self):
        # Return the number of samples in the dataset
        return len(self.data)
    
    def __getitem__(self, idx):
        # Get the row at the specified index
        row = self.data.iloc[idx]
        m1_samples = literal_eval(row["M1"]) # read as a list
        m1_combined_str = ", ".join([f"Sample {i+1}: {entry}" for i, entry in enumerate(m1_samples)])

        m2_samples = literal_eval(row["M2"]) # read as a list
        m2_combined_str = ", ".join([f"Sample {i+1}: {entry}" for i, entry in enumerate(m2_samples)])
                
        # Combine the M1 and M2 samples 
        x = f"Model 1: {m1_combined_str}, Model 2: {m2_combined_str}"
        
        # Get the KL_Diff as the label
        label = row['KL_Diff']
        
        return x, label

# Example usage
csv_file = "all_models_kldiffs_10K_sample.csv"  # Replace with the path to your CSV file
dataset = KLDataset(csv_file)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# Iterate through the dataloader
for inputs, labels in dataloader:
    #print("Input:", inputs)
    print("Labels:", labels)
    break

Labels: tensor([235.6271, 415.0069], dtype=torch.float64)
