In [None]:
import pandas as pd
import json
import numpy as np
from scipy.special import rel_entr
import os

from scipy.spatial.distance import jensenshannon
import math
from scipy.stats import entropy, wasserstein_distance
from scipy.spatial.distance import correlation
import dcor
from dcor import distance_correlation

# Run the notebook from the project directory named by the WORKING_DIR environment
# variable. os.getenv returns None when the variable is unset and os.chdir(None) fails
# with an opaque TypeError, so in that case the current directory is kept instead.
_working_dir = os.getenv("WORKING_DIR")
if _working_dir:
    os.chdir(_working_dir)
else:
    print(f"WORKING_DIR is not set; staying in {os.getcwd()}")

In [None]:
## Step 1: transform all raw data into distribution files saved as "train.json"
# Arguments
# JSON-lines file from which the 'premise' and 'hypothesis' columns are extracted.
ChaosNLI_file = 'NLI_explanations.json'
# Directory where the raw data produced by MJD-generator.ipynb was saved.
Raw_input = ''
# Directory where the distribution files should be saved.
Raw_output = ''

# Temperature coefficient for the softmax transformation
# (scores are divided by it before exponentiation).
sfmax_temperature = 20

In [None]:
# Load all data.

def _read_json_lines(path):
    """Read a JSON-lines file into a DataFrame (one record per non-blank line).

    Parameters:
    path (str): Path of the file to read; it is decoded as UTF-8.

    Returns:
    DataFrame: One row per JSON object found in the file.
    """
    with open(path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are skipped
        # because json.loads would raise on them.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)


chaos_df = _read_json_lines(ChaosNLI_file)
chaos_df_cols = chaos_df[['premise', 'hypothesis']]


# Load every JSON file found in the Raw_input folder, keyed by its file name.
file_dict = {}
for filename in sorted(os.listdir(Raw_input)):
    path = os.path.join(Raw_input, filename)
    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints) and unrelated
    # files (e.g. .DS_Store); only regular *.json files are raw data.
    if not (os.path.isfile(path) and filename.endswith('.json')):
        continue
    df = _read_json_lines(path)
    # Column-wise concat aligns on the row index: row i of the raw file is paired with
    # the premise/hypothesis found in row i of ChaosNLI_file.
    df = pd.concat([df, chaos_df_cols], axis=1)
    file_dict[filename] = df

In [None]:
# Display the names of the raw files that were loaded; these names are the keys
# used to look the frames up in the cells below.
file_dict.keys()

In [None]:
# save the distribution files for distribution comparison and fine-tuning comparison

def normalize(instance):
    """
    Rescale scores so that they sum to 1 along the third axis (axis=2).

    Parameters:
    instance (array-like): Scores with at least three dimensions.

    Returns:
    ndarray: Array of the same shape where every slice along axis 2 has been
    divided by its own sum.
    """
    scores = np.array(instance)
    # keepdims leaves a length-1 axis in place so the totals broadcast against the scores
    totals = scores.sum(axis=2, keepdims=True)
    return scores / totals

def softmax(instance, temperature=sfmax_temperature):
    """
    Temperature-scaled softmax along the third axis (axis=2).

    Parameters:
    instance (array-like): Scores with at least three dimensions.
    temperature (float): Divisor applied to the scores before exponentiation;
        defaults to the value of `sfmax_temperature` at definition time.

    Returns:
    ndarray: Array of the same shape whose slices along axis 2 sum to 1.
    """
    scaled = np.array(instance) / temperature
    if scaled.size:
        # Subtracting the per-slice maximum leaves the result mathematically unchanged
        # but keeps np.exp from overflowing to inf (and inf / inf = nan) on large scores.
        scaled = scaled - scaled.max(axis=2, keepdims=True)
    # The exponentials are computed once and reused for numerator and denominator.
    exp_scores = np.exp(scaled)
    return exp_scores / exp_scores.sum(axis=2, keepdims=True)


# Save the distributions obtained with both transformations (normalization and softmax).

def _save_distribution(df, number, transform, tag, out_dir):
    """
    Transform the raw scores of one setting, average them and write `train.json`.

    Parameters:
    df (DataFrame): Frame holding "premise", "hypothesis" and f"scores adding {number}".
        Two derived columns are added to it in place:
        f"scores adding {number} {tag} probs" and f"scores adding {number} {tag} probs avg".
    number (str): Which score column to use ('zero', 'one', ...).
    transform (callable): Maps one array of scores to probabilities (normalize or softmax).
    tag (str): Name of the transformation used in the derived column names.
    out_dir (str): Folder that receives `train.json`; created if missing.
    """
    scores = df[f'scores adding {number}']
    if number == 'zero':
        # The 'zero' scores need one extra dimension on the second axis before they can be
        # transformed. The expansion is done on a local copy only: writing it back to the
        # shared frame (as was done before) made the second transformation, or any re-run
        # of this cell, expand the already-expanded column again and silently compute the
        # probabilities over the wrong axis.
        scores = scores.apply(lambda x: np.expand_dims(x, axis=1))

    probs_col = f'scores adding {number} {tag} probs'
    avg_col = f'{probs_col} avg'
    df[probs_col] = scores.apply(transform)
    # average over the first two axes, leaving one value per position of the last axis
    df[avg_col] = df[probs_col].apply(lambda x: np.mean(x, axis=(0, 1)))

    # keep only premise / hypothesis and the averaged distribution, exposed as "label"
    df_process = df[["premise", "hypothesis", avg_col]].rename(columns={avg_col: "label"})

    os.makedirs(out_dir, exist_ok=True)
    df_process.to_json(f"{out_dir}/train.json", orient="records", lines=True)


# (column tag, transformation, output root) for each kind of distribution file
_transformations = [
    ('norm', normalize, f"{Raw_output}/train"),
    ('softmax', softmax, f"{Raw_output}/train_softmax/T_{sfmax_temperature}"),
]

for tag, transform, out_root in _transformations:
    for number in ['zero', 'one', 'two', 'three', 'four']:
        for mode in ['without_label', 'with_label']:
            _save_distribution(
                file_dict[f'Llama3_{mode}.json'],
                number,
                transform,
                tag,
                f"{out_root}/{mode}/add_{number}",
            )



In [None]:
## Step 2: calculate the metrics in distribution comparison
# all calculation methods

def kl_divergence(P, Q, epsilon=1e-10):
    """
    Kullback-Leibler divergence KL(P || Q) between two probability distributions.

    Both inputs are clipped to [epsilon, 1] and then re-normalized to sum to 1
    before the divergence is computed, so zero entries cannot cause a division
    by zero.

    Parameters:
    P (array-like): The first probability distribution.
    Q (array-like): The second probability distribution.
    epsilon (float): Lower bound applied to every entry of P and Q.

    Returns:
    float: The KL divergence value.
    """
    def _as_distribution(values):
        # float64 array -> clip away zeros -> rescale so the entries sum to 1
        clipped = np.clip(np.asarray(values, dtype=np.float64), epsilon, 1)
        return clipped / np.sum(clipped)

    return np.sum(rel_entr(_as_distribution(P), _as_distribution(Q)))


def jensen_shannon(p, q):
    """
    Jensen-Shannon distance: the square root of the mean KL divergence of `p` and `q`
    from their midpoint distribution.

    Parameters:
    p (array-like): The first probability distribution.
    q (array-like): The second probability distribution.

    Returns:
    float: The Jensen-Shannon distance (0 when the two distributions are identical).
    """
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)
    m = (p + q) / 2
    divergence = (kl_divergence(p, m) + kl_divergence(q, m)) / 2
    # For almost identical inputs the true value is ~0 and floating-point rounding can
    # push the sum slightly below zero, which would make math.sqrt raise ValueError.
    return math.sqrt(max(divergence, 0.0))


def tvd(model_probs, human_probs):
    """
    Total variation distance between two (arrays of) probability distributions.

    The classes are expected on the last axis: the absolute differences are summed
    over that axis and halved, so every leading axis is preserved in the result.

    Parameters:
    model_probs (array-like): Probabilities of the first distribution(s).
    human_probs (array-like): Probabilities of the second distribution(s).

    Returns:
    ndarray or float: Half of the L1 distance computed along the last axis.

    Raises:
    AssertionError: If any entry of either input lies outside [0, 1].
    """
    model_probs = np.array(model_probs)
    human_probs = np.array(human_probs)
    # both inputs must hold valid probabilities
    for probs in (model_probs, human_probs):
        assert probs.max() <= 1.0 and probs.min() >= 0

    abs_diff = np.abs(model_probs - human_probs)
    return abs_diff.sum(axis=-1) / 2



In [None]:
# calculate metrics

def get_metric_in_Dist(Chaos_dir, target_dir,if_uni=0):
    """
    Compare the label distributions stored in two `train.json` files.

    Parameters:
    Chaos_dir (str): Folder whose `train.json` holds the reference distributions
        (JSON lines, each with a 'label' entry).
    target_dir (str): Folder whose `train.json` holds the distributions to compare.
        The file is always read, even when `if_uni` is set.
    if_uni (int or bool): When truthy, every reference row is compared against the
        uniform distribution [1/3, 1/3, 1/3] instead of the target labels.

    Returns:
    list of float: [mean kl_divergence, mean jensenshannon, mean tvd,
    distance correlation], each rounded to 5 decimals.
    """
    def _load(directory):
        # one JSON record per non-blank line
        with open(os.path.join(directory, 'train.json'), encoding="utf-8") as f:
            return pd.DataFrame([json.loads(line) for line in f if line.strip()])

    record_df = _load(Chaos_dir)
    target_df = _load(target_dir)

    if if_uni:
        # uniform-distribution baseline
        record_df['target'] = record_df['label'].apply(lambda x: [1/3,1/3,1/3])
    else:
        # assignment aligns on the row index: row i is compared with target row i
        record_df['target'] = target_df['label']

    # Explicit dispatch table instead of eval() on the metric names; the order of the
    # entries is the order of the values in the returned list.
    metric_functions = {
        'kl_divergence': kl_divergence,
        'jensenshannon': jensenshannon,
        'tvd': tvd,
    }

    result_list = []
    for metric_fn in metric_functions.values():
        per_row = record_df.apply(lambda row: metric_fn(row['label'], row['target']), axis=1)
        result_list.append(float(format(per_row.mean(), '.5f')))

    # distance correlation between the two stacked (rows x classes) label matrices
    labels = np.vstack(record_df['label'].to_list())
    targets = np.vstack(record_df['target'].to_list())
    result_list.append(float(format(distance_correlation(labels, targets), '.5f')))

    return result_list


# get fine-tuning results, after conduct fine-tuning by MJD-fine-tuning

# def get_ft_results(inputdir):
#     ft_result_list = []
#     for model in ['bert','roberta']:
#         for type in ['eval_results.json','test_var_results.json']:
#             result_file = os.path.join("output/",f"{model}/{inputdir}/{type}")
#             with open(result_file) as f:
#                 eval_results = json.load(f)
#             for metric in ['eval_kl_divergence','eval_loss','eval_weighted_F1']:
#                 ft_result_list.append(float(format(eval_results[metric],'.5f')))
#     return ft_result_list
                

In [None]:
# save comparison results to excel

import openpyxl

# One result list per comparison, as returned by get_metric_in_Dist.
data = []

# Reference passed as the first argument (Chaos_dir) of get_metric_in_Dist, which reads
# "train.json" inside it, so this must be a folder containing that file.
ChaosNLI_file_341 = ''
# Folders compared against the reference; each must contain a "train.json" as well.
MNLIonehot_dir = ''
MNLI_distribution_dir = ''
VariErr_distribution_dir = ''
# Every file holds one JSON record per line, shaped as
# {'premise':...,'hypothesis':...,'label':[probE, probN, probC]}


# Baselines: one row of metrics per target.
# NOTE(review): ChaosNLI_file is set to a file name ('NLI_explanations.json') in step 1,
# but here it is passed as target_dir and joined with 'train.json' - confirm that a
# folder was intended.
for filename in [ChaosNLI_file, MNLIonehot_dir, MNLI_distribution_dir, VariErr_distribution_dir]:
    data.append(get_metric_in_Dist(ChaosNLI_file_341, filename))


# Llama origin
# Llama_origin = ".../add_zero/"
# Llama_origin_sf = "/train_softmax/T_20/.../add_zero/"
# for filename in [Llama_origin, Llama_origin_sf]:
#     data.append(get_metric_in_Dist(ChaosNLI_file_341, filename))


# Llama with VariErr
# .../add_{how_many}/train.json

# for [ifnorm, ifwith] in [['train','Llama3-without-label'],['train','Llama3-with-label'],['train_softmax/T_20','Llama3-without-label'],['train_softmax/T_20','Llama3-with-label']]:
#     for howmany in ['add_four','add_three','add_two','add_one']:
#         filename = f".../Llama3/{ifwith}/{ifnorm}/Llama/{howmany}/"
#         data.append(get_metric_in_Dist(ChaosNLI_file_341, filename))



wb = openpyxl.Workbook()
ws = wb.active


# Rows are written in groups of x; each complete group is followed by one extra row
# holding the column-wise average of that group.
x = 4
# number of average rows inserted so far (shifts every later row down by one each)
row_offset = 0
for i in range(0, len(data), x):
    group = data[i:i + x]

    # the group's own rows
    for j, row_values in enumerate(group):
        for k, value in enumerate(row_values):
            ws.cell(row=i + j + 1 + row_offset, column=k + 1, value=value)

    # the average row, only when the group is complete
    if len(group) == x:
        avg_row = [sum(row[col] for row in group) / x for col in range(len(data[0]))]
        for k, value in enumerate(avg_row):
            ws.cell(row=i + x + 1 + row_offset, column=k + 1, value=value)
        row_offset += 1

# save as Excel
wb.save("output.xlsx")
print("Data written to output.xlsx")


In [None]:
## Step 3: ternary visualization

import matplotlib.pyplot as plt
import ternary
import numpy as np

# Llama
# Fill this list with the distributions to plot, one entry per panel (e.g. Llama3,
# Llama3 with explanations, Llama3 with explicit explanations). Each entry must support
# `.apply` (e.g. a pandas Series) and hold three probabilities per element.
datasets = []

data_name = ['Llama3','+ EXs','+ explicit EXs']
colors = ['k', 'orange', 'purple']
markers = [ 'D', 'v', '<']

# Indices of the panels whose points should be spread out around their centroid before
# plotting (purely visual); leave empty to plot every panel unscaled.
scale_up_panels = ()
# scale coefficient used for the panels listed in scale_up_panels
scale_factor = 3.3


def scale_center(data, scale_factor):
    """Move every point away from the mean of `data` by `scale_factor`."""
    center = np.mean(data, axis=0)
    return (data - center) * scale_factor + center


if len(datasets) > len(data_name):
    raise ValueError(
        f"{len(datasets)} datasets given but only {len(data_name)} names/colors/markers are defined"
    )

# probabilities -> percentages, to match the scale=100 ternary axes below
# (the placeholder list above is named `datasets`; the previous `data_sets` was undefined)
distributions = [series.apply(lambda x: [x[0]*100,x[1]*100,x[2]*100]).tolist() for series in datasets]

n_panels = len(data_name)
fig, axs = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)
axs = axs.ravel()

# `points` (not `data`) so the metrics list of the previous step is not overwritten
for i, points in enumerate(distributions):
    if i in scale_up_panels:
        points = scale_center(points, scale_factor).tolist()

    figure, tax = ternary.figure(ax=axs[i], scale=100)
    tax.boundary(linewidth=2.0)
    tax.gridlines(multiple=10, color="grey")
    tax.scatter(points, marker=markers[i], color=colors[i], label=data_name[i])
    tax.ticks(axis='lbr', linewidth=1, multiple=10)

    # hide the default matplotlib frame and ticks; only the ternary axes are shown
    tax.clear_matplotlib_ticks()
    tax.get_axes().axis('off')
    tax.left_axis_label("← Entailment",fontsize=32, offset=0.16)
    tax.right_axis_label("← Neutral",fontsize=32, offset=0.16)
    tax.bottom_axis_label("Contradiction →",fontsize=32, offset=0.04)
    tax.legend(fontsize=18)

plt.tight_layout()
plt.savefig('triangle_k.pdf', format='pdf')
plt.show()
