# Compute Correlation

This script processes the results of cross-validation and computes various correlations and statistical comparisons.

## Steps:

1. **Compute Cross-Validation Mean:**  
   - Given **5 different CSV files** with predictions, the mean of the cross-validation results is calculated.

2. **Compute Spearman Correlation:**  
   - For each **ViFi-CLIP prediction file**, the **Spearman correlation** with human ratings is computed.

3. **Compare Different Training Set Sizes:**  
   - Correlations are also computed for ViFi-CLIP models trained on:
     - **1k** data points
     - **2.5k** data points  

4. **Ablation Study - Statistical Comparison:**  
   - Different **model configurations** (their cross-validation means) are compared using a **t-test** to assess significant differences.
   - **Holm-corrected p-values** are reported to account for multiple comparisons.


------
### **Naming**:
Folders of cross validation results are named in the following format: 

*vitb(16|32)* _ *(2|8)* _ *(humanedit|magicbrush)*
- ViT-B/16 or ViT-B/32 refers to the model backbone used.
- 2 or 8 indicates the frame length of the visual encoder.
- HumanEdit or MagicBrush specifies the dataset used.

Install required libraries.

In [None]:
#!pip install numpy==1.22.4 -q

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.multitest as smm

from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import ttest_ind

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [6]:
# given a directory of cross-validation results, compute the mean over all predictions
def mean_predictions_from_cv(directory):
    all_data = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path)
            all_data.append(df)

    combined_df = pd.concat(all_data, ignore_index=True)
    mean_scores = combined_df.groupby(["id", "turn"], as_index=False)["score"].mean()
    return mean_scores

In [7]:
vifi_mean = mean_predictions_from_cv("vitb16_2_humanedit")

### Spearman correlation
Spearman correlation coefficient is chosen because human ratings are not normally distributed.

In [8]:
# given two dataframes (ViFi-CLIP predictions as well as human ratings),
# compute their correlation across all aspects

def get_correlation_df(vifi, human):
    merged_df = pd.merge(vifi, human, on=["id", "turn"])

    correlation_results = []

    merged_df = merged_df.dropna(
        subset=["score", "alignment", "quality", "consistency", "overall"]
    )

    for col in merged_df.columns:
        if col in ["alignment", "quality", "consistency", "overall"]:

            spearman_corr, spearman_p_value = stats.spearmanr(
                merged_df["score"], merged_df[col]
            )
            correlation_results.append(
                {
                    "df1": "score",
                    "df2": col,
                    "spearman_corr": spearman_corr,
                    "spearman_p_value": spearman_p_value,
                }
            )

    correlation_results_df = pd.DataFrame(correlation_results)
    return correlation_results_df

In [9]:
# given a folder containing cross-validation results,
# compute the correlation between the mean predictions and human ratings

def get_table(folder, human, mean=False):
    dfs = []
    for i in range(1, 6):
        df = pd.read_csv(f"{folder}/f{i}.csv")
        corr_df = get_correlation_df(df, human)
        dfs.append(corr_df)

    # concat all dfs
    combined_df = pd.concat(dfs, axis=0, ignore_index=True)

    if mean:
        combined_df = combined_df.drop(columns=["df1"], axis=1)
        grouped = combined_df.groupby("df2").agg("mean")
    else:
        grouped = combined_df.groupby("df2").agg(list)

    if "df1" in grouped.columns:
        grouped = grouped.drop(["df1"], axis=1)

    return grouped

In [10]:
# load human annotations
human = pd.read_csv("../annotations_mean_3.csv")

In [6]:
# human correlation of model trained on 1k data
get_correlation_df(
    pd.read_csv("vitb16_2_humanedit_1k.csv"), pd.read_csv("../annotations_mean_3.csv")
)

Unnamed: 0,df1,df2,spearman_corr,spearman_p_value
0,score,alignment,0.099649,0.030068
1,score,quality,0.013509,0.769259
2,score,consistency,0.031282,0.496869
3,score,overall,0.093982,0.040829


In [7]:
# human correlation of model trained on 2.5k data
get_correlation_df(
    pd.read_csv("vitb16_2_humanedit_2.5k.csv"), pd.read_csv("../annotations_mean_3.csv")
)

Unnamed: 0,df1,df2,spearman_corr,spearman_p_value
0,score,alignment,0.101881,0.026553
1,score,quality,0.022952,0.618166
2,score,consistency,0.045734,0.320419
3,score,overall,0.096887,0.034964


In [103]:
# human correlation of crossvalidation with ViT-B/16, 8 frames, HumanEdit dataset
get_table("vitb16_8_humanedit", human, mean=True)

Unnamed: 0_level_0,spearman_corr,spearman_p_value
df2,Unnamed: 1_level_1,Unnamed: 2_level_1
alignment,0.131537,0.004601
consistency,0.0265,0.565761
overall,0.09017,0.052408
quality,0.017809,0.69913


In [104]:
# human correlation of crossvalidation with ViT-B/32, 2 frames, HumanEdit dataset
get_table("vitb32_2_humanedit", human, mean=True)

Unnamed: 0_level_0,spearman_corr,spearman_p_value
df2,Unnamed: 1_level_1,Unnamed: 2_level_1
alignment,0.106851,0.020768
consistency,0.010432,0.821388
overall,0.095418,0.038477
quality,-0.001649,0.947281


In [90]:
# human correlation of crossvalidation with ViT-B/16, 2 frames, HumanEdit dataset
get_table("vitb16_2_humanedit", human, mean=True)

Unnamed: 0_level_0,spearman_corr,spearman_p_value
df2,Unnamed: 1_level_1,Unnamed: 2_level_1
alignment,0.115268,0.012867
consistency,0.04772,0.303428
overall,0.093533,0.044768
quality,0.023895,0.607538


In [None]:
human_good_bad = pd.read_csv("../annotations_mean_7_78.csv")

In [97]:
# human correlation of crossvalidation with ViT-B/16, 2 frames, HumanEdit dataset
# samples are extremely well or poor executed image edits
get_table("vitb16_2_humanedit", human_good_bad, mean=True)

Unnamed: 0_level_0,spearman_corr,spearman_p_value
df2,Unnamed: 1_level_1,Unnamed: 2_level_1
alignment,0.415044,0.000258
consistency,0.278426,0.019305
overall,0.403861,0.000385
quality,0.284243,0.017097


In [11]:
vitb16_2_humanedit = get_table("vitb16_2_humanedit", human)
vitb16_2_magicbrush = get_table("vitb16_2_magicbrush", human)
vitb16_8_humanedit = get_table("vitb16_8_humanedit", human)
vitb32_2_humanedit = get_table("vitb32_2_humanedit", human)

In [12]:
comparisons = {
    "vitb16_2_humanedit": vitb16_2_humanedit,
    "vitb16_8_humanedit": vitb16_8_humanedit,
    "vitb32_2_humanedit": vitb32_2_humanedit,
}

results = {}
p_values = []

# compute t-tests and p-values for all aspects
for name, df in comparisons.items():
    results[name] = {}
    for aspect in vitb16_2_humanedit.index:
        # spearman correlation for each aspect
        corr_df1 = vitb16_2_humanedit.loc[aspect, "spearman_corr"]
        corr_df2 = df.loc[aspect, "spearman_corr"]

        # t-test given 5 values for each model and each aspect
        t_stat, p_value = ttest_ind(corr_df1, corr_df2, equal_var=False)

        results[name][aspect] = {"t-stat": t_stat, "p-value": p_value}
        p_values.append(p_value)

# Holm-correction
rejected, pvals_corrected, _, _ = smm.multipletests(p_values, method="holm")

p_value_idx = 0

for name, aspects in results.items():
    print(f"\n=== Comparison: vitb16_2_humanedit vs {name} ===")
    for aspect, values in aspects.items():
        original_p_value = values["p-value"]
        corrected_p_value = pvals_corrected[p_value_idx]

        print(
            f"{aspect}: t={values['t-stat']:.3f}, p={original_p_value:.5f}, p-corrected={corrected_p_value:.5f}"
        )

        p_value_idx += 1


=== Comparison: vitb16_2_humanedit vs vitb16_2_humanedit ===
alignment: t=0.000, p=1.00000, p-corrected=1.00000
consistency: t=0.000, p=1.00000, p-corrected=1.00000
overall: t=0.000, p=1.00000, p-corrected=1.00000
quality: t=0.000, p=1.00000, p-corrected=1.00000

=== Comparison: vitb16_2_humanedit vs vitb16_8_humanedit ===
alignment: t=-3.430, p=0.00902, p-corrected=0.08114
consistency: t=6.662, p=0.00052, p-corrected=0.00573
overall: t=0.653, p=0.53222, p-corrected=1.00000
quality: t=1.693, p=0.15832, p-corrected=1.00000

=== Comparison: vitb16_2_humanedit vs vitb32_2_humanedit ===
alignment: t=2.023, p=0.07926, p-corrected=0.63408
consistency: t=11.100, p=0.00001, p-corrected=0.00015
overall: t=-0.450, p=0.66916, p-corrected=1.00000
quality: t=6.556, p=0.00068, p-corrected=0.00677
