In [2]:
import pandas as pd
import numpy as np

In [3]:
import pickle

with open("./data/Cross_vec_index.pkl", "rb") as f:
    cross_vec_lyric = pickle.load(f)

with open("./data/Cross_vec_index_diary.pkl", "rb") as f:
    cross_vec_diary = pickle.load(f)

with open("./data/Attn_vec_index.pkl", "rb") as f:
    attn_vec_lyric = pickle.load(f)

with open("./data/Attn_vec_index_diary.pkl", "rb") as f:
    attn_vec_diary = pickle.load(f)

# =======================================================================

with open("./data/Cross_test_vec_index.pkl", "rb") as f:
    cross_test_vec_lyric = pickle.load(f)

with open("./data/Cross_test_vec_index_diary.pkl", "rb") as f:
    cross_test_vec_diary = pickle.load(f)

with open("./data/Attn_test_vec_index.pkl", "rb") as f:
    attn_test_vec_lyric = pickle.load(f)

with open("./data/Attn_test_vec_index_diary.pkl", "rb") as f:
    attn_test_vec_diary = pickle.load(f)

# =======================================================================


cross_test_vec_lyric = np.array(cross_test_vec_lyric)
cross_test_vec_diary = np.array(cross_test_vec_diary)
cross_test_data = np.concat([cross_test_vec_lyric, cross_test_vec_diary], axis=0)

attn_test_vec_lyric = np.array(attn_test_vec_lyric)
attn_test_vec_diary = np.array(attn_test_vec_diary)
attn_test_data = np.concat([attn_test_vec_lyric, attn_test_vec_diary], axis=0)

In [4]:
import numpy as np
from scipy.stats import entropy, f_oneway, ks_2samp, wilcoxon
from sklearn.metrics import pairwise_kernels


# ==============================
# 1. Helper Functions
# ==============================

def get_hist(vec, bins=50):
    """Normalize histogram for KL-divergence."""
    hist, _ = np.histogram(vec, bins=bins, density=True)
    hist = hist + 1e-12  # smoothing
    return hist


def kl_divergence(a, b):
    """Symmetric KL divergence."""
    ha = get_hist(a)
    hb = get_hist(b)
    return entropy(ha, hb) + entropy(hb, ha)


def mmd_rbf(x, y, gamma=1.0):
    """
    Maximum Mean Discrepancy using RBF kernel.
    Lower = distributions more similar.
    """
    Kxx = pairwise_kernels(x, x, metric='rbf', gamma=gamma).mean()
    Kyy = pairwise_kernels(y, y, metric='rbf', gamma=gamma).mean()
    Kxy = pairwise_kernels(x, y, metric='rbf', gamma=gamma).mean()
    return Kxx + Kyy - 2 * Kxy


# ==============================
# 2. Flatten vectors
# ==============================
cross_diary = cross_test_vec_diary
cross_lyric = cross_test_vec_lyric

attn_diary = attn_test_vec_diary
attn_lyric = attn_test_vec_lyric


# ==============================
# 3. Compute Metrics
# ==============================

metrics = {}

# KL-divergence
metrics["KL_cross"] = kl_divergence(cross_diary, cross_lyric)
metrics["KL_attn"] = kl_divergence(attn_diary, attn_lyric)

# MMD (RBF kernel)
metrics["MMD_cross"] = mmd_rbf(cross_diary, cross_lyric)
metrics["MMD_attn"] = mmd_rbf(attn_diary, attn_lyric)


# ==============================
# 5. Print Results
# ==============================

print("\n===== Distribution Similarity Comparison =====\n")

print("KL Divergence (lower = more similar):")
print("  Cross Attention:", metrics["KL_cross"])
print("  Self Attention :", metrics["KL_attn"], "\n")

print("MMD (lower = more similar):")
print("  Cross Attention:", metrics["MMD_cross"])
print("  Self Attention :", metrics["MMD_attn"], "\n")



===== Distribution Similarity Comparison =====

KL Divergence (lower = more similar):
  Cross Attention: 0.30065256439411925
  Self Attention : 0.3479319723758192 

MMD (lower = more similar):
  Cross Attention: 0.007209934070093127
  Self Attention : 0.01177262790987349 



In [6]:
from statsmodels.multivariate.manova import MANOVA

# ===== 1) 데이터프레임 생성 =====
cross_df = pd.DataFrame(
    np.vstack([cross_lyric, cross_diary])
)
cross_df["label"] = [0]*len(cross_lyric) + [1]*len(cross_diary)
cross_df["group"] = "cross"

attn_df = pd.DataFrame(
    np.vstack([attn_lyric, attn_diary])
)
attn_df["label"] = [0]*len(attn_lyric) + [1]*len(attn_diary)
attn_df["group"] = "self"

df = pd.concat([cross_df, attn_df], axis=0)

# ===== 2) 컬럼명을 'f0', 'f1', ... 형식으로 변환 =====
old_cols = df.columns.tolist()
new_cols = [f"f{i}" for i in range(len(old_cols)-2)] + ["label", "group"]
df.columns = new_cols

# ===== 3) MANOVA 수행 =====
feature_cols = df.columns[:-2]  # 마지막 2개(label, group) 제외
formula = " + ".join(feature_cols)

manova = MANOVA.from_formula(
    f"{formula} ~ group",
    data=df
)

result = manova.mv_test()
result = manova.mv_test()
stat = result['group']['stat']
clean_stat = stat.drop(columns=["Num DF", "Den DF"])

clean_stat

Unnamed: 0,Value,F Value,Pr > F
Wilks' lambda,0.401466,3.987297,0.0
Pillai's trace,0.598534,3.987297,0.0
Hotelling-Lawley trace,1.490869,3.987297,0.0
Roy's greatest root,1.490869,3.987297,0.0
