In [None]:
import pandas as pd

# Load the cleaned dataset; judging by the file name it may still contain
# missing values, which the per-pair dropna() calls further down deal with.
df = pd.read_csv('./output/5_cleaned_with_missing.csv')

# Predictor columns whose dependence on each trait score is tested below.
selected_features = [
    'Height(cm)', 'Weight(kg)',
    'Year', 'Month', 'Day', 'Day index', 'Day_sin', 'Day_cos',
    'Latitude', 'Longitude',
    'Occupation_idx', 'Gender_idx',
    'Arched_Eyebrows', 'Big_Nose', 'Pointy_Nose', 'Bushy_Eyebrows', 'Big_Lips',
    'Oval_Face', 'Chubby', 'Double_Chin', 'Receding_Hairline', 'Narrow_Eyes',
    'High_Cheekbones',
]

# Outcome columns: the `final_*` scores, one per trait letter (o, c, e, a, n).
bigfive = ['final_o', 'final_c', 'final_e', 'final_a', 'final_n']

# Keep only the columns used by the analysis: features first, then traits.
df_selected = df[[*selected_features, *bigfive]]

In [None]:
from causallearn.utils.cit import CIT
import numpy as np
# Pairwise KCI tests: one p-value per (feature, trait) pair.
# Rows follow selected_features, columns follow bigfive.
kci_matrix = np.zeros((len(selected_features), len(bigfive)))
for row, feature in enumerate(selected_features):
    for col, trait in enumerate(bigfive):
        # Drop rows where either variable is missing, for this pair only.
        data = df_selected[[feature, trait]].dropna().values
        # Build a CIT instance from the two-column array and the method name.
        kci_obj = CIT(data, "kci")
        # Test column 0 (the feature) against column 1 (the trait).
        pValue = kci_obj(0, 1)
        print(f"data.shape: {data.shape}. p_val({feature}, {trait}): {pValue}.")
        kci_matrix[row, col] = pValue

In [51]:
# Attach labels to the p-value matrix (rows = features, columns = traits)
# and write it to disk as CSV.
kci_df = pd.DataFrame(kci_matrix, index=selected_features, columns=bigfive)
kci_df.to_csv('LLM_personality_celebA/8_kci_matrix.csv')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Heatmap of the KCI p-values (rows: features, columns: trait scores).
#
# Axis labels are derived from kci_df itself. Hard-coding
# ['O', 'C', 'E', 'A', 'N'] * 3 (15 labels) makes the DataFrame constructor
# raise a shape-mismatch ValueError whenever kci_df does not have exactly
# 15 columns -- e.g. when it was built from the five `final_*` columns.
TRAITS_PER_GROUP = 5
group_labels = ['GPT-4o', 'DeepSeek-V3', 'Gemini-2.0-Flash']
colors = ['blue', 'green', 'red']  # one colour per entry of group_labels

kci_df_plot = pd.DataFrame(
    kci_df.values,
    # Short trait letter taken from the column suffix, e.g. 'final_o' -> 'O'.
    columns=[str(name).rsplit('_', 1)[-1].upper() for name in kci_df.columns],
    # Use the frame's own row labels so that rebinding selected_features
    # elsewhere in the notebook cannot desynchronise labels and data.
    index=kci_df.index,
)

# Plot the heatmap using seaborn.
# With cmap="Blues", larger p-values are drawn darker.
plt.figure(figsize=(20, 8))
heatmap = sns.heatmap(kci_df_plot, annot=True, cmap="Blues", cbar=True, fmt=".3f", annot_kws={"size": 14}, cbar_kws={"pad": 0.02})
ax = heatmap  # sns.heatmap returns the Axes it drew on
plt.xticks(rotation=0, fontsize=14)
plt.yticks(rotation=0, fontsize=14)

n_columns = kci_df_plot.shape[1]
# The per-model overlay (separators, coloured tick labels, group headers) only
# applies when the columns are exactly len(group_labels) consecutive blocks of
# TRAITS_PER_GROUP traits; otherwise a plain heatmap is drawn.
if n_columns == TRAITS_PER_GROUP * len(group_labels):
    # Dashed separators between consecutive groups of columns.
    for boundary in range(TRAITS_PER_GROUP, n_columns, TRAITS_PER_GROUP):
        ax.axvline(boundary, color='black', linewidth=2.5, linestyle='--')

    # Use different colors for each set of column labels.
    for i, label in enumerate(ax.get_xticklabels()):
        label.set_color(colors[i // TRAITS_PER_GROUP])

    # Add shared grouping labels (secondary x-axis).
    ax2 = ax.twiny()
    ax2.set_xlim(ax.get_xlim())

    # Ticks at the centre of each group of columns: 2.5, 7.5, 12.5.
    group_positions = [TRAITS_PER_GROUP * (g + 0.5) for g in range(len(group_labels))]
    ax2.set_xticks(group_positions)
    ax2.set_xticklabels(group_labels, fontsize=16, fontweight='bold')
    for tick, color in zip(ax2.get_xticklabels(), colors):
        tick.set_color(color)
    ax2.tick_params(length=0)

plt.title('KCI P-value Heatmap', fontsize=16, pad=10, fontweight='bold')
plt.tight_layout()
plt.savefig('output/8_kci_matrix.pdf')
plt.show()

In [None]:
# chi-square test
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# NOTE: this cell rebinds `selected_features` and `bigfive`, replacing the
# lists defined earlier in the notebook.
# Feature columns used for the chi-square tests.
selected_features = [
    'Year', 'Month', 'Day', 'Day index',
    'Occupation_idx', 'Gender_idx',
    'Arched_Eyebrows', 'Big_Nose', 'Pointy_Nose', 'Bushy_Eyebrows', 'Big_Lips',
    'Oval_Face', 'Chubby', 'Double_Chin', 'Receding_Hairline', 'Narrow_Eyes',
    'High_Cheekbones',
]

# Trait columns: three blocks of five (o, c, e, a, n), one block per prefix.
bigfive = [
    '4o_o', '4o_c', '4o_e', '4o_a', '4o_n',
    'DS_o', 'DS_c', 'DS_e', 'DS_a', 'DS_n',
    'gemini_o', 'gemini_c', 'gemini_e', 'gemini_a', 'gemini_n',
]

df_selected_chi = df[[*selected_features, *bigfive]]

# One chi-square p-value per (feature, trait) pair; rows follow
# selected_features, columns follow bigfive.
chi_matrix = np.zeros((len(selected_features), len(bigfive)))
for row, feature in enumerate(selected_features):
    for col, trait in enumerate(bigfive):
        # Drop rows where either variable is missing, for this pair only.
        data = df_selected_chi[[feature, trait]].dropna().values
        # Cross-tabulate feature values (column 0) against trait values (column 1).
        contingency = pd.crosstab(data[:, 0], data[:, 1])
        chi2, p, dof, expected = chi2_contingency(contingency)
        chi_matrix[row, col] = p
        print(f"data.shape: {data.shape}. p_val({feature}, {trait}): {p}.")

# Label the matrix and write it to disk as CSV.
chi_df = pd.DataFrame(chi_matrix, index=selected_features, columns=bigfive)
chi_df.to_csv('LLM_personality_celebA/8_chisquare_matrix.csv')

In [None]:
# Preview the chi-square p-value table (at most the first 20 rows).
chi_df.head(20)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Heatmap of the chi-square p-values (rows: features, columns: trait scores).
#
# Axis labels are derived from chi_df itself rather than hard-coded, so the
# plot cannot fail with a shape mismatch if the set of columns changes.
TRAITS_PER_GROUP = 5
group_labels = ['GPT-4o', 'DeepSeek-V3', 'Gemini-2.0-Flash']
colors = ['blue', 'green', 'red']  # one colour per entry of group_labels

chi_df_plot = pd.DataFrame(
    chi_df.values,
    # Short trait letter taken from the column suffix, e.g. 'DS_o' -> 'O'.
    columns=[str(name).rsplit('_', 1)[-1].upper() for name in chi_df.columns],
    # Use the frame's own row labels so labels always match the data rows.
    index=chi_df.index,
)

# Plot the heatmap using seaborn.
# With cmap="Blues", larger p-values are drawn darker.
plt.figure(figsize=(20, 10))
heatmap = sns.heatmap(chi_df_plot, annot=True, cmap="Blues", cbar=True, fmt=".3f", annot_kws={"size": 14}, cbar_kws={"pad": 0.02})
ax = heatmap  # sns.heatmap returns the Axes it drew on
plt.xticks(rotation=0, fontsize=14)
plt.yticks(rotation=0, fontsize=14)

n_columns = chi_df_plot.shape[1]
# The per-model overlay (separators, coloured tick labels, group headers) only
# applies when the columns are exactly len(group_labels) consecutive blocks of
# TRAITS_PER_GROUP traits; otherwise a plain heatmap is drawn.
if n_columns == TRAITS_PER_GROUP * len(group_labels):
    # Dashed separators between consecutive groups of columns.
    for boundary in range(TRAITS_PER_GROUP, n_columns, TRAITS_PER_GROUP):
        ax.axvline(boundary, color='black', linewidth=2.5, linestyle='--')

    # Use different colors for each set of column labels.
    for i, label in enumerate(ax.get_xticklabels()):
        label.set_color(colors[i // TRAITS_PER_GROUP])

    # Add shared grouping labels (secondary x-axis).
    ax2 = ax.twiny()
    ax2.set_xlim(ax.get_xlim())

    # Ticks at the centre of each group of columns: 2.5, 7.5, 12.5.
    group_positions = [TRAITS_PER_GROUP * (g + 0.5) for g in range(len(group_labels))]
    ax2.set_xticks(group_positions)
    ax2.set_xticklabels(group_labels, fontsize=16, fontweight='bold')
    for tick, color in zip(ax2.get_xticklabels(), colors):
        tick.set_color(color)
    ax2.tick_params(length=0)

plt.title('Chi-square P-value Heatmap', fontsize=16, pad=10, fontweight='bold')
plt.tight_layout()
# NOTE(review): the previous target directory 'LLM_personality_/' looked like a
# truncated path; the figure is now saved next to the chi-square CSV in
# 'LLM_personality_celebA/'. Confirm this is the intended location.
plt.savefig('LLM_personality_celebA/8_chi_square_matrix.pdf')
plt.show()