In [None]:
import pandas as pd 
# Read 5_cleaned.csv from the working directory into `data` and show its
# (rows, columns) dimensions as the cell output.
data = pd.read_csv("5_cleaned.csv")
data.shape

In [None]:
# Column groups used throughout the notebook: the five personality score
# columns, and the other attributes each of them is tested against.
personalities = ['final_o', 'final_c', 'final_e', 'final_a', 'final_n']
others = ['Birthyear', 'Birthmonth', 'Birthday', 'Type', 'Latitude', 'Longitude', 'Height', 'Weight']

# Take an explicit copy: later cells assign into subdata['Type'], and doing
# that on a selection of `data` triggers pandas' SettingWithCopyWarning.
# With a copy, `subdata` is unambiguously an independent frame.
subdata = data[personalities + others].copy()
print(subdata.shape)
subdata.head()

In [None]:
# Replace each distinct value of the 'Type' column with an integer label,
# numbered in the order the values are returned by .unique().
unique_types = subdata['Type'].unique()
type_mapping = dict(zip(unique_types, range(len(unique_types))))
subdata['Type'] = subdata['Type'].map(type_mapping)
subdata.head()

In [None]:
# Convert 'Type' to a pandas categorical and keep only its integer codes.
type_as_category = subdata['Type'].astype('category')
subdata['Type'] = type_as_category.cat.codes
subdata.head()

In [18]:
import numpy as np
# Export the selected columns as a NumPy array. `subdata` is deliberately NOT
# rebound to the array: the independence-test cells below select columns by
# name (subdata[[col_a, col_b]]), which only works on the DataFrame, and
# rebinding would also make this cell fail when run a second time.
np.save("Z_all_5_8.npy", subdata.values)

# KCI

In [None]:
from causallearn.utils.cit import CIT
import numpy as np
# Pairwise KCI tests: kci_matrix[i, j] holds the p-value returned for
# personalities[i] versus others[j].
kci_matrix = np.zeros((len(personalities), len(others)))

for i, trait in enumerate(personalities):
    for j, attribute in enumerate(others):
        # Two-column sample with rows containing a missing value removed.
        # A dedicated name is used instead of `data`, so the DataFrame loaded
        # at the top of the notebook is not overwritten and the earlier cells
        # remain re-runnable.
        pair = subdata[[trait, attribute]].dropna().values
        kci_obj = CIT(pair, "kci")
        pValue = kci_obj(0, 1)
        print(f"data shape: {pair.shape}, P Value between {trait} and {attribute}: {pValue}.")
        kci_matrix[i, j] = pValue

In [15]:
import os
# Make sure the output directory exists. exist_ok=True replaces the separate
# existence check and is safe to re-run.
os.makedirs('CIT_output', exist_ok=True)

# Label the KCI p-values (rows: personality columns, columns: other
# attributes) and write them to CSV.
df_kci = pd.DataFrame(kci_matrix, index=personalities, columns=others)
df_kci.to_csv('CIT_output/athlete_kci_matrix.csv')

# RCIT

In [None]:
from causallearn.utils.cit import CIT
import numpy as np
# Pairwise RCIT tests: rcit_matrix[i, j] holds the p-value returned for
# personalities[i] versus others[j], or NaN when the test raised.
rcit_matrix = np.zeros((len(personalities), len(others)))

for i, trait in enumerate(personalities):
    for j, attribute in enumerate(others):
        # Dedicated name instead of `data`, so the DataFrame loaded at the top
        # of the notebook is not overwritten.
        pair = subdata[[trait, attribute]].dropna().values
        try:
            rcit_obj = CIT(pair, "rcit")
            pValue = rcit_obj(0, 1)
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C / kernel
            # interrupt still stops the loop. Record NaN instead of 0: a
            # p-value of 0 would be read as a highly significant result,
            # whereas this pair simply has no result.
            print(f"Error: {type(exc).__name__}: {exc}")
            pValue = np.nan

        print(f"data shape: {pair.shape}, RCIT P Value between {trait} and {attribute}: {pValue}.")
        rcit_matrix[i, j] = pValue

In [42]:
# Label the RCIT p-values (rows: personality columns, columns: other
# attributes) and write them to CSV.
df_cit = pd.DataFrame(data=rcit_matrix, index=personalities, columns=others)
df_cit.to_csv(path_or_buf='CIT_output/athlete_rcit_matrix.csv')

# HSIC

In [None]:
from conditional_independence import hsic_test
import numpy as np
# Pairwise HSIC tests: hsic_matrix[i, j] holds the 'p_value' entry returned
# by hsic_test for personalities[i] versus others[j].
hsic_matrix = np.zeros((len(personalities), len(others)))
for i, trait in enumerate(personalities):
    for j, attribute in enumerate(others):
        # Dedicated name instead of `data`, so the DataFrame loaded at the top
        # of the notebook is not overwritten.
        pair = subdata[[trait, attribute]].dropna().values
        p_value = hsic_test(pair, 0, 1)['p_value']
        print(f"data shape: {pair.shape}, HSIC P Value between {trait} and {attribute}: {p_value}.")
        hsic_matrix[i, j] = p_value

In [44]:
# Label the HSIC p-values (rows: personality columns, columns: other
# attributes) and write them to CSV.
df_hsic = pd.DataFrame(data=hsic_matrix, index=personalities, columns=others)
df_hsic.to_csv(path_or_buf='CIT_output/athlete_hsic_matrix.csv')

# Chi-square

In [None]:
from causallearn.utils.cit import CIT
import numpy as np

# Subset of `others` that the chi-square / G-square cells are run against.
other_discrete = ['Birthyear', 'Birthmonth', 'Birthday', 'Type']

# Pairwise chi-square tests via causal-learn: chisq_matrix[i, j] holds the
# p-value returned for personalities[i] versus other_discrete[j].
chisq_matrix = np.zeros((len(personalities), len(other_discrete)))

for i, trait in enumerate(personalities):
    for j, attribute in enumerate(other_discrete):
        # Dedicated name instead of `data`, so the DataFrame loaded at the top
        # of the notebook is not overwritten.
        pair = subdata[[trait, attribute]].dropna().values
        chisq_obj = CIT(pair, "chisq")
        pValue = chisq_obj(0, 1)
        print(f"data shape: {pair.shape}, Chi-square P Value between {trait} and {attribute}: {pValue}.")
        chisq_matrix[i, j] = pValue

In [46]:
# Label the causal-learn chi-square p-values (rows: personality columns,
# columns: discrete attributes) and write them to CSV.
df_chisq = pd.DataFrame(data=chisq_matrix, index=personalities, columns=other_discrete)
df_chisq.to_csv(path_or_buf='CIT_output/athlete_chisq_matrix_with_causallearn.csv')

In [None]:
import numpy as np
from scipy.stats import chi2_contingency
import pandas as pd

# Pairwise chi-square tests via scipy: chi_matrix[i, j] holds the p-value of
# chi2_contingency on the cross-tabulation of personalities[i] and
# other_discrete[j].
chi_matrix = np.zeros((len(personalities), len(other_discrete)))
for i, trait in enumerate(personalities):
    for j, attribute in enumerate(other_discrete):
        # Dedicated name instead of `data`, so the DataFrame loaded at the top
        # of the notebook is not overwritten.
        pair = subdata[[trait, attribute]].dropna().values
        contingency_table = pd.crosstab(pair[:, 0], pair[:, 1])
        # Only the p-value is used; the statistic, degrees of freedom and
        # expected frequencies are discarded.
        _, p_value, _, _ = chi2_contingency(contingency_table)
        print(f"data shape: {pair.shape}, Chi-square P Value between {trait} and {attribute}: {p_value}.")
        chi_matrix[i, j] = p_value

In [48]:
# Label the scipy chi-square p-values (rows: personality columns, columns:
# discrete attributes) and write them to CSV.
df_chi = pd.DataFrame(data=chi_matrix, index=personalities, columns=other_discrete)
df_chi.to_csv(path_or_buf='CIT_output/athlete_chi_matrix_with_scipy.csv')

# G-square

In [None]:
from causallearn.utils.cit import CIT
import numpy as np
# Pairwise G-square tests via causal-learn: gsq_matrix[i, j] holds the
# p-value returned for personalities[i] versus other_discrete[j].
gsq_matrix = np.zeros((len(personalities), len(other_discrete)))

for i, trait in enumerate(personalities):
    for j, attribute in enumerate(other_discrete):
        # Dedicated name instead of `data`, so the DataFrame loaded at the top
        # of the notebook is not overwritten.
        pair = subdata[[trait, attribute]].dropna().values
        gsq_obj = CIT(pair, "gsq")
        pValue = gsq_obj(0, 1)
        print(f"data shape: {pair.shape}, G-square P Value between {trait} and {attribute}: {pValue}.")
        gsq_matrix[i, j] = pValue

In [50]:
# Label the G-square p-values (rows: personality columns, columns: discrete
# attributes) and write them to CSV.
df_gsq = pd.DataFrame(data=gsq_matrix, index=personalities, columns=other_discrete)
df_gsq.to_csv(path_or_buf='CIT_output/athlete_gsq_matrix.csv')

In [None]:
# Load the KCI p-value matrix from the location the KCI cell wrote it to
# (CIT_output/), so this cell works on a fresh top-to-bottom run.
df = pd.read_csv("CIT_output/athlete_kci_matrix.csv", index_col=0)
matrix = df.values

# Binarize the p-values: 0 where p < threshold, 1 otherwise. Any NaN entry
# compares False and therefore becomes 1.
threshold = 0.05
binary_matrix = np.where(matrix < threshold, 0, 1)
np.save("athlete_kci_matrix_binary.npy", binary_matrix)
binary_matrix