In [1]:
import pandas as pd

In [4]:
# Scaricare i dataset originali

prefix = "data/original_datasets/"

raw_MB = pd.read_csv(prefix + "dataset_Metabody.csv")
raw_ME = pd.read_csv(prefix + "dataset_MetaEducation.csv")
raw_MI = pd.read_csv(prefix + "dataset_MetaImagery.csv")
raw_MM = pd.read_csv(prefix + "dataset_MoveMe.csv")

In [None]:
# Selezionare solo le colonne che ci interessano e rinominarle in modo standard

clean_MB = raw_MB[["Metaphor", "Met_structure", "FamMetabody_Met", "SensMetabody_Met", "BodyMetabody_Met"]]
clean_MB.columns= ["Metaphor", "Met_structure", "FAMILIARITY_human", "MEANINGFULNESS_human", "BODY_RELATEDNESS_human"]
clean_MB.to_csv("data/new_datasets/human_MB.csv", index = False)

clean_ME = raw_ME[["Metaphor", "Met_structure", "FAM(M)MetaEdu_Met", "MEA(M)MetaEdu_Met", "DIFF(M)MetaEdu_Met"]]
clean_ME.columns= ["Metaphor", "Met_structure", "FAMILIARITY_human", "MEANINGFULNESS_human", "DIFFICULTY_human"]
clean_ME.to_csv("data/new_datasets/human_ME.csv", index = False)

clean_MI= raw_MI[["Metaphor", "Met_structure", "PhysMetaIma(M)_Met", "ImaMetaIma(M)_Met"]]
clean_MI.columns= ["Metaphor", "Met_structure", "PHISICALITY_human", "IMAGEABILITY_human"]
clean_MI.to_csv("data/new_datasets/human_MI.csv", index = False)

clean_MM = raw_MM[["Metaphor", "Met_structure", "FamMoveme_Met", "SensMoveme_Met"]]
clean_MM.columns= ["Metaphor", "Met_structure", "FAMILIARITY_human", "MEANINGFULNESS_human"]
clean_MM.to_csv("data/new_datasets/human_MM.csv", index = False)

In [None]:
# Osservare le caratteristiche statistiche dei vari dataset umani

prefix = "data/human_datasets/"

human_MB = pd.read_csv(prefix + "human_MB.csv")
human_values_MB = human_MB[["FAMILIARITY_human", "MEANINGFULNESS_human", "BODY_RELATEDNESS_human"]]
human_values_MB = human_values_MB.replace(",", ".", regex=True)
print("\nHuman_MB:", human_values_MB.describe())

human_ME = pd.read_csv(prefix + "human_ME.csv")
human_values_ME = human_ME[["FAMILIARITY_human", "MEANINGFULNESS_human", "DIFFICULTY_human"]]
human_values_ME = human_values_ME.replace(",", ".", regex=True)
human_values_ME = human_values_ME.astype(float)
print("\nHuman_ME:", human_values_ME.describe())

human_MI = pd.read_csv(prefix + "human_MI.csv")
human_values_MI = human_MI[["PHISICALITY_human", "IMAGEABILITY_human"]]
human_values_MI = human_values_MI.replace(",", ".", regex=True)
human_values_MI = human_values_MI.astype(float)
print("\nHuman_MI:", human_values_MI.describe())

human_MM = pd.read_csv(prefix + "human_MM.csv")
human_values_MM = human_MM[["FAMILIARITY_human", "MEANINGFULNESS_human"]]
human_values_MM = human_values_MM.replace(",", ".", regex=True)
human_values_MM = human_values_MM.astype(float)
print("\nHuman_MM:", human_values_MM.describe())




Human_MB:        FAMILIARITY_human  MEANINGFULNESS_human  BODY_RELATEDNESS_human
count          54.000000             64.000000               64.000000
mean            4.062797              4.908003                4.019181
std             1.231482              0.940674                1.734109
min             1.846154              2.625000                1.346154
25%             3.088942              4.311699                2.354167
50%             4.118590              5.080128                3.807692
75%             5.062500              5.509615                5.673077
max             6.291667              6.384615                6.791667

Human_ME:        FAMILIARITY_human  MEANINGFULNESS_human  DIFFICULTY_human
count          80.000000             80.000000         80.000000
mean            2.813966              3.588370          1.886662
std             0.828771              0.598169          0.403041
min             1.000000              2.333333          1.125000
25%           

In [None]:
# Osservare le caratteristiche statistiche dei vari dataset sintetici per il dato modello

prefix = "data/synthetic_datasets/"
model = "_llama-3.3-70b-versatile_"

synthetic_MB = pd.read_csv(prefix + "synthetic_MB" + model + ".csv")
synthetic_values_MB = synthetic_MB[["annotator", "FAMILIARITY_synthetic", "MEANINGFULNESS_synthetic", "BODY_RELATEDNESS_synthetic"]]
synthetic_values_MB = synthetic_values_MB.replace(",", ".", regex=True)
synthetic_values_MB = synthetic_values_MB.astype(float)
synthetic_values_MB_SINGLE_RUN = synthetic_values_MB[synthetic_values_MB["annotator"] == 1]

print("\nsynthetic_MB:\n", synthetic_values_MB_SINGLE_RUN.describe())


synthetic_MB:
        annotator  FAMILIARITY_synthetic  MEANINGFULNESS_synthetic  \
count       64.0              64.000000                 64.000000   
mean         1.0               3.859375                  5.234375   
std          0.0               0.940570                  0.771356   
min          1.0               2.000000                  3.000000   
25%          1.0               3.000000                  5.000000   
50%          1.0               4.000000                  5.000000   
75%          1.0               4.000000                  6.000000   
max          1.0               6.000000                  7.000000   

       BODY_RELATEDNESS_synthetic  
count                   64.000000  
mean                     3.875000  
std                      2.994704  
min                      1.000000  
25%                      1.000000  
50%                      1.000000  
75%                      7.000000  
max                      7.000000  


In [8]:
# Rintracciare le metafore già usate in altri studi quanto a specifiche dimensioni

df_dict = {"MB" : raw_MB , "ME" : raw_ME, "MI" : raw_MI, "MM" : raw_MM}
used_metaphors_MB = {"FAMILIARITY" : set(), "MEANINGFULNESS" : set(), "BODY_RELATEDNESS" : set()}
used_metaphors_ME = {"FAMILIARITY" : set(), "MEANINGFULNESS" : set(), "DIFFICULTY" : set()}
used_metaphors_MI = {"PHISICALITY" : set(), "IMAGEABILITY" : set()}
used_metaphors_MM = {"FAMILIARITY" : set(), "MEANINGFULNESS" : set()}

for name, df in df_dict.items():

  if name == "MB":

    for idx, row in df.iterrows():

      if row["Bambini et al. (2013)"] == "Y":
        used_metaphors_MB["FAMILIARITY"].add(row["Metaphor"])
        used_metaphors_MB["MEANINGFULNESS"].add(row["Metaphor"])

      if row["Canal et al. (2022)"] == "Y":
        used_metaphors_MB["FAMILIARITY"].add(row["Metaphor"])
 
      if  row["Bambini et al. (2024)"] == "Y":
        used_metaphors_MB["FAMILIARITY"].add(row["Metaphor"])

      if row["Lago et al. (2024)"] == "Y":
        used_metaphors_MB["FAMILIARITY"].add(row["Metaphor"])

    all_metaphors = set(df["Metaphor"])
    used_familiarity_MB = used_metaphors_MB["FAMILIARITY"]
    used_meaningfulness_MB = used_metaphors_MB["MEANINGFULNESS"]
    used_body_relatedness = used_metaphors_MB["BODY_RELATEDNESS"]

    unused_familiarity_MB = all_metaphors - used_familiarity_MB
    unused_meaningfulness_MB = all_metaphors - used_meaningfulness_MB
    unused_body_relatedness_MB = all_metaphors - used_body_relatedness

    print(name + ": \n",
      "unused_familiarity_MB: ", len(unused_familiarity_MB),
      "unused_meaningfulness_MB: ", len(unused_meaningfulness_MB),
      "unused_body_relatedness_MB: ", len(unused_body_relatedness_MB),
      "\n"
    )

  if name == "ME":

    for idx, row in df.iterrows():

      if row["Bambini et al. (2013)"] == "Y":
        used_metaphors_ME["FAMILIARITY"].add(row["Metaphor"])
        used_metaphors_ME["MEANINGFULNESS"].add(row["Metaphor"])
        used_metaphors_ME["DIFFICULTY"].add(row["Metaphor"])

      if row["Canal et al. (2022)"] == "Y":
        used_metaphors_ME["FAMILIARITY"].add(row["Metaphor"])
 
      if  row["Bambini et al. (2024)"] == "Y":
        used_metaphors_ME["FAMILIARITY"].add(row["Metaphor"])
        used_metaphors_ME["DIFFICULTY"].add(row["Metaphor"])

      if row["Lago et al. (2024)"] == "Y":
        used_metaphors_ME["FAMILIARITY"].add(row["Metaphor"])

    all_metaphors = set(df["Metaphor"])
    used_familiarity_ME = used_metaphors_ME["FAMILIARITY"]
    used_meaningfulness_ME = used_metaphors_ME["MEANINGFULNESS"]
    used_difficulty_ME = used_metaphors_ME["DIFFICULTY"]

    unused_familiarity_ME = all_metaphors - used_familiarity_ME
    unused_meaningfulness_ME = all_metaphors - used_meaningfulness_ME
    unused_difficulty_ME = all_metaphors - used_difficulty_ME

    print(name + ": \n",
      "unused_familiarity_ME: ", len(unused_familiarity_ME),
      "unused_meaningfulness_ME: ", len(unused_meaningfulness_ME),
      "unused_difficulty_ME: ", len(unused_difficulty_ME),
      "\n"
    )

  if name == "MI":

    for idx, row in df.iterrows():

      if row["Canal et al. (2022)"] == "Y":
        used_metaphors_MI["PHISICALITY"].add(row["Metaphor"])
 
      if row["Bambini et al. (2024)"] == "Y":
        used_metaphors_MI["IMAGEABILITY"].add(row["Metaphor"])

    all_metaphors = set(df["Metaphor"])
    used_phisicality_MI = used_metaphors_MI["PHISICALITY"]
    used_imageability_MI = used_metaphors_MI["IMAGEABILITY"]

    unused_phisicality_MI = all_metaphors - used_phisicality_MI
    unused_imageability_MI = all_metaphors - used_imageability_MI

    print(name + ": \n",
      "unused_phisicality_MI: ", len(unused_phisicality_MI),
      "unused_imageability_MI: ", len(unused_imageability_MI),
      "\n"
    )

  if name == "MM":

    for idx, row in df.iterrows():

      if row["Bambini et al. (2013)"] == "Y":
        used_metaphors_MM["FAMILIARITY"].add(row["Metaphor"])
        used_metaphors_MM["MEANINGFULNESS"].add(row["Metaphor"])

      if row["Canal et al. (2022)"] == "Y":
        used_metaphors_MM["FAMILIARITY"].add(row["Metaphor"])
 
      if row["Bambini et al. (2024)"] == "Y":
        used_metaphors_MM["FAMILIARITY"].add(row["Metaphor"])

      if row["Lago et al. (2024)"] == "Y":
        used_metaphors_MM["FAMILIARITY"].add(row["Metaphor"])

    all_metaphors = set(df["Metaphor"])
    used_familiarity_MM = used_metaphors_MM["FAMILIARITY"]
    used_meaningfulness_MM = used_metaphors_MM["MEANINGFULNESS"]

    unused_familiarity_MM = all_metaphors - used_familiarity_MM
    unused_meaningfulness_MM = all_metaphors - used_meaningfulness_MM

    print(name + ": \n",
      "unused_familiarity_MM: ", len(unused_familiarity_MM),
      "unused_meaningfulness_MM: ", len(unused_meaningfulness_MM),
      "\n"
    )

MB: 
 unused_familiarity_MB:  25 unused_meaningfulness_MB:  59 unused_body_relatedness_MB:  64 

ME: 
 unused_familiarity_ME:  32 unused_meaningfulness_ME:  38 unused_difficulty_ME:  34 

MI: 
 unused_phisicality_MI:  15 unused_imageability_MI:  40 

MM: 
 unused_familiarity_MM:  60 unused_meaningfulness_MM:  60 



In [10]:
# CORRELAZIONI STATISTICHE

import scipy.stats as stats

# Definizione dei percorsi che portano ai dati + lettura dei dati

prefix = "data/human_datasets/"
human_MB = pd.read_csv(prefix + "human_MB.csv")

prefix = "data/synthetic_datasets/"
model = "_meta-llama-Llama-3.3-70B-Instruct_"
synthetic_MB = pd.read_csv(prefix + "synthetic_MB" + model + ".csv")

# Definizione della funzione che seleziona solo le metafore usate o non usate

def filter(df, metaphors_list):
    
    if "metaphor" in df.columns:
        col_name = "metaphor"
    elif "Metaphor" in df.columns:
        col_name = "Metaphor"
    else:
        raise ValueError("Nessuna colonna 'Metaphor' o 'metaphor' trovata nel dataframe")
    
    out_df = df[df[col_name].isin(metaphors_list)]
    
    return out_df


# Definizione della funzione che processa i valori in modo che siano trattabili per i calcoli statistici

def preparing_4_stat(df):

    if df.columns[0] == "Metaphor":
        output_df = df.iloc[:, 2:]

    else:
        output_df = df.iloc[:, 3:]

    output_df = output_df.replace(",", ".", regex=True)
    output_df = output_df.astype(float)

    return output_df

# MB:

# FAMILIARITY
# Non tutte le metafore di MB erano state giudicate quanto alla dimensione di familiarità, quindi nel calcolare la correlazione tra familiarità umana e familiarità sintetica i dataset vanno epurati da tali metafore

no_familiarity_metaphors_MB = human_MB[human_MB["FAMILIARITY_human"].isna() | (human_MB["FAMILIARITY_human"] == "")]

fixed_human_MB = human_MB.drop(no_familiarity_metaphors_MB.index)
synthetic_MB_SINGLE_RUN = synthetic_MB[synthetic_MB["annotator"] == 1]
fixed_synthetic_MB_SINGLE_RUN = synthetic_MB_SINGLE_RUN.drop(no_familiarity_metaphors_MB.index)
synthetic_MB_AGGREGATED = (synthetic_MB.groupby(["annotator", "metaphor", "metaphor_structure"], as_index=False)[["FAMILIARITY_synthetic", "MEANINGFULNESS_synthetic", "BODY_RELATEDNESS_synthetic"]].mean())
fixed_synthetic_MB_AGGREGATED = synthetic_MB_AGGREGATED.drop(no_familiarity_metaphors_MB.index)


# Definire le maschere USED e UNUSED prima di togliere la colonna "Metaphor" applicando la funzione preparing_4_stat()
#USED_MASK_FAMILIARITY = fixed_human_MB["Metaphor"].isin(used_familiarity_MB)
#UNUSED_MASK_FAMILIARITY = fixed_human_MB["Metaphor"].isin(unused_familiarity_MB)

# USED FAMILIARITY
USED_fixed_human_values_MB = preparing_4_stat(filter(fixed_human_MB, used_familiarity_MB))
USED_fixed_synthetic_values_MB_SINGLE_RUN = preparing_4_stat(filter(fixed_synthetic_MB_SINGLE_RUN, used_familiarity_MB))
USED_fixed_synthetic_values_MB_AGGREGATED = preparing_4_stat(filter(fixed_synthetic_MB_AGGREGATED, used_familiarity_MB))

# human vs synthetic SINGLE_RUN
rho, p_value = stats.spearmanr(USED_fixed_human_values_MB["FAMILIARITY_human"], USED_fixed_synthetic_values_MB_SINGLE_RUN["FAMILIARITY_synthetic"])
print("\nUSED FAMILIARITY - SINGLE_RUN:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO")
print("\n")

# human vs synthetic AGGREGATED
rho, p_value = stats.spearmanr(USED_fixed_human_values_MB["FAMILIARITY_human"], USED_fixed_synthetic_values_MB_AGGREGATED["FAMILIARITY_synthetic"])
print("\nUSED FAMILIARITY - AGGREGATED:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO")
print("\n")

# UNUSED FAMILIARITY
UNUSED_fixed_human_values_MB = fixed_human_values_MB.loc[UNUSED_MASK_FAMILIARITY]
UNUSED_fixed_synthetic_values_MB_SINGLE_RUN = fixed_synthetic_values_MB_SINGLE_RUN.loc[UNUSED_MASK_FAMILIARITY]
UNUSED_fixed_synthetic_values_MB_AGGREGATED = fixed_synthetic_values_MB_AGGREGATED.loc[UNUSED_MASK_FAMILIARITY]

# human vs synthetic SINGLE_RUN
rho, p_value = stats.spearmanr(UNUSED_fixed_human_values_MB["FAMILIARITY_human"], UNUSED_fixed_synthetic_values_MB_SINGLE_RUN["FAMILIARITY_synthetic"])
print("\nUNUSED FAMILIARITY - SINGLE_RUN:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# human vs synthetic AGGREGATED
rho, p_value = stats.spearmanr(UNUSED_fixed_human_values_MB["FAMILIARITY_human"], UNUSED_fixed_synthetic_values_MB_AGGREGATED["FAMILIARITY_synthetic"])
print("\nUNUSED FAMILIARITY - AGGREGATED:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# MEANINGFULNESS e BODY_RELATEDNESS:

# Definire le maschere USED e UNUSED prima di togliere la colonna "Metaphor" applicando la funzione preparing_4_stat()
USED_MASK_MEANINGFULNESS = human_MB["Metaphor"].isin(used_meaningfulness_MB)
UNUSED_MASK_MEANINGFULNESS = human_MB["Metaphor"].isin(unused_meaningfulness_MB)

USED_MASK_BODY_RELATEDNESS = human_MB["Metaphor"].isin(used_meaningfulness_MB)
UNUSED_MASK_BODY_RELAREDNESS = human_MB["Metaphor"].isin(unused_meaningfulness_MB)

human_values_MB = preparing_4_stat(human_MB)

synthetic_MB_SINGLE_RUN = synthetic_MB[synthetic_MB["annotator"] == 1]
synthetic_values_MB_SINGLE_RUN = preparing_4_stat(synthetic_MB_SINGLE_RUN)

synthetic_MB_AGGREGATED = (synthetic_MB.groupby(["annotator", "metaphor", "metaphor_structure"], as_index=False)[["FAMILIARITY_synthetic", "MEANINGFULNESS_synthetic", "BODY_RELATEDNESS_synthetic"]].mean())
synthetic_values_MB_AGGREGATED = preparing_4_stat(synthetic_MB_AGGREGATED)

# USED MEANINGFULNESS

USED_human_values_MB = human_values_MB.loc[USED_MASK_MEANINGFULNESS]
USED_synthetic_values_MB_SINGLE_RUN = synthetic_values_MB_SINGLE_RUN.loc[USED_MASK_MEANINGFULNESS]
USED_synthetic_values_MB_AGGREGATED = synthetic_values_MB_AGGREGATED.loc[USED_MASK_MEANINGFULNESS]

# human vs synthetic SINGLE_RUN
rho, p_value = stats.spearmanr(USED_human_values_MB["MEANINGFULNESS_human"], USED_synthetic_values_MB_SINGLE_RUN["MEANINGFULNESS_synthetic"])
print("\nUSED MEANINGFULNESS - SINGLE_RUN:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# human vs synthetic AGGREGATED
rho, p_value = stats.spearmanr(USED_human_values_MB["MEANINGFULNESS_human"], USED_synthetic_values_MB_AGGREGATED["MEANINGFULNESS_synthetic"])
print("\nUSED MEANINGFULNESS - AGGREGATED:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# UNUSED MEANINGFULNESS

UNUSED_human_values_MB = human_values_MB.loc[UNUSED_MASK_MEANINGFULNESS]
UNUSED_synthetic_values_MB_SINGLE_RUN = synthetic_values_MB_SINGLE_RUN.loc[UNUSED_MASK_MEANINGFULNESS]
UNUSED_synthetic_values_AGGREGATED = synthetic_values_MB_AGGREGATED.loc[UNUSED_MASK_MEANINGFULNESS]

# human vs synthetic SINGLE RUN
rho, p_value = stats.spearmanr(UNUSED_human_values_MB["MEANINGFULNESS_human"], UNUSED_synthetic_values_MB_SINGLE_RUN["MEANINGFULNESS_synthetic"])
print("\nUNUSED MEANINGFULNESS - SINGLE_RUN:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# human vs synthetic AGGREGATED
rho, p_value = stats.spearmanr(UNUSED_human_values_MB["MEANINGFULNESS_human"], UNUSED_synthetic_values_AGGREGATED["MEANINGFULNESS_synthetic"])
print("\nUNUSED MEANINGFULNESS - AGGREGATED:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# USED BODY_RELATEDNESS

USED_human_values_MB = human_values_MB.loc[USED_MASK_BODY_RELATEDNESS]
USED_synthetic_values_MB_SINGLE_RUN = synthetic_values_MB_SINGLE_RUN.loc[USED_MASK_BODY_RELATEDNESS]
USED_synthetic_values_MB_AGGREGATED = synthetic_values_MB_AGGREGATED.loc[USED_MASK_BODY_RELATEDNESS]

# human vs synthetic SINGLE RUN
rho, p_value = stats.spearmanr(USED_human_values_MB["BODY_RELATEDNESS_human"], USED_synthetic_values_MB_SINGLE_RUN["BODY_RELATEDNESS_synthetic"])
print("\nUSED BODY_RELATEDNESS - SINGLE_RUN:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# human vs synthetic AGGREGATED
rho, p_value = stats.spearmanr(USED_human_values_MB["BODY_RELATEDNESS_human"], USED_synthetic_values_MB_AGGREGATED["BODY_RELATEDNESS_synthetic"])
print("\nUSED BODY_RELATEDNESS - AGGREGATED:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

#UNUSED BODY_RELATEDNESS
UNUSED_human_values_MB = human_values_MB.loc[UNUSED_MASK_BODY_RELAREDNESS]
UNUSED_synthetic_values_MB_SINGLE_RUN = synthetic_values_MB_SINGLE_RUN.loc[UNUSED_MASK_BODY_RELAREDNESS]
UNUSED_synthetic_values_MB_AGGREGATED = synthetic_values_MB_AGGREGATED.loc[UNUSED_MASK_BODY_RELAREDNESS]

# human vs synthetic SINGLE RUN
rho, p_value = stats.spearmanr(UNUSED_human_values_MB["BODY_RELATEDNESS_human"], UNUSED_synthetic_values_MB_SINGLE_RUN["BODY_RELATEDNESS_synthetic"])
print("\nUNUSED BODY_RELATEDNESS - SINGLE RUN:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")

# human vs synthetic AGGREGATED
rho, p_value = stats.spearmanr(UNUSED_human_values_MB["BODY_RELATEDNESS_human"], UNUSED_synthetic_values_MB_AGGREGATED["BODY_RELATEDNESS_synthetic"])
print("\nUNUSED BODY_RELATEDNESS - AGGREGATED:")
print("Spearman correlation coefficient:", rho, "\np-value: ", p_value)
if p_value > 0.05:
    print("p-value TROPPO ALTO", "\n\n")
print("\n")



USED FAMILIARITY - SINGLE_RUN:
Spearman correlation coefficient: 0.39400027541521054 
p-value:  0.02829794128323047




ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 31 and the array at index 1 has size 384

In [2]:
# Concatenare pezzi di dataset sintetici

prefix = "data/synthetic_datasets/"
model = "_meta-llama-Llama-3.3-70B-Instruct_"
synthetic_1 = pd.read_csv(prefix + "synthetic_MM" + model + str(1) + ".csv")
synthetic_2 = pd.read_csv(prefix + "synthetic_MM" + model + str(2) + ".csv")
synthetic = pd.concat([synthetic_1, synthetic_2])
synthetic.to_csv(prefix + "synthetic_MM" + model + ".csv", index = False)

In [23]:
results = pd.read_csv("spearman_results.csv")
results_single_run = results[["dataset","dimension","n_annot1","spearman_r_annot1","pval_annot1"]]
results_mean = results[["dataset","dimension","n_mean","spearman_r_mean","pval_mean"]]

In [24]:
results_single_run_sorted = results_single_run.sort_values(by= "spearman_r_annot1", ascending=False)
results_single_run_sorted

Unnamed: 0,dataset,dimension,n_annot1,spearman_r_annot1,pval_annot1
2,MB,BODY_RELATEDNESS,64,0.838605,5.2639610000000004e-18
6,MI,PHISICALITY,15,0.665985,0.006717716
7,MI,IMAGEABILITY,40,0.473239,0.002041907
3,ME,FAMILIARITY,32,0.470756,0.006543102
1,MB,MEANINGFULNESS,59,0.24178,0.06505353
8,MM,FAMILIARITY,60,0.23375,0.07225226
0,MB,FAMILIARITY,23,0.198253,0.3645056
5,ME,DIFFICULTY,34,0.131554,0.4583147
9,MM,MEANINGFULNESS,60,0.026735,0.8393181
4,ME,MEANINGFULNESS,38,-0.010652,0.9493889


In [25]:
results_mean_sorted = results_mean.sort_values(by= "spearman_r_mean", ascending=False)
results_mean_sorted


Unnamed: 0,dataset,dimension,n_mean,spearman_r_mean,pval_mean
2,MB,BODY_RELATEDNESS,64,0.817073,1.81797e-16
6,MI,PHISICALITY,15,0.586025,0.02168962
7,MI,IMAGEABILITY,40,0.494551,0.001179441
3,ME,FAMILIARITY,32,0.469869,0.006659215
1,MB,MEANINGFULNESS,59,0.314825,0.01515808
0,MB,FAMILIARITY,23,0.251302,0.2474061
8,MM,FAMILIARITY,60,0.247882,0.05618279
5,ME,DIFFICULTY,34,0.219665,0.2119364
9,MM,MEANINGFULNESS,60,0.177172,0.1756607
4,ME,MEANINGFULNESS,38,0.091655,0.5841871
