# LiteralEvaluation - Synthetic Dataset Experiments: Tables

This notebook creates the LaTeX tables showing the quality of model predictions on the Semi-Synthetic Dataset.

This notebook uses the scores (triple likelihoods) that the models predicted for the test triples in each of the `NUM_RUNS` runs, stored in `data/saved_models/saved_models_run_[run number]`. The logs produced during all our experiments (over 3 runs) are included for completeness.

In [6]:
import os
import pandas as pd
import json

# Locations of the per-run model outputs and of the generated LaTeX table.
SAVED_MODELS_PATH = '../data/saved_models'
OUT_FILE = '../data/tex/synthetic_tab.tex'
NUM_RUNS = 3

# Lookup from rank-file prefix to the model name printed in the table.
# The file is opened in a `with` block so the handle is closed deterministically.
with open('../data/tex/constants.json') as constants_file:
    run_name_2_model_name = json.load(constants_file)['rank-file_2_name']

# should not be modified
dataset_name = 'Synthetic'
value_relation = '/m/is_a'
category_a = '/m/high'
category_b = '/m/low'

## Adjust file names
The KGA rank files have been saved with a different naming convention. We need to adjust the file names to match the other models.

In [41]:
# Bring the KGA rank files in every run directory to the naming convention used
# by the other models: ranks_test_evaluation_<model>_Synthetic_<feature type>_train.tsv
for run_i in range(1, NUM_RUNS + 1):
    # Directory holding the saved outputs of this run
    directory = os.path.join(SAVED_MODELS_PATH, f'saved_models_run_{run_i}')

    # A missing run is reported and skipped (the table cell tolerates missing runs, too)
    if not os.path.isdir(directory):
        print(f'Run directory does not exist: {directory}')
        continue

    for filename in os.listdir(directory):
        # Only KGA rank files of the Synthetic dataset need to be renamed
        if 'QHC_5' not in filename or 'Synthetic' not in filename:
            continue

        model = 'kgadistmult' if 'distmult' in filename else 'kgatucker'
        feature_type = 'rand' if 'rand' in filename else 'numerical_literals'
        new_filename = f'ranks_test_evaluation_{model}_Synthetic_{feature_type}_train.tsv'

        old_file = os.path.join(directory, filename)
        new_file = os.path.join(directory, new_filename)

        # os.rename may silently replace an existing target; never clobber results
        if os.path.exists(new_file):
            print(f'Target already exists, skipping: {new_file}')
            continue

        # Rename the file
        os.rename(old_file, new_file)

print("Files have been renamed successfully.")

Files have been renamed successfully.


## Read in the Source Dataset (Synthetic Dataset) 

The source dataset contains the true triples which are required to compute the metrics. 

In [4]:
# Ground-truth literal values used to assign each entity to one of the two
# categories (the file should only contain one literal relation type).
df_literals = pd.read_csv(
    f'../data/{dataset_name}/literals/numerical_literals.txt',
    sep='\t',
    header=None,
    names=["head", "relation", "literal"],
)

# Entities labelled "human" in the class mapping file.
class_mapping_df = pd.read_csv('../data/FB15k-237_class_mapping.csv', sep=';', header=0)
human_uris = class_mapping_df.loc[class_mapping_df["class_label"] == "human", "dataset_entity"]

# Display the literal rows that belong to human entities.
df_literals.loc[df_literals['head'].isin(human_uris.to_list())]

Unnamed: 0,head,relation,literal
3,/m/01sl1q,/m/has_value,0.046682
6,/m/07nznf,/m/has_value,0.375532
8,/m/0q9kd,/m/has_value,0.130452
15,/m/04bdxl,/m/has_value,0.718915
17,/m/079vf,/m/has_value,0.296804
...,...,...,...
13503,/m/09f5pp,/m/has_value,0.711507
13647,/m/0glyyw,/m/has_value,0.024703
13704,/m/06rkl,/m/has_value,0.616274
13717,/m/02p59ry,/m/has_value,0.862526


## Evaluate the models based on their predictions on the Synthetic dataset

In [5]:
def get_scores_for_certain_tail(tail_entity, df_results, literals=None, human_entities=None):
    """Return the scores a model predicted for triples ending in `tail_entity`.

    The literal rows of the human entities are inner-joined (on 'head') with the
    rows of `df_results` whose 'tail' equals `tail_entity`.

    Parameters
    ----------
    tail_entity : str
        Tail entity whose scores are selected.
    df_results : pandas.DataFrame
        Model predictions; must contain the columns 'head', 'tail' and 'score'.
    literals : pandas.DataFrame, optional
        Literal table with a 'head' column. Defaults to the notebook-level
        `df_literals`.
    human_entities : iterable, optional
        Entities to keep. Defaults to the notebook-level `human_uris`.

    Returns
    -------
    pandas.DataFrame
        The columns of `literals` plus a column named 'score <tail_entity>'.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if literals is None:
        literals = df_literals
    if human_entities is None:
        human_entities = human_uris

    human_literals = literals[literals['head'].isin(list(human_entities))]
    tail_scores = df_results[df_results["tail"] == tail_entity][['head', 'score']]

    related_scores = pd.merge(human_literals, tail_scores, left_on='head', right_on='head')
    return related_scores.rename(columns={'score': f'score {tail_entity}'})

In [70]:

# LaTeX preamble of the table. A raw string is used so that every backslash is
# taken literally; this avoids invalid escape sequences such as "\s" or "\c".
head = r"""
\begin{table}[]

\setlength{\tabcolsep}{6pt}
\renewcommand{\arraystretch}{1.1}

\caption{Scores achieved on the synthetic dataset. Acc$_{org}$ denotes the Acc score achieved on the synthetic dataset when we provide the meaningful synthetic literal values, whereas Acc$_{rand}$ denotes the Acc score on the synthetic dataset if we apply the random feature ablation after the dataset creation.}
\label{tab:synthetic}
\begin{center}

\begin{tabular}{l|l|l}
\hline
\bf{Model} & \bf{Acc$_{org}$}                            &  \bf{Acc$_{rand}$}                            \\ \hline
"""

# LaTeX closing part of the table (raw string for the same reason).
tail = r"""
\hline
\end{tabular}

\end{center}
\end{table}
"""

def _accuracy_on_human_entities(df_results):
    """Return the share of human entities whose category is predicted correctly.

    An entity counts as correct when its literal is above 0.5 and the model scores
    the `category_a` tail strictly higher than the `category_b` tail, or when its
    literal is at most 0.5 and `category_b` is scored strictly higher. Ties count
    as wrong predictions.

    Returns None when no entity has a score for both categories.
    """
    a = get_scores_for_certain_tail(category_a, df_results)
    b = get_scores_for_certain_tail(category_b, df_results)
    # Merges on all shared columns, keeping entities scored for both categories
    merged = pd.merge(a, b)

    if len(merged) == 0:
        return None

    score_a = merged[f'score {category_a}']
    score_b = merged[f'score {category_b}']

    true_high = ((merged['literal'] > 0.5) & (score_a > score_b)).sum()
    true_low = ((merged['literal'] <= 0.5) & (score_a < score_b)).sum()

    return (true_high + true_low) / len(merged)


# The `with` block guarantees the table file is closed even if a run fails midway.
with open(OUT_FILE, 'w') as out:

    out.write(head)

    for run_name in ['ranks_test_evaluation_DistMult_0.2_0.0_literal', 'ranks_test_evaluation_ComplEx_0.2_0.0_literal', 'ranks_test_evaluation_KBLN_0.2_0.0_literal', 'ranks_test_evaluation_MTKGNN_0.2_0.0_literal', 'ranks_test_evaluation_TransEA_0.3', 'ranks_test_evaluation_kgatucker', 'ranks_test_evaluation_kgadistmult']:

        # First column: model name as printed in the table
        line_string = f'{run_name_2_model_name[run_name]}'

        # One table column per literal setting (original values, random ablation)
        for literal_type in ['numerical_literals', 'rand']:

            acc_scores = []

            for i in range(1, NUM_RUNS + 1):

                result_file = os.path.join(SAVED_MODELS_PATH ,f'saved_models_run_{i}/{run_name}_Synthetic_{literal_type}_train.tsv')

                if not os.path.exists(result_file):
                    print(f'Run {i} does not exist')
                    print(result_file)
                    continue

                df_results = pd.read_csv(result_file, sep='\t', header=None,  names=["head", "relation", "tail", "rank head", "rank tail", "score"])
                # only value_relation triples are relevant for the evaluation
                df_results = df_results[df_results['relation']==value_relation]

                acc = _accuracy_on_human_entities(df_results)
                if acc is None:
                    print(f'No results for {result_file}')
                    continue

                acc_scores.append(acc)

            if not acc_scores:
                # Make the otherwise silent "nan" entry in the table visible to the user
                print(f'No usable runs for {run_name} ({literal_type}); the table cell will be nan')

            # Explicit dtype keeps mean/std well-defined (nan) for an empty list
            acc_series = pd.Series(acc_scores, dtype=float)
            acc_scores_mean, acc_scores_std = round(acc_series.mean(), 3), round(acc_series.std(), 3)

            # Raw f-string: the LaTeX backslashes must not be read as escape sequences
            line_string += rf' & ${acc_scores_mean:.3f} {{\scriptstyle \pm {acc_scores_std:.3f}}}$'

        out.write(line_string + '\\\\ \n')

    out.write(tail)