In [1]:
import yaml
import pandas as pd

In [2]:
def create_df_from_yaml(yaml_file_path, output_file_path):
    """
    Load a multi-document YAML file, flatten it into a term table, and save it as TSV.

    Every YAML document is expected to be a mapping holding an 'extracted_object'
    mapping with a 'label' and a sequence of 'terms'. One row is emitted per term;
    the term at 0-based position i gets the score 1 / (i + 1), i.e. 1.0, 0.5,
    0.333..., and so on. Documents that are empty, are not mappings, or have no
    usable 'extracted_object' / 'terms' entry are skipped rather than raising.

    Parameters:
    yaml_file_path (str): The path to the YAML file.
    output_file_path (str): The path where the TSV file will be saved.

    Returns:
    pd.DataFrame: The DataFrame created from the YAML data, with columns
    'label', 'term', and 'score'. The columns (and the TSV header) are present
    even when no rows were extracted.
    """
    columns = ['label', 'term', 'score']

    # Load every document of the YAML stream up front so the file can be closed.
    with open(yaml_file_path, 'r', encoding='utf-8') as file:
        documents = list(yaml.safe_load_all(file))

    # Prepare data for DataFrame
    data = []
    for document in documents:
        # An empty document (e.g. a stray '---') parses to None, and scalar or
        # list documents have no .get(); skip them instead of crashing.
        if not isinstance(document, dict):
            continue
        extracted_object = document.get("extracted_object")
        if not isinstance(extracted_object, dict):
            continue
        terms = extracted_object.get('terms')
        if not terms:
            continue
        label = extracted_object.get('label')
        # Reciprocal of the 1-based position: first term scores 1.0, second 0.5, ...
        for rank, term in enumerate(terms, start=1):
            data.append({'label': label, 'term': term, 'score': 1 / rank})

    # Passing the column list keeps the schema stable when `data` is empty.
    df = pd.DataFrame(data, columns=columns)

    # Save DataFrame to TSV
    df.to_csv(output_file_path, sep='\t', index=False)

    return df


In [3]:
# Example usage: flatten the sample YAML file into a TSV table.
yaml_file_path = '../en_test_fix.yaml'  # input path, relative to the working directory
output_file_path = 'output_file.tsv'  # output path, relative to the working directory
df = create_df_from_yaml(
    yaml_file_path=yaml_file_path,
    output_file_path=output_file_path,
)

In [4]:
# Bare last expression: the notebook renders the DataFrame built in the previous cell.
df

Unnamed: 0,label,term,score
0,PMID_9101303_nan-prompt.txt,Neurofibromatosis Type 1,1.000000
1,PMID_9101303_nan-prompt.txt,OMIM:611431,0.500000
2,PMID_9101303_nan-prompt.txt,OMIM:163950,0.333333
3,PMID_9101303_nan-prompt.txt,OMIM:174800,0.250000
4,PMID_9101303_nan-prompt.txt,OMIM:218040,0.200000
...,...,...,...
273,PMID_9312167_BPMID_9312167_B:I:4IPMID_9312167_...,OMIM:109270,1.000000
274,PMID_9312167_BPMID_9312167_B:I:4IPMID_9312167_...,Primary Hyperparathyroidism,0.500000
275,PMID_9312167_BPMID_9312167_B:I:4IPMID_9312167_...,Dent's Disease,0.333333
276,PMID_9312167_BPMID_9312167_B:I:4IPMID_9312167_...,Bartter Syndrome,0.250000
