# Creating record set curation file

- Fields in record sets are annotated manually; the work is shared between multiple people.
- The curation was collected in a Google spreadsheet table.
- The table has columns that make annotation easier for the curators, and it also provides extra annotation for croissant ingestion.

## Columns

- `dataset_name` - name of the dataset for curators.
- `field_id` - identifier of the field for croissant.
- `column_name` - just the label of the field for curators.
- `column_description` - description of the field, written by the curators.
- `foreign_key` - curators add the `field_id` of the foreign field.
- `bioregistry_prefix` - if the data in a column comes from a database registered in Bioregistry, curators annotate it with the corresponding prefix.
- `example` - an example value that helps curators.

## Process

1. Fetch the curation table from Google.
2. Compose the description.
3. Iterate over the curated columns (rows of the table) and build the output.
4. Save the curation as JSON.

In [1]:
import pandas as pd
import json
# gid of the sheet holding the 26.03 curation within the spreadsheet file:
gid = '328767591'

# TSV export of the curation of all columns from all OpenTargets output datasets:
curation = f'https://docs.google.com/spreadsheets/d/132SKHMoaJePu4nTlBnQwfaz3dhfJiKmJUujfYkzXMdI/export?format=tsv&gid={gid}'
# A local file can be read instead of the spreadsheet export:
# curation = 'annotated_new_evidence_columns.csv'

# Folder to save the resulting curation file:
asset_folder = '../src/ot_croissant/assets/'

# Columns that are cast explicitly to the pandas string dtype:
string_columns = {
    'column_description': pd.StringDtype(),
    'foreign_key': pd.StringDtype(),
}

# Reading the tab-separated table, then ordering it by dataset and field:
raw_curation = pd.read_csv(curation, sep='\t')
curation_table = raw_curation.astype(string_columns).sort_values(
    ['dataset_name', 'field_id']
)
curation_table.head()

# Number of curated fields:
print(len(curation_table))

1256


In [2]:
# Collection of curated fields: one dictionary per described field is appended
# below, and the whole list is written to recordset.json:
curation_json = []

# Composing description:
def compose_description(row: pd.Series) -> str:
    """
    Build the description of a field from one row of the curation table.

    When the row carries a bioregistry prefix, the lower-cased prefix is appended
    to the column description as a ``[bioregistry:<prefix>]`` tag. Otherwise the
    column description is returned unchanged.

    Args:
        row (pd.Series): A row from the curation table; the `column_description`
            and `bioregistry_prefix` entries are read.

    Returns:
        str: The composed description.
    """
    prefix = row['bioregistry_prefix']

    # No prefix annotated: the curated description is used as is.
    if pd.isna(prefix):
        return row['column_description']

    return f"{row['column_description']} [bioregistry:{prefix.lower()}]"

# Only fields that have a description are exported:
described_fields = curation_table[curation_table['column_description'].notna()]

# Building one curation record per described field:
for _, field in described_fields.iterrows():
    record = {
        'id': field['field_id'],
        'description': compose_description(field)
    }

    # The foreign key is optional, it is only stored when annotated:
    foreign_key = field['foreign_key']
    if pd.notna(foreign_key):
        record['foreign_key'] = foreign_key

    # Collecting the finished record:
    curation_json.append(record)

# Writing all records to the recordset asset file:
with open(f'{asset_folder}/recordset.json', 'w') as output_file:
    json.dump(curation_json, output_file, indent=2)


In [10]:
# Rows of the curation table where a bioregistry prefix is annotated:
curation_table[curation_table['bioregistry_prefix'].notna()]

Unnamed: 0,dataset_name,field_id,column_name,column_description,foreign_key,bioregistry_prefix
31,biosample,biosample/biosampleId,biosampleId,Unique identifier for the biosample,,UBERON
140,disease_phenotype,disease_phenotype/evidence/references,references,References or citations supporting the evidence,,pubmed
213,evidence_cancer_biomarkers,evidence_cancer_biomarkers/drugId,drugId,Open Targets drug identifier,drug_molecule/id,CHEMBL
252,evidence_chembl,evidence_chembl/drugId,drugId,Open Targets drug identifier,drug_molecule/id,CHEMBL
388,evidence_eva,evidence_eva/variantFunctionalConsequenceId,variantFunctionalConsequenceId,Sequence ontology (SO) identifier of the funct...,so/id,SO
416,evidence_eva_somatic,evidence_eva_somatic/variantFunctionalConseque...,variantFunctionalConsequenceId,Sequence ontology (SO) identifier of the funct...,so/id,SO
459,evidence_gene2phenotype,evidence_gene2phenotype/variantFunctionalConse...,variantFunctionalConsequenceId,Sequence ontology (SO) identifier of the funct...,so/id,SO
462,evidence_gene_burden,evidence_gene_burden/ancestryId,ancestryId,Identifier of the ancestry in the HANCESTRO on...,,HANCESTRO
554,evidence_impc,evidence_impc/targetInModelEnsemblId,targetInModelEnsemblId,Target Ensembl ID in animal model,,ENSEMBL
555,evidence_impc,evidence_impc/targetInModelMgiId,targetInModelMgiId,Target MGI ID in animal model,,MGI


In [6]:
# Rows of the curation table where the description is still missing:
curation_table[curation_table['column_description'].isna()]

Unnamed: 0,dataset_name,field_id,column_name,column_description,foreign_key,bioregistry_prefix,Example
67,expression,expression/tissues/protein/reliability,reliability,,,,
71,expression,expression/tissues/protein/cell_type/reliability,reliability,,,,
72,expression,expression/tissues/protein/cell_type/level,level,,,,


# Adding tags to distributions


Create the distribution JSON based on the curation table.

In [None]:
import pandas as pd
import json


# gid of the sheet holding the 26.03 distribution curation within the spreadsheet file:
gid = '194385091'

# TSV export of the distribution curation (columns: id, nice_name, key, description, tags):
curation = f'https://docs.google.com/spreadsheets/d/17uyBBAu62ugRApNftu8hILuv_Qhh6_VfOUj91Tu-_R0/export?format=tsv&gid={gid}'
# A local file can be read instead of the spreadsheet export:
# curation = 'annotated_new_evidence_columns.csv'
# Folder to save the resulting curation file:
asset_folder = '../src/ot_croissant/assets/'


def _split_pipe(cell: str) -> list:
    """Split a pipe-separated cell into a list of values; an empty cell gives []."""
    return cell.split('|') if cell else []


# Reading table:
# Both multi-value columns are pipe-separated. Missing cells are replaced by an
# empty string first, so they become empty lists instead of NaN: NaN would be
# written by json.dump as a bare `NaN`, which is not valid JSON.
curation_table = (
    pd.read_csv(curation, sep='\t')
    .assign(
        tags=lambda df: df['tags'].fillna('').apply(_split_pipe),
        key=lambda df: df['key'].fillna('').apply(_split_pipe),
    )
    .sort_values(['id'])
)
curation_table.head()

Unnamed: 0,id,nice_name,key,description,tags
0,association_by_datasource_direct,Associations - direct (by data source),"[association_by_datasource_direct/diseaseId, a...",Data source-specific metrics for direct target...,[Target-Disease]
1,association_by_datasource_indirect,Associations - indirect (by data source),"[association_by_datasource_indirect/diseaseId,...",Data source-specific metrics for direct target...,[Target-Disease]
2,association_by_datatype_direct,Associations - direct (by data type),"[association_by_datatype_direct/diseaseId, ass...",Data type-specific metrics for direct target-d...,[Target-Disease]
3,association_by_datatype_indirect,Associations - indirect (by data type),"[association_by_datatype_indirect/diseaseId, a...",Data type-specific metrics for direct target-d...,[Target-Disease]
4,association_overall_direct,Associations - direct (overall score),"[association_overall_direct/diseaseId, associa...",Overall metrics for direct target-disease asso...,[Target-Disease]


In [15]:
# Columns written to the distribution file:
columns = ["id", "nice_name", "key", "tags", "description"]

# One dictionary per row of the curation table, restricted to the selected columns:
distribution_records = curation_table[columns].to_dict(orient='records')

# Saving the distributions to a JSON file:
with open(f'{asset_folder}/distribution.json', 'w') as output_file:
    json.dump(distribution_records, output_file, indent=2)