# Creating record set curation file

- Fields in recordsets are annotated as a manual effort done by multiple people.
- Curation was collected in a Google spreadsheet table.
- The table has columns that make annotation easier for the curators, and it also provides extra annotation for croissant ingestion.

## Columns

- `dataset_name` - name of the dataset for curators.
- `field_id` - identifier of the field for croissant.
- `column_name` - just the label of the field for curators.
- `column_description` - curators annotate fields with a description.
- `foreign_key` - curators add `field_id` of the foreign field
- `bioregistry_prefix` - if the data in a column comes from a database registered in Bioregistry, curators annotate the field with the corresponding prefix.
- `example` - an example value of the field that helps curators.

## Process

1. Fetch the curation table from Google.
2. Compose the description of each field.
3. Iterate over the columns and build the output.
4. Save the curation as JSON.

In [3]:
import pandas as pd
import json

# Google spreadsheet (TSV export) holding the curation of all columns from all OpenTargets output datasets:
curation = 'https://docs.google.com/spreadsheets/d/132SKHMoaJePu4nTlBnQwfaz3dhfJiKmJUujfYkzXMdI/export?format=tsv&gid=179018892'

# Destination folder of the generated curation file:
asset_folder = '../src/ot_croissant/assets/'

# Loading the sheet; the description and foreign key columns are cast to the pandas string dtype:
curation_table = pd.read_csv(curation, sep='\t').astype(
    {
        column: pd.StringDtype()
        for column in ('column_description', 'foreign_key')
    }
)
curation_table.head()

Unnamed: 0,dataset_name,field_id,column_name,column_description,foreign_key,bioregistry_prefix,Example
0,disease_phenotype,disease_phenotype/disease,disease,Disease identifier,disease/id,,MONDO_0800026
1,disease_phenotype,disease_phenotype/phenotype,phenotype,The phenotype linked to the disease.,disease/id,,
2,disease_phenotype,disease_phenotype/evidence,evidence,A container for all evidence-related attribute...,,,
3,disease_phenotype,disease_phenotype/evidence/aspect,aspect,The category of biological information being p...,,,C
4,disease_phenotype,disease_phenotype/evidence/bioCuration,bioCuration,Indicates whether the evidence has been manual...,,,HPO:probinson[2021-09-23];HPO:probinson[2021-0...


In [4]:
# Collection of curated dataset:
curation_json = []

# Composing description:
def compose_description(row: pd.Series) -> str:
    """
    Builds the description text of a single curated field.

    When a bioregistry prefix is curated for the field, the lowercased prefix is appended
    to the column description as ` [bioregistry:<prefix>]`. Otherwise the column
    description is returned unchanged.

    Args:
        row (pd.Series): A row from the curation table, providing the `column_description`
            and `bioregistry_prefix` values.

    Returns:
        str: The composed description.
    """
    prefix = row['bioregistry_prefix']
    base_text = row['column_description']

    # No prefix curated for this field: nothing to append.
    if pd.isna(prefix):
        return base_text

    return f"{base_text} [bioregistry:{prefix.lower()}]"

# Only fields that have a curated description make it into the output:
described_fields = curation_table.loc[curation_table['column_description'].notna()]

for _, field in described_fields.iterrows():
    # Every entry carries the field identifier and the composed description:
    entry = {
        'id': field['field_id'],
        'description': compose_description(field),
    }

    # The foreign key is optional, it is written only when curated:
    foreign_key = field['foreign_key']
    if pd.notna(foreign_key):
        entry['foreign_key'] = foreign_key

    curation_json.append(entry)

# Writing the collected curation into the asset folder as JSON:
with open(f'{asset_folder}/recordset.json', 'w') as output_file:
    json.dump(curation_json, output_file, indent=2)


In [None]:
# Rows of the curation table where a bioregistry prefix is filled in:
curation_table[curation_table['bioregistry_prefix'].notna()]

Unnamed: 0,dataset_name,field_id,column_name,column_description,foreign_key,bioregistry_prefix,Example
14,disease_phenotype,disease_phenotype/evidence/references,references,References or citations supporting the evidence.,,pubmed,[PMID:14566559]
20,mouse_phenotype,mouse_phenotype/biologicalModels/id,id,Unique identifier for the biological model.,,MGI,MGI:6140117
21,mouse_phenotype,mouse_phenotype/biologicalModels/literature,literature,References related to the mouse model.,,pubmed,[30949703]
23,mouse_phenotype,mouse_phenotype/modelPhenotypeClasses/id,id,Unique identifier for the phenotype class.,,MP,MP:0005389
25,mouse_phenotype,mouse_phenotype/modelPhenotypeId,modelPhenotypeId,Identifier for the specific phenotype observed...,,MP,MP:0005343
29,mouse_phenotype,mouse_phenotype/targetInModelEnsemblId,targetInModelEnsemblId,Ensembl identifier for the target gene in the ...,,ENSEMBL,ENSMUSG00000087651
30,mouse_phenotype,mouse_phenotype/targetInModelMgiId,targetInModelMgiId,MGI (Mouse Genome Informatics) identifier for ...,,MGI,MGI:1917034
48,reactome,reactome/id,id,Unique identifier for the Reactome pathway,,reactome,
55,expression,expression/id,id,Ensembl human gene identifier for the expresse...,,ENSEMBL,ENSG00000071243
57,expression,expression/tissues/efo_code,efo_code,Ontology ID of the biosample the expression da...,biosample/biosampleId,UBERON,


In [6]:
# Rows of the curation table where the description is still missing:
curation_table[curation_table['column_description'].isna()]

Unnamed: 0,dataset_name,field_id,column_name,column_description,foreign_key,bioregistry_prefix,Example
67,expression,expression/tissues/protein/reliability,reliability,,,,
71,expression,expression/tissues/protein/cell_type/reliability,reliability,,,,
72,expression,expression/tissues/protein/cell_type/level,level,,,,


# Adding tags to distributions

In [29]:
import pandas as pd
import json

# Google spreadsheet (TSV export) with the `Dataset` and `Tag` columns used to tag distributions:
curation = 'https://docs.google.com/spreadsheets/d/1JBu9HsRqwdGYMsYVoNAghmyEoU34S7fFnNIjHuzl0To/export?format=tsv'

# Folder of the curation assets:
asset_folder = '../src/ot_croissant/assets/'

# Loading the sheet and keeping the dataset id with its tags as a list (tags are separated by '|'):
curation_table = (
    pd.read_csv(curation, sep='\t')
    .rename(columns={'Dataset': 'id', 'Tag': 'tag'})
    .assign(tags=lambda table: table['tag'].str.split('|'))
    .loc[:, ['id', 'tags']]
)

curation_table.head()


Unnamed: 0,id,tags
0,association_by_datasource_direct,[Target-Disease]
1,association_by_datasource_indirect,[Target-Disease]
2,association_by_datatype_direct,[Target-Disease]
3,association_by_datatype_indirect,[Target-Disease]
4,association_by_overall_indirect,[Target-Disease]


In [None]:
# Adding the curated tags to the existing distribution metadata.
# The input file is overwritten below, so this cell must be safe to run more than once.
curation_json = (
    pd.read_json(f'{asset_folder}/distribution.json', orient='records')
    # Tags written by a previous run are dropped first; otherwise the merge would
    # produce `tags_x`/`tags_y` columns instead of refreshing `tags`:
    .drop(columns=['tags'], errors='ignore')
    .merge(curation_table, on='id', how='left')
    .assign(
        # Distributions without curated tags get an empty list: the NaN left by the
        # left join would be written by `json.dump` as `NaN`, which is not valid JSON.
        tags=lambda df: df['tags'].apply(
            lambda tags: tags if isinstance(tags, list) else []
        )
    )
    # One dictionary per distribution:
    .to_dict(orient='records')
)

# Saving the curation to a JSON file:
with open(f'{asset_folder}/distribution.json', 'w') as f:
    json.dump(curation_json, f, indent=2)

In [21]:
# Exporting the tagged distributions to a separate file using the pandas `table` JSON orientation:
(
    pd.read_json(f'{asset_folder}/distribution.json', orient='records')
    # If distribution.json already carries a `tags` column (it is rewritten with tags by the
    # cell above), drop it so the merge does not produce `tags_x`/`tags_y` columns:
    .drop(columns=['tags'], errors='ignore')
    .merge(curation_table, on='id', how='left')
    .to_json(f'{asset_folder}/distribution_v2.json', orient='table', indent=2, lines=False)
)

In [28]:
from pyspark.sql import SparkSession, functions as f, types as t


# Starting (or reusing) the Spark session:
spark = SparkSession.builder.getOrCreate()

# Reading the tab-separated file, taking the column names from the header line:
df = (
    spark.read
    .options(sep='\t', header=True)
    .csv('/Users/dsuveges/33414548-GCST90010801-EFO_0010226.h.tsv.gz')
)

# Printing the top rows of the dataframe:
df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/29 11:41:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+--------------+-----------+--------+--------+---------------+----------------+-------+-------------+-----------+-----------+--------------------------+-------+----------+------------------+-------------+------------+-------+----------+--------+--------+----+--------------+-----------------------+-----------+
| hm_variant_id|    hm_rsid|hm_chrom|  hm_pos|hm_other_allele|hm_effect_allele|hm_beta|hm_odds_ratio|hm_ci_lower|hm_ci_upper|hm_effect_allele_frequency|hm_code|chromosome|base_pair_location|effect_allele|other_allele|p_value|odds_ratio|ci_lower|ci_upper|beta|standard_error|effect_allele_frequency| variant_id|
+--------------+-----------+--------+--------+---------------+----------------+-------+-------------+-----------+-----------+--------------------------+-------+----------+------------------+-------------+------------+-------+----------+--------+--------+----+--------------+-----------------------+-----------+
|1_99534456_G_T| rs10875231|       1|99534456|              G|     

In [None]:
|