# Creating record set curation file

- Fields in recordsets are annotated as a manual effort done by multiple people.
- Curation was collected in a Google spreadsheet table.
- The table has columns that make annotation easier for the curators and also provides extra annotation for croissant ingestion.

## Columns

- `dataset_name` - name of the dataset for curators.
- `field_id` - identifier of the field for croissant.
- `column_name` - just the label of the field for curators.
- `column_description` - Curators annotate fields with description
- `foreign_key` - curators add `field_id` of the foreign field
- `bioregistry_prefix` - if the data in a column comes from a database registered in Bioregistry, curators annotate its prefix here
- `example` - helps curators

## Process

1. Fetch curation table from Google
2. Composing description
3. Iterating over column and building output
4. Save curation as json.

In [3]:
import pandas as pd
import json
# `gid` picks the sheet of the Google spreadsheet that is exported (26.03):
gid = '328767591'

# TSV export with the curation of all columns from all OpenTargets output datasets:
curation = f'https://docs.google.com/spreadsheets/d/132SKHMoaJePu4nTlBnQwfaz3dhfJiKmJUujfYkzXMdI/export?format=tsv&gid={gid}'
# curation = 'annotated_new_evidence_columns.csv'
# Destination folder of the generated curation file:
asset_folder = '../src/ot_croissant/assets/'

# Load the table, cast the free-text columns to the nullable string dtype,
# trim whitespace around the descriptions and order rows by dataset and field:
curation_table = (
    pd.read_csv(curation, sep='\t')
    .astype(dict.fromkeys(['column_description', 'foreign_key'], pd.StringDtype()))
    .assign(column_description=lambda table: table['column_description'].str.strip())
    .sort_values(by=['dataset_name', 'field_id'])
)
print(curation_table.head())
print(len(curation_table))

                          dataset_name  \
1184  association_by_datasource_direct   
1185  association_by_datasource_direct   
1186  association_by_datasource_direct   
1193  association_by_datasource_direct   
1182  association_by_datasource_direct   

                                               field_id       column_name  \
1184   association_by_datasource_direct/aggregationType   aggregationType   
1185  association_by_datasource_direct/aggregationValue  aggregationValue   
1186  association_by_datasource_direct/associationScore  associationScore   
1193    association_by_datasource_direct/currentNovelty    currentNovelty   
1182         association_by_datasource_direct/diseaseId         diseaseId   

                                     column_description foreign_key  \
1184  Evidence column used to aggregate data: dataso...        <NA>   
1185  Datasource identifier of the group: eg. gwas_c...        <NA>   
1186  Current disease/target association score of ev...        <NA>   


In [4]:
# Collected curation: one dict per curated field, filled by the loop below
# and then written to recordset.json:
curation_json = []

# Composing description:
def compose_description(row: pd.Series) -> str:
    """
    Build the final description text of a curated field.

    When the row carries a bioregistry prefix, the lower-cased prefix is appended
    to the curated description as a ` [bioregistry:<prefix>]` suffix. Otherwise the
    curated description is returned unchanged.

    Args:
        row (pd.Series): A row from the curation table; its `column_description`
            and `bioregistry_prefix` entries are read.

    Returns:
        str: The composed description.
    """
    prefix = row['bioregistry_prefix']
    curated_text = row['column_description']

    # No prefix annotated: nothing to append.
    if pd.isna(prefix):
        return curated_text

    return f"{curated_text} [bioregistry:{prefix.lower()}]"

# Building one record per curated field; rows without a description are left out:
for _, field_row in curation_table.loc[curation_table['column_description'].notna()].iterrows():
    # Every record carries the field identifier and its composed description:
    entry = {
        'id': field_row['field_id'],
        'description': compose_description(field_row),
    }

    # The foreign key is optional and only stored when the curators provided one:
    if pd.notna(field_row['foreign_key']):
        entry['foreign_key'] = field_row['foreign_key']

    curation_json.append(entry)

# Writing the collected curation into the asset folder as JSON:
with open(f'{asset_folder}/recordset.json', 'w') as output_file:
    json.dump(curation_json, output_file, indent=2)


In [38]:
# Fields annotated with a bioregistry prefix:
curation_table[curation_table['bioregistry_prefix'].notna()]

Unnamed: 0,dataset_name,field_id,column_name,column_description,foreign_key,bioregistry_prefix
1183,association_by_datasource_direct,association_by_datasource_direct/targetId,targetId,Unique identifier for the target,target/id,ENSEMBL
1195,association_by_datasource_indirect,association_by_datasource_indirect/targetId,targetId,Unique identifier for the target,target/id,ENSEMBL
1207,association_by_datatype_direct,association_by_datatype_direct/targetId,targetId,Unique identifier for the target,target/id,ENSEMBL
1219,association_by_datatype_indirect,association_by_datatype_indirect/targetId,targetId,Unique identifier for the target,target/id,ENSEMBL
1231,association_overall_direct,association_overall_direct/targetId,targetId,Unique identifier for the target,target/id,ENSEMBL
...,...,...,...,...,...,...
1151,variant,variant/transcriptConsequences/targetId,targetId,Open Target target identifier of the transcript,target/id,ENSEMBL
1152,variant,variant/transcriptConsequences/transcriptId,transcriptId,Ensembl transcript identifier,,ENSEMBL
1154,variant,variant/transcriptConsequences/uniprotAccessions,uniprotAccessions,Uniprot identifiers of the gene product,,UNIPROT
1155,variant,variant/transcriptConsequences/variantFunction...,variantFunctionalConsequenceIds,The sequence ontology identifier of the conseq...,so/id,SO


In [39]:
# Fields that still lack a curated description:
curation_table[curation_table['column_description'].isna()]

Unnamed: 0,dataset_name,field_id,column_name,column_description,foreign_key,bioregistry_prefix
716,expression,expression/tissues/protein/cell_type/level,level,,,
718,expression,expression/tissues/protein/cell_type/reliability,reliability,,,
720,expression,expression/tissues/protein/reliability,reliability,,,


# Generate distributions JSON


Create the distribution JSON based on the curation table.

In [5]:
import pandas as pd
import json


# `gid` picks the sheet of the Google spreadsheet that is exported (26.03):
gid = '194385091'

# TSV export of the dataset-level curation (id, nice_name, key, description, tags):
curation = f'https://docs.google.com/spreadsheets/d/17uyBBAu62ugRApNftu8hILuv_Qhh6_VfOUj91Tu-_R0/export?format=tsv&gid={gid}'
# curation = 'annotated_new_evidence_columns.csv'
# Destination folder of the generated curation file:
asset_folder = '../src/ot_croissant/assets/'

# Load the table, turn the pipe-separated cells into lists and order rows by id:
curation_table = (
    pd.read_csv(curation, sep='\t')
    .assign(
        # A missing or empty tags cell becomes an empty list:
        tags=lambda table: table['tags'].fillna('').map(
            lambda cell: cell.split('|') if cell else []
        ),
        key=lambda table: table['key'].str.split('|'),
    )
    .sort_values(by='id')
)
curation_table.head()

Unnamed: 0,id,nice_name,key,description,tags
0,association_by_datasource_direct,Associations - direct (by data source),"[association_by_datasource_direct/diseaseId, a...",Data source-specific metrics for direct target...,[Target-Disease]
1,association_by_datasource_indirect,Associations - indirect (by data source),"[association_by_datasource_indirect/diseaseId,...",Data source-specific metrics for indirect targ...,[Target-Disease]
2,association_by_datatype_direct,Associations - direct (by data type),"[association_by_datatype_direct/diseaseId, ass...",Data type-specific metrics for direct target-d...,[Target-Disease]
3,association_by_datatype_indirect,Associations - indirect (by data type),"[association_by_datatype_indirect/diseaseId, a...",Data type-specific metrics for indirect target...,[Target-Disease]
4,association_overall_direct,Associations - direct (overall score),"[association_overall_direct/diseaseId, associa...",Overall metrics for direct target-disease asso...,[Target-Disease]


In [6]:
# Columns written to the distribution file, in output order:
columns = ["id", "nice_name", "key", "tags", "description"]

# One JSON object per row of the curation table:
with open(f'{asset_folder}/distribution.json', 'w') as output_file:
    json.dump(
        curation_table.loc[:, columns].to_dict(orient='records'),
        output_file,
        indent=2,
    )

## Adding representation of new datasets

In [18]:
import pandas as pd
from pyspark.sql import SparkSession, functions as f

# Spark session used to read the parquet datasets (an existing session is reused if there is one):
spark = SparkSession.builder.getOrCreate()

# Processing a field in the spark schema:
def process_fields(
    fields: dict[str, str], 
    parent: str, 
    dataset_name: str,
    schema: list[dict[str,str]] | None = None,
) -> list[dict[str, str]]:
    if schema is None:
        schema = []

    # Iterating over all fields of the schema:
    for field in fields.get('fields'):

        name = field.get('name')
        # Recurse for nested structs:
        if isinstance(field['type'], dict):
            if field['type']['type'] == 'struct':
                # Call self:
                schema = schema + process_fields(
                    field['type'], 
                    f'{parent}/{name}',
                    dataset_name,
                    [],
                )
            elif isinstance(field['type']['elementType'], dict):
                # Call self:
                schema = schema + process_fields(
                    field["type"]['elementType'], 
                    f'{parent}/{name}',
                    dataset_name,
                    [],
                )
        # Capture atomic column types:
        schema.append(
            {
                'dataset_name': dataset_name,
                'field_id': f'{parent}/{name}',
                'column_name': name,
            }
        )
    # Return captured schema:
    return schema


def generate_schema_representation(
    dataset_location: str, output: str,
) -> pd.DataFrame | None:
    """
    Build a table listing the fields found in the schema of a parquet dataset.

    The dataset is read with the module-level `spark` session and its schema is
    flattened with `process_fields`.

    Args:
        dataset_location (str): Folder that contains the datasets.
        output (str): Name of the dataset folder under `dataset_location`; it is
            also used as the dataset name and as the root of every field id.

    Returns:
        pd.DataFrame | None: One row per schema field with the columns produced
            by `process_fields` plus `field_join` (the part of `field_id` after
            its first `/`), or None when the dataset could not be read.
    """
    dataset_path = f'{dataset_location}/{output}'

    # Read dataset; a dataset that cannot be read is reported and skipped.
    # `Exception` (not a bare except) keeps Ctrl-C / interpreter exit working:
    try:
        df = spark.read.parquet(dataset_path)
    except Exception as error:
        print(f'missing dataset: {dataset_path} ({type(error).__name__}: {error})')
        return None

    # Get its schema as plain dictionaries:
    data = json.loads(df.schema.json())

    # Collect the field names that can be annotated on the spreadsheet;
    # the dataset name doubles as the root of the field paths:
    schema = process_fields(
        data,
        output,
        output
    )

    # Process and return schema representation:
    return (
        pd.DataFrame(schema)
        .assign(
            # expand=False yields a Series (single capture group) for the new column:
            field_join=lambda x: x.field_id.str.extract(r'\/(.+)$', expand=False)
        )
    )

# Local folder holding the release output datasets:
dataset_location = '/Users/dsuveges/project_data/releases/25.12/output'
# Datasets whose schema representation is generated:
datasets = [
    'association_by_datasource_direct',
    'association_by_datasource_indirect',
    'association_by_datatype_direct',
    'association_by_datatype_indirect',
    'association_overall_direct',
    'association_overall_indirect',
]

# Collating the schema representation of all listed datasets into one table:
new_evidence_collated = pd.concat(
    [generate_schema_representation(dataset_location, name) for name in datasets]
)

print(len(new_evidence_collated))
new_evidence_collated.head()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/04 11:43:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


72


Unnamed: 0,dataset_name,field_id,column_name,field_join
0,association_by_datasource_direct,association_by_datasource_direct/diseaseId,diseaseId,diseaseId
1,association_by_datasource_direct,association_by_datasource_direct/targetId,targetId,targetId
2,association_by_datasource_direct,association_by_datasource_direct/aggregationType,aggregationType,aggregationType
3,association_by_datasource_direct,association_by_datasource_direct/aggregationValue,aggregationValue,aggregationValue
4,association_by_datasource_direct,association_by_datasource_direct/associationScore,associationScore,associationScore


In [21]:
# The target file has a .tsv extension, so the fields are written tab-separated:
new_evidence_collated.to_csv(
    'associations.tsv', sep='\t', index=False
)