In [None]:
from pyspark.sql import DataFrame, SparkSession, functions as f
import pandas as pd
import json
from urllib.error import URLError

# https://docs.google.com/spreadsheets/d/132SKHMoaJePu4nTlBnQwfaz3dhfJiKmJUujfYkzXMdI/edit?gid=179018892#gid=179018892

def get_evidence_datasets() -> pd.DataFrame:
    # Evidence datasets:
    spreadsheet_url = f'https://docs.google.com/spreadsheets/d/1G3zq-aJg3uKBz3VZUvGV8yYVrECtpQ9SJoGX6X3B-ms/export?format=csv'

    try:
        df = pd.read_csv(spreadsheet_url, sep=',')
    except URLError:
        df = pd.read_csv('dataset_name_mapping.csv')

    # Extract table from spreadsheet:
    return (
        df
        .loc[lambda df: df.output.notna()]
        [['output', 'datasourceId']]
    )

def get_full_croissant(gid:str) -> pd.DataFrame:
    """
    (
        pd.read_csv(full_croissant_description_url,  sep=',')
        .to_csv('full_croissant_description.csv', sep=',', index=False)
    )
    """
    # Evidence datasets:
    full_croissant_description_url = f'https://docs.google.com/spreadsheets/d/132SKHMoaJePu4nTlBnQwfaz3dhfJiKmJUujfYkzXMdI/export?gid={gid}&format=csv'

    try:
        df = pd.read_csv(full_croissant_description_url, sep=',')
    except URLError:
        print('reading from backup: full_croissant_description.csv')
        df = pd.read_csv('full_croissant_description.csv')

    # Extract table from spreadsheet:
    return df

def get_full_evidence(gid: str) -> pd.DataFrame:
    return (
        get_full_croissant(gid)
        .loc[lambda df: df.dataset_name == 'evidence']
        .assign(
            field_join = lambda x: x.field_id.str.extract(r'\/(.+)$')
        )
        .drop(['dataset_name', 'Example', 'column_name', 'field_id'], axis=1)
    )

# 25.09:
gid = '179018892'

# Get a list of evidence datasets:
dataset_list = get_evidence_datasets()

# Reading a piece of dataset representing the schema of the entire dataset:
spark = SparkSession.builder.getOrCreate()

data_folder = '/Users/dsuveges/project_data/25.09/output'




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/26 14:39:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/26 14:39:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [24]:
dataset_list.head() #to_csv('dataset_name_mapping.csv', sep=',', index=False)

Unnamed: 0,output,datasourceId
0,evidence_cancer_biomarkers,cancer_biomarkers
1,evidence_cancer_gene_census,cancer_gene_census
2,evidence_chembl,chembl
3,evidence_clingen,clingen
4,evidence_crispr,crispr


In [6]:
# Processing a field in the spark schema:
def process_fields(
    fields: dict[str, str], 
    parent: str, 
    dataset_name: str,
    schema: list[dict[str,str]] | None = None,
) -> list[dict[str, str]]:
    if schema is None:
        schema = []

    # Iterating over all fields of the schema:
    for field in fields.get('fields'):

        name = field.get('name')
        # Recurse for nested structs:
        if isinstance(field['type'], dict):
            if field['type']['type'] == 'struct':
                # Call self:
                schema = schema + process_fields(
                    field['type'], 
                    f'{parent}/{name}',
                    dataset_name,
                    [],
                )
            elif isinstance(field['type']['elementType'], dict):
                # Call self:
                schema = schema + process_fields(
                    field["type"]['elementType'], 
                    f'{parent}/{name}',
                    dataset_name,
                    [],
                )
        # Capture atomic column types:
        schema.append(
            {
                'dataset_name': dataset_name,
                'field_id': f'{parent}/{name}',
                'column_name': name,
            }
        )
    # Return captured schema:
    return schema


def generate_schema_representation(
    output: str,
    datasourceId: str
) -> pd.DataFrame:

    # Read dataset:
    try:
        df = spark.read.parquet(f'{data_folder}/{output}')
    except:
        print(f'missing dataset: {data_folder}/{output}')
        return None

    # Get its schema:
    data = json.loads(df.schema.json())

    # We iterate over the schema and collect field names that we will be able to annotate on the spreadsheet:
    schema = process_fields(
        data, 
        output,
        output
    )

    # Process and return schema representation:
    return (
        pd.DataFrame(schema)
        .assign(
            field_join = lambda x: x.field_id.str.extract(r'\/(.+)$')
        )
    )



# generate_schema_representation(**dataset)    
new_evidence_collated = pd.concat([
    generate_schema_representation(**row)
    for _, row in 
    dataset_list.iterrows()
])

print(len(new_evidence_collated))
new_evidence_collated.head()


missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_cancer_biomarkers
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_cancer_gene_census
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_chembl
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_clingen
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_crispr
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_crispr_screen
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_encore
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_europepmc
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_eva
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_eva_somatic
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_expression_atlas
missing dataset: /Users/dsuveges/project_data/25.09/output/evidence_gene_burden
missing dataset: /Users/dsuveges/project_data/25.

ValueError: All objects passed were None

In [27]:
old_evidence = get_full_evidence(gid)
old_evidence.head() 

Unnamed: 0,column_description,foreign_key,bioregistry_prefix,field_join
202,Identifer of the evidence source,,,datasourceId
203,Open Targets target identifier,target/id,,targetId
204,Origin of the variant allele,,,alleleOrigins
205,Inheritance patterns,,,allelicRequirements
206,Genetic origin of a population,,,ancestry


In [28]:
updated_evidence_table = (
    new_evidence_collated
    .merge(
        old_evidence, on='field_join', how='left'
    )
    # Updating missing descriptions:
    .assign(
        column_description = lambda df: df.apply(
            lambda row: 
              "Evidence quality flags" if row['column_name'] == 'qualityControls' else 
              "Effect of direction on target" if row['column_name'] == 'directionOnTarget' else
              row['column_description'],
            axis=1
        )
    )
)


updated_evidence_table.loc[lambda df: df.column_description.isna()]

Unnamed: 0,dataset_name,field_id,column_name,field_join,column_description,foreign_key,bioregistry_prefix


In [29]:
(
    pd.concat([
        updated_evidence_table,
        get_full_croissant(gid).loc[lambda df: df.dataset_name != 'evidence']
    ])
    .drop(['Example', 'field_join'], axis=1)
    .sort_values(['dataset_name', 'field_id'])
    .to_csv('annotated_new_evidence_columns.csv', index=False, sep=',')
)

25/11/20 22:59:45 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 972968 ms exceeds timeout 120000 ms
25/11/20 22:59:45 WARN SparkContext: Killing executors is not supported by current scheduler.
25/11/20 23:16:30 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [19]:
len(pd.read_csv('annotated_new_evidence_columns.csv'))

1199

In [14]:
(
    updated_evidence_table
    .assign(
        column_description = lambda df: df.apply(
            lambda row: 
              "Evidence quality flags" if row['column_name'] == 'qualityControls' else 
              "Effect of direction on target" if row['column_name'] == 'directionOnTarget' else
              row['column_description'],
            axis=1
        )
    )
    .loc[updated_evidence_table.column_description.isna()]
)

Unnamed: 0,dataset_name,field_id,column_name,field_join,column_description,foreign_key,bioregistry_prefix
23,evidence_cancer_biomarkers,evidence_cancer_biomarkers/qualityControls,qualityControls,qualityControls,Evidence quality flags,,
44,evidence_cancer_gene_census,evidence_cancer_gene_census/qualityControls,qualityControls,qualityControls,Evidence quality flags,,
50,evidence_cancer_gene_census,evidence_cancer_gene_census/directionOnTarget,directionOnTarget,directionOnTarget,Effect of direction on target,,
69,evidence_chembl,evidence_chembl/qualityControls,qualityControls,qualityControls,Evidence quality flags,,
75,evidence_chembl,evidence_chembl/directionOnTarget,directionOnTarget,directionOnTarget,Effect of direction on target,,
88,evidence_clingen,evidence_clingen/qualityControls,qualityControls,qualityControls,Evidence quality flags,,
108,evidence_crispr,evidence_crispr/qualityControls,qualityControls,qualityControls,Evidence quality flags,,
131,evidence_crispr_screen,evidence_crispr_screen/qualityControls,qualityControls,qualityControls,Evidence quality flags,,
164,evidence_encore,evidence_encore/qualityControls,qualityControls,qualityControls,Evidence quality flags,,
186,evidence_europepmc,evidence_europepmc/qualityControls,qualityControls,qualityControls,Evidence quality flags,,


In [None]:
# generate_schema_representation(**dataset)    
new_evidence_collated = pd.concat([
    generate_schema_representation(**row)
    for _, row in 
    dataset_list.iterrows()
])

print(len(new_evidence_collated))
new_evidence_collated.head()

In [None]:
data_folder = '/Users/dsuveges/project_data/releases/25.12/output'


datasets = """timeseries_by_datasource_direct
timeseries_by_datasource_indirect
timeseries_overall_direct
timeseries_overall_indirect"""

# column_description	foreign_key	bioregistry_prefix
pd.concat([
    generate_schema_representation(d, None)
    for d  in 
    datasets.split('\n')
]).drop(columns=['field_join']).assign(
    column_description = None,
    foreign_key = None,
    bioregistry_prefix = None
)



Unnamed: 0,dataset_name,field_id,column_name
0,timeseries_by_datasource_direct,timeseries_by_datasource_direct/diseaseId,diseaseId
1,timeseries_by_datasource_direct,timeseries_by_datasource_direct/targetId,targetId
2,timeseries_by_datasource_direct,timeseries_by_datasource_direct/aggregationType,aggregationType
3,timeseries_by_datasource_direct,timeseries_by_datasource_direct/aggregationValue,aggregationValue
4,timeseries_by_datasource_direct,timeseries_by_datasource_direct/associationScore,associationScore
5,timeseries_by_datasource_direct,timeseries_by_datasource_direct/timeseries/year,year
6,timeseries_by_datasource_direct,timeseries_by_datasource_direct/timeseries/yea...,yearlyAssociationScore
7,timeseries_by_datasource_direct,timeseries_by_datasource_direct/timeseries/nov...,novelty
8,timeseries_by_datasource_direct,timeseries_by_datasource_direct/timeseries/yea...,yearlyEvidenceScores
9,timeseries_by_datasource_direct,timeseries_by_datasource_direct/timeseries,timeseries


In [10]:
spark.read.parquet(f'{data_folder}/timeseries_overall_direct').printSchema()


root
 |-- diseaseId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- aggregationType: string (nullable = true)
 |-- aggregationValue: string (nullable = true)
 |-- associationScore: double (nullable = true)
 |-- timeseries: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- year: integer (nullable = true)
 |    |    |-- yearlyAssociationScore: double (nullable = true)
 |    |    |-- novelty: double (nullable = true)
 |    |    |-- yearlyEvidenceScores: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |-- currentNovelty: double (nullable = true)



In [None]:
0	timeseries_by_datasource_direct	timeseries_by_datasource_direct/diseaseId	diseaseId	diseaseId
1	timeseries_by_datasource_direct	timeseries_by_datasource_direct/targetId	targetId	targetId
2	timeseries_by_datasource_direct	timeseries_by_datasource_direct/aggregationType	aggregationType	aggregationType
3	timeseries_by_datasource_direct	timeseries_by_datasource_direct/aggregationValue	aggregationValue	aggregationValue
4	timeseries_by_datasource_direct	timeseries_by_datasource_direct/associationScore	associationScore	associationScore
5	timeseries_by_datasource_direct	timeseries_by_datasource_direct/timeseries/year	year	timeseries/year
6	timeseries_by_datasource_direct	timeseries_by_datasource_direct/timeseries/yea...	yearlyAssociationScore	timeseries/yearlyAssociationScore
7	timeseries_by_datasource_direct	timeseries_by_datasource_direct/timeseries/nov...	novelty	timeseries/novelty
8	timeseries_by_datasource_direct	timeseries_by_datasource_direct/timeseries/yea...	yearlyEvidenceScores	timeseries/yearlyEvidenceScores
9	timeseries_by_datasource_direct	timeseries_by_datasource_direct/timeseries	timeseries	timeseries
10	timeseries_by_datasource_direct	timeseries_by_datasource_direct/currentNovelty	currentNovelty	currentNovelty
