In [22]:
import pandas as pd
from pathlib import Path

# Load all association partitions.
# The associations dataset is split across several parquet files in one directory;
# sorting the glob result keeps the concatenation order stable between runs.
associations_dir = Path('../data/raw/direct_associations')
association_files = sorted(associations_dir.glob('*.parquet'))

# Fail early with an actionable message: with no partitions, pd.concat would raise
# an opaque "No objects to concatenate" ValueError instead.
if not association_files:
    raise FileNotFoundError(
        f"No .parquet partitions found in {associations_dir.resolve()}"
    )

# Read and concatenate all partitions into a single frame with a fresh 0..n-1 index
a_df = pd.concat([pd.read_parquet(f) for f in association_files], ignore_index=True)

# Load the biomarkers file (a single parquet file rather than a partitioned directory)
b_df = pd.read_parquet("../data/raw/evidence_cancer_biomarkers/evidence_cancer_biomarkers.parquet")

In [31]:
# Sanity-check the load: report how many partition files were read and how many
# association rows they produced, then preview the first rows.
n_partitions = len(association_files)
n_rows = a_df.shape[0]
print(f"Loaded {n_partitions} partitions")
print(f"Total rows: {n_rows:,}")

a_df.head()

Loaded 20 partitions
Total rows: 4,492,971


Unnamed: 0,diseaseId,targetId,score,evidenceCount
0,DOID_0050890,ENSG00000001084,0.031799,4
1,DOID_0050890,ENSG00000002549,0.001478,1
2,DOID_0050890,ENSG00000004142,0.002217,1
3,DOID_0050890,ENSG00000004478,0.002217,1
4,DOID_0050890,ENSG00000004948,0.002957,1


In [41]:
# Preview the first rows of the biomarkers frame to see which columns it carries.
preview_rows = 10
b_df.head(preview_rows)

Unnamed: 0,id,targetFromSourceId,diseaseFromSourceMappedId,datasourceId,datatypeId,drugFromSource,drugId,drugResponse,diseaseFromSource,confidence,biomarkerName,literature,urls,biomarkers,qualityControls,diseaseId,targetId,publicationDate,evidenceDate,score
0,0010fb8e532bdb5d0072cc5db9991923f3c4b121,EGFR,EFO_0003060,cancer_biomarkers,affected_pathway,Egfr Inhibitor 1st Gens,,EFO_0020001,Non-small cell lung cancer,Case report,EGFR (D761Y),[19680293],,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0003060,ENSG00000146648,2009-08-01,2009-08-01,1.0
1,0023bc1ee01a34cf68f6e8d55e0b4466893114b8,SMARCB1,EFO_0005701,cancer_biomarkers,affected_pathway,Ezh2 Inhibitors,,GO_0042493,Malignant rhabdoid tumor,Case report,SMARCB1 deletion,,[{'niceName': 'European Society for Medical On...,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0005701,ENSG00000099956,,,1.0
2,006c16ffc7eb3dc04bb2ccf3ce4994bb1990fc9d,EGFR,EFO_0000365,cancer_biomarkers,affected_pathway,Egfr Mab Inhibitors,,GO_0042493,Colorectal adenocarcinoma,Late trials,EGFR amplification,"[17664472, 18794099]",,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0000365,ENSG00000146648,2007-08-01,2007-08-01,1.0
3,007f807da0c31088ec4a9136812edb6f70d87701,NF1,EFO_0001071,cancer_biomarkers,affected_pathway,Dasatinib,CHEMBL5416410,EFO_0020001,Lung carcinoma,Pre-clinical,NF1 deletion,[24296828],,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0001071,ENSG00000196712,2013-12-02,2013-12-02,1.0
4,00b3e7749de158868d86a20a241ef3122f932137,ERCC1,EFO_0001071,cancer_biomarkers,affected_pathway,Cisplatin,CHEMBL11359,GO_0042493,Lung carcinoma,Pre-clinical,ERCC1 deletion,"[23934192, 23275151]",,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0001071,ENSG00000012061,2012-12-28,2012-12-28,1.0
5,0119fae7b3f606cce180328d0ad893706586429a,MAP2K1,EFO_0000365,cancer_biomarkers,affected_pathway,Egfr Mab Inhibitors,,EFO_0020001,Colorectal adenocarcinoma,Case report,MAP2K1 oncogenic mutation,[26030179],,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0000365,ENSG00000169032,2015-06-01,2015-06-01,1.0
6,01a588cbcd96623bb612b85abfc97e51b93bbaa0,EGFR,EFO_0005543,cancer_biomarkers,affected_pathway,Egfr Inhibitor 2nd Gens,,EFO_0020002,Glioma,Early trials,EGFR inframe deletion (30-336),[19204207],,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0005543,ENSG00000146648,2009-02-09,2009-02-09,1.0
7,01a6a63ad25808fe0eee29dc7fdb1b50b54993bd,ERCC6,EFO_0001075,cancer_biomarkers,affected_pathway,Cisplatin,CHEMBL11359,GO_0042493,Ovarian carcinoma,Pre-clinical,ERCC6 oncogenic mutation,[25634215],,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0001075,ENSG00000225830,2015-01-29,2015-01-29,1.0
8,01edd01a0414afd50a9964a16f911dc36cdad84f,NF1,EFO_0000760,cancer_biomarkers,affected_pathway,Aurk Inhibitors,,GO_0042493,Malignant peripheral nerve sheat tumor,Pre-clinical,NF1 deletion,[24373973],,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0000760,ENSG00000196712,2013-12-27,2013-12-27,1.0
9,022396645475587dfd4d032e0673b7cec820369b,ZNRF3,EFO_0000365,cancer_biomarkers,affected_pathway,Porcupine Inhibitors,,GO_0042493,Colorectal adenocarcinoma,Pre-clinical,ZNRF3 oncogenic mutation,[26023187],,"{'geneExpression': None, 'geneticVariation': [...",[],EFO_0000365,ENSG00000183579,2015-05-28,2015-05-28,1.0


In [34]:
# Compare score distributions
print(f"Associations score range: {a_df['score'].min():.2f} to {a_df['score'].max():.2f}")
print(f"Biomarkers score range: {b_df['score'].min():.2f} to {b_df['score'].max():.2f}")

# Are they similar distributions?
print("\nAssociations score stats:")
print(a_df['score'].describe())

print("\nBiomarkers score stats:")
print(b_df['score'].describe())

# Columns that together identify one target-disease pair in both frames.
pair_cols = ['targetId', 'diseaseId']

# How many unique target-disease pairs in each?
print("\nAssociations pairs:", a_df[pair_cols].drop_duplicates().shape[0])
print("Biomarkers pairs:", b_df[pair_cols].drop_duplicates().shape[0])

# How many overlap?
# itertuples(index=False, name=None) yields plain (targetId, diseaseId) tuples without
# building a Series per row, which .apply(tuple, axis=1) does for every row of the
# associations frame. The resulting sets contain the same tuples.
assoc_pairs = set(a_df[pair_cols].itertuples(index=False, name=None))
biom_pairs = set(b_df[pair_cols].itertuples(index=False, name=None))
print("Overlapping pairs:", len(assoc_pairs & biom_pairs))

Associations score range: 0.00 to 0.91
Biomarkers score range: 1.00 to 1.00

Associations score stats:
count    4.492971e+06
mean     6.339187e-02
std      1.085055e-01
min      7.902492e-04
25%      3.695799e-03
50%      2.197744e-02
75%      5.335624e-02
max      9.133829e-01
Name: score, dtype: float64

Biomarkers score stats:
count    1301.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: score, dtype: float64

Associations pairs: 4492971
Biomarkers pairs: 469
Overlapping pairs: 469


## Score Column Analysis

The `score` columns in both datasets are **semantically distinct** and cannot be directly compared:

**Associations dataset:**
- Score represents the aggregated strength of evidence for a direct target-disease association
- Calculated across all data sources (GWAS, literature, animal models, etc.)
- Range: ~0.0008 to 0.91 in our data
- Interpretation: Higher score = stronger overall evidence linking target to disease

**Biomarkers dataset:**
- Score is always 1.0 (binary indicator that biomarker evidence exists)
- Does not provide a meaningful confidence metric
- The actual evidence quality is captured in the `confidence` column instead (values: "Case report", "Early trials", "Late trials", "Pre-clinical", etc.)
- Represents strength of evidence for a specific biomarker observation, not the overall target-disease relationship

## Dataset Relationship

Key findings from overlap analysis:
- **Associations:** 4,492,971 unique target-disease pairs
- **Biomarkers:** 469 unique target-disease pairs
- **Overlap:** All 469 biomarker pairs exist in the associations dataset (100% overlap)

This confirms that the biomarkers dataset is a **focused subset** of associations, providing drug-biomarker enrichment data for cancer-specific target-disease relationships. The biomarkers data represents detailed evidence for a small fraction (~0.01%) of all known target-disease associations.

In [35]:
# Can one target-disease pair have multiple biomarker records?
biomarker_counts = b_df.groupby(['targetId', 'diseaseId']).size()
print("Biomarker records per target-disease pair:")
print(biomarker_counts.describe())
print("\nMax biomarkers for one pair:", biomarker_counts.max())

Biomarker records per target-disease pair:
count    469.000000
mean       2.773987
std        4.208773
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max       42.000000
dtype: float64

Max biomarkers for one pair: 42


## Cardinality Analysis

The relationship between associations and biomarkers is **one-to-many**:
- At least half of the target-disease pairs have just 1 biomarker record (median = 1; 75th percentile = 2)
- Mean: 2.77 biomarker records per pair
- Maximum: 42 records for a single pair
- Distribution is heavily right-skewed

**Why one-to-many?**
A single target-disease association can have multiple biomarker evidence items, each documenting different drugs, genetic variants, clinical contexts, or confidence levels.

**Example (illustrative, not taken from the data):** BRCA1-breast cancer (one association) might have separate biomarker records for olaparib efficacy, platinum-based chemotherapy response, and various clinical trial findings.

**Implication for data model:** Associations and biomarkers should be kept in **separate, related tables** rather than flattened into one denormalized table.

In [42]:
# Dump every field of a single biomarker row, one field per paragraph, so that the
# nested / truncated columns can be read in full.
sample = b_df.iloc[3]  # The Dasatinib/NF1/Lung carcinoma example

print("=== FULL BIOMARKER RECORD ===\n")
# `sample` is a row Series indexed by column name, so items() walks the fields
# in column order.
for col, value in sample.items():
    print(f"{col}:")
    print(f"  {value}")
    print()

=== FULL BIOMARKER RECORD ===

id:
  007f807da0c31088ec4a9136812edb6f70d87701

targetFromSourceId:
  NF1

diseaseFromSourceMappedId:
  EFO_0001071

datasourceId:
  cancer_biomarkers

datatypeId:
  affected_pathway

drugFromSource:
  Dasatinib

drugId:
  CHEMBL5416410

drugResponse:
  EFO_0020001

diseaseFromSource:
  Lung carcinoma

confidence:
  Pre-clinical

biomarkerName:
  NF1 deletion

literature:
  ['24296828']

urls:
  None

biomarkers:
  {'geneExpression': None, 'geneticVariation': array([{'functionalConsequenceId': 'SO_0001563', 'id': None, 'name': 'NF1:del'}],
      dtype=object)}

qualityControls:
  []

diseaseId:
  EFO_0001071

targetId:
  ENSG00000196712

publicationDate:
  2013-12-02

evidenceDate:
  2013-12-02

score:
  1.0



In [43]:
# Look up the association row(s) that share the sample record's target and disease.
same_target = a_df['targetId'].eq(sample['targetId'])
same_disease = a_df['diseaseId'].eq(sample['diseaseId'])
matching_assoc = a_df.loc[same_target & same_disease]

print("=== CORRESPONDING ASSOCIATION RECORD ===\n")
print(matching_assoc)

=== CORRESPONDING ASSOCIATION RECORD ===

           diseaseId         targetId    score  evidenceCount
2995136  EFO_0001071  ENSG00000196712  0.44082              9


In [44]:
# Inspect which datatypeId values occur in the biomarkers frame and how often.
datatype_ids = b_df['datatypeId']

print("Unique datatypeId values:")
print(datatype_ids.unique())

print("\nValue counts:")
print(datatype_ids.value_counts())

Unique datatypeId values:
['affected_pathway']

Value counts:
datatypeId
affected_pathway    1301
Name: count, dtype: int64


In [45]:
# Size up two candidate filters for the associations frame, both driven by the
# targets and diseases that occur in the biomarkers frame.
cancer_targets = set(b_df['targetId'].unique())
cancer_diseases = set(b_df['diseaseId'].unique())

print(f"Unique targets in biomarkers: {len(cancer_targets)}")
print(f"Unique diseases in biomarkers: {len(cancer_diseases)}")

# Option A: associations whose exact (targetId, diseaseId) pair occurs in the biomarkers.
# Computed from the data instead of being hard-coded, so the figure cannot go stale.
# The biomarker pairs are de-duplicated first so the inner merge cannot multiply rows.
pair_cols = ['targetId', 'diseaseId']
a_df_option_a = a_df.merge(b_df[pair_cols].drop_duplicates(), on=pair_cols, how='inner')

# Option B: associations that involve any biomarker target OR any biomarker disease.
a_df_option_b = a_df[
    (a_df['targetId'].isin(cancer_targets)) |
    (a_df['diseaseId'].isin(cancer_diseases))
]

# The "~" prefix is kept so the printed text matches the previously recorded output.
print(f"\nOption A (exact pairs): ~{len(a_df_option_a):,} associations")
print(f"Option B (all targets OR diseases): {len(a_df_option_b):,} associations")

Unique targets in biomarkers: 184
Unique diseases in biomarkers: 77

Option A (exact pairs): ~469 associations
Option B (all targets OR diseases): 437,267 associations


In [46]:
# Option C: keep associations whose target AND disease both occur somewhere in the
# biomarkers frame (not necessarily together as a pair).
cancer_targets = set(b_df['targetId'].unique())
cancer_diseases = set(b_df['diseaseId'].unique())

target_in_biomarkers = a_df['targetId'].isin(cancer_targets)
disease_in_biomarkers = a_df['diseaseId'].isin(cancer_diseases)
a_df_option_c = a_df[target_in_biomarkers & disease_in_biomarkers]

print(f"Option C (cancer targets AND cancer diseases): {len(a_df_option_c):,} associations")

Option C (cancer targets AND cancer diseases): 9,428 associations


After inspecting the data and reviewing the schemas, it makes sense to go with Option A: filter the associations down to the 469 target-disease pairs that also appear in the biomarkers dataset.