In [None]:
from src.datasets.download_cached_files import download_zenodo_files

# Run the project's download helper before any data wrapper is used below.
# NOTE(review): presumably fetches the cached dataset files from Zenodo —
# confirm in src/datasets/download_cached_files.py.
download_zenodo_files()

In [None]:
from src.dataloader.data_wrapper import (
    RealClinVar, OligogenicDataWrapper, MAVEDataWrapper,
    GWASDataWrapper, ClinVarDataWrapper, GeneKoDataWrapper,
    CellPassportDataWrapper, eQTLDataWrapper, sQTLDataWrapper
)

# Settings shared by every dataset wrapper exercised in this cell.
NUM_RECORDS = 1000
ALL_RECORDS = False
SEQ_LEN = 20

# Wrappers are exercised in this fixed order. Each one is constructed with the
# shared settings, asked for its data, and the result is printed.
wrapper_classes = (
    RealClinVar,
    OligogenicDataWrapper,
    ClinVarDataWrapper,
    GeneKoDataWrapper,
    CellPassportDataWrapper,
    eQTLDataWrapper,
    sQTLDataWrapper,
    MAVEDataWrapper,
)

for wrapper_cls in wrapper_classes:
    data_loader = wrapper_cls(num_records=NUM_RECORDS, all_records=ALL_RECORDS)
    data = data_loader.get_data(Seq_length=SEQ_LEN)
    print(data)

In [None]:
# Exploring the data

from src.dataloader.data_wrapper import (
    RealClinVar, OligogenicDataWrapper, MAVEDataWrapper,
    ClinVarDataWrapper, GeneKoDataWrapper, CellPassportDataWrapper,
    eQTLDataWrapper, sQTLDataWrapper
)

SEQ_LEN = 20
SAMPLES = 3

def show(name, loader_fn, samples=None):
    """Load one dataset and print its size plus the first few records.

    Args:
        name: Label printed in front of every message for this dataset.
        loader_fn: Zero-argument callable that returns the loaded data.
        samples: How many leading records to print. ``None`` (the default)
            falls back to the module-level ``SAMPLES``.

    Any exception raised while loading or printing is caught and reported as
    a single ``ERROR`` line, so one failing dataset does not stop the others.
    """
    limit = SAMPLES if samples is None else samples
    try:
        data = loader_fn()
        print(f"{name}: {len(data)} records")
        # NOTE(review): assumes the loaded object slices to a sequence of
        # records — confirm against the wrappers' get_data() return type.
        for row in data[:limit]:
            print("  ", row)
    except Exception as e:
        # Include the exception type: str(e) alone can be empty or ambiguous
        # (e.g. a KeyError prints only the missing key).
        print(f"{name}: ERROR {type(e).__name__}: {e}")

# (label, wrapper class, record cap) for each dataset preview, in display order.
preview_specs = [
    ("RealClinVar", RealClinVar, 200),
    ("Oligogenic", OligogenicDataWrapper, 200),
    ("ClinVar", ClinVarDataWrapper, 200),
    ("GeneKo", GeneKoDataWrapper, 200),
    ("CellPassport", CellPassportDataWrapper, 50),
    ("eQTL", eQTLDataWrapper, 50),
    ("sQTL", sQTLDataWrapper, 50),
    ("MAVE", MAVEDataWrapper, 50),
]

for label, wrapper_cls, record_cap in preview_specs:
    # Loop values are bound as lambda defaults; the wrapper itself is only
    # constructed when show() invokes the callable.
    show(
        label,
        lambda cls=wrapper_cls, cap=record_cap: cls(
            num_records=cap, all_records=False
        ).get_data(Seq_length=SEQ_LEN),
    )


In [2]:
# Pinned to the release recorded in this cell's output so that a fresh kernel
# installs the same version every time.
%pip install requests==2.32.5

Collecting requestsNote: you may need to restart the kernel to use updated packages.

  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.4-cp310-cp310-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.6.3-py3-none-any.whl.metadata (6.9 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2026.1.4-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Using cached charset_normalizer-3.4.4-cp310-cp310-win_amd64.whl (107 kB)
Using cached idna-3.11-py3-none-any.whl (71 kB)
Using cached urllib3-2.6.3-py3-none-any.whl (131 kB)
Using cached certifi-2026.1.4-py3-none-any.whl (152 kB)
Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests

   ------------------

In [4]:
# Pinned to the release recorded in this cell's output so that a fresh kernel
# installs the same version every time.
%pip install pyfaidx==0.9.0.3

Collecting pyfaidx
  Using cached pyfaidx-0.9.0.3-py3-none-any.whl.metadata (25 kB)
Using cached pyfaidx-0.9.0.3-py3-none-any.whl (29 kB)
Installing collected packages: pyfaidx
Successfully installed pyfaidx-0.9.0.3
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os, requests, gzip, shutil
import pyfaidx

# Fetch the hg38 reference FASTA from UCSC, decompress it, and build a FASTA
# index with pyfaidx. The download/decompress step is skipped when the
# decompressed file is already present.
# HTTPS is used so the multi-hundred-megabyte download is not fetched over plain HTTP.
url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz"
os.makedirs("root/data", exist_ok=True)

gz_path = "root/data/hg38.fa.gz"
fa_path = "root/data/hg38.fa"

if not os.path.exists(fa_path):
    # Both stages write to a ".part" file and rename it only on success, so an
    # interrupted run never leaves a truncated file under the final name
    # (which a later run would otherwise mistake for a complete one).
    gz_tmp = gz_path + ".part"
    fa_tmp = fa_path + ".part"

    print("Downloading hg38...")
    # (connect, read) timeouts: without them a stalled connection blocks forever.
    with requests.get(url, stream=True, timeout=(10, 120)) as r:
        r.raise_for_status()
        with open(gz_tmp, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    os.replace(gz_tmp, gz_path)

    print("Decompressing...")
    with gzip.open(gz_path, "rb") as f_in, open(fa_tmp, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(fa_tmp, fa_path)

print("Indexing...")
# Constructing Faidx on the FASTA path is the indexing step reported above.
pyfaidx.Faidx(fa_path)
print("Done:", fa_path)


Indexing...
Done: root/data/hg38.fa
