Exploration notebook of the latest

In [11]:
import polars as pl
from pathlib import Path
from datetime import datetime

# Configure Polars to show strings up to 500 characters
pl.Config.set_fmt_str_lengths(500)  # Show up to 500 characters
pl.Config.set_tbl_width_chars(1000)  # Increase table width
pl.Config.set_tbl_cols(-1)  # Show all columns


polars.config.Config

## Find Latest Test Output
 

In [2]:
# Find the latest test output directory
output_dir = Path("/home/antonkulaga/sources/cell2sentence4longevity/data/output")
test_dirs = sorted([d for d in output_dir.glob("test_*") if d.is_dir()])
latest_test = test_dirs[-1] if test_dirs else None

print(f"Latest test directory: {latest_test}")
print(f"\nAvailable subdirectories:")
if latest_test:
    for subdir in latest_test.iterdir():
        if subdir.is_dir():
            num_files = len(list(subdir.glob("*.parquet")))
            print(f"  {subdir.name}: {num_files} parquet files")


Latest test directory: /home/antonkulaga/sources/cell2sentence4longevity/data/output/test_20251111_005902

Available subdirectories:
  test: 7 parquet files
  train: 121 parquet files


## Load Train Data (Lazy

In [3]:
# Use scan_parquet to efficiently load all train chunks
train_dir = latest_test / "train"
train_lf = pl.scan_parquet(train_dir / "*.parquet")

print("Train Data Schema:")
print(train_lf.collect_schema())


Train Data Schema:
Schema({'reference_genome': Categorical, 'gene_annotation_version': Categorical, 'alignment_software': Categorical, 'intronic_reads_counted': Categorical, 'library_id': Categorical, 'assay_ontology_term_id': Categorical, 'sequenced_fragment': Categorical, 'cell_number_loaded': Categorical, 'institute': Categorical, 'is_primary_data': Boolean, 'cell_type_ontology_term_id': Categorical, 'author_cell_type': Categorical, 'sample_id': Categorical, 'sample_preservation_method': Categorical, 'tissue_ontology_term_id': Categorical, 'development_stage_ontology_term_id': Categorical, 'sample_collection_method': Categorical, 'donor_BMI_at_collection': Float64, 'tissue_type': Categorical, 'suspension_derivation_process': Categorical, 'suspension_enriched_cell_types': Categorical, 'cell_viability_percentage': Float64, 'suspension_uuid': Categorical, 'suspension_type': Categorical, 'donor_id': Categorical, 'self_reported_ethnicity_ontology_term_id': Categorical, 'donor_living_at_s

In [4]:
# Get metadata without loading all data into memory
print("\n=== Train Data Metadata ===")
print(f"\nTotal rows: {train_lf.select(pl.len()).collect().item()}")
print(f"Number of columns: {len(train_lf.collect_schema())}")



=== Train Data Metadata ===

Total rows: 1202342
Number of columns: 50


In [5]:
# Show column details
print("\nColumn Information:")
schema = train_lf.collect_schema()
for col_name, dtype in schema.items():
    print(f"  - {col_name}: {dtype}")



Column Information:
  - reference_genome: Categorical
  - gene_annotation_version: Categorical
  - alignment_software: Categorical
  - intronic_reads_counted: Categorical
  - library_id: Categorical
  - assay_ontology_term_id: Categorical
  - sequenced_fragment: Categorical
  - cell_number_loaded: Categorical
  - institute: Categorical
  - is_primary_data: Boolean
  - cell_type_ontology_term_id: Categorical
  - author_cell_type: Categorical
  - sample_id: Categorical
  - sample_preservation_method: Categorical
  - tissue_ontology_term_id: Categorical
  - development_stage_ontology_term_id: Categorical
  - sample_collection_method: Categorical
  - donor_BMI_at_collection: Float64
  - tissue_type: Categorical
  - suspension_derivation_process: Categorical
  - suspension_enriched_cell_types: Categorical
  - cell_viability_percentage: Float64
  - suspension_uuid: Categorical
  - suspension_type: Categorical
  - donor_id: Categorical
  - self_reported_ethnicity_ontology_term_id: Categorica

In [12]:
# Show first few rows
print("\nFirst 5 rows of train data:")
train_lf.head(5).collect()



First 5 rows of train data:


reference_genome,gene_annotation_version,alignment_software,intronic_reads_counted,library_id,assay_ontology_term_id,sequenced_fragment,cell_number_loaded,institute,is_primary_data,cell_type_ontology_term_id,author_cell_type,sample_id,sample_preservation_method,tissue_ontology_term_id,development_stage_ontology_term_id,sample_collection_method,donor_BMI_at_collection,tissue_type,suspension_derivation_process,suspension_enriched_cell_types,cell_viability_percentage,suspension_uuid,suspension_type,donor_id,self_reported_ethnicity_ontology_term_id,donor_living_at_sample_collection,disease_ontology_term_id,sex_ontology_term_id,nCount_RNA,nFeature_RNA,pMito,NODG,nUMI,Country,Annotation_Level1,Annotation_Level2,Annotation_Level3,Annotation_Level4,Smoking Status,cell_type,assay,disease,sex,tissue,self_reported_ethnicity,development_stage,observation_joinid,cell_sentence,age
cat,cat,cat,cat,cat,cat,cat,cat,cat,bool,cat,cat,cat,cat,cat,cat,cat,f64,cat,cat,cat,f64,cat,cat,cat,cat,cat,cat,cat,f64,i32,f64,i32,i32,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,str,str,i64
"""GRCh38""","""v98""","""DRAGEN RNA v3.8.4""","""no""","""d43d2448-bf59-4267-a458-1ceb83096c67""","""EFO:0009900""","""5 prime tag""","""40000 cells""","""Riken""",True,"""CL:0000904""","""CD4+_T_cm""","""1be8a203-bb01-4b29-8273-66461567644d""","""other""","""UBERON:0000178""","""HsapDv:0000131""","""blood draw""",23.2,"""tissue""","""density gradient centrifugation""","""peripheral blood mononuclear cell""",96.0,"""290a5efa-30e1-4749-add0-35a49d3659f7""","""cell""","""JP_RIK_H114""","""HANCESTRO:0019""","""True""","""PATO:0000461""","""PATO:0000383""",6953.0,2312,0.02632,2312,6953,"""JP""","""T""","""CD4+_T""","""CD4+_T_cm""","""CD4+_T_cm""","""1""","""central memory CD4-positive, alpha-beta T cell""","""10x 5' v2""","""normal""","""female""","""blood""","""Japanese""","""37-year-old stage""","""v-w>{Qh~}1""","""RPLP1 B2M MALAT1 RPL28 RPL41 RPL13 RPL10 RPS27 EEF1A1 RPS12 RPL19 RPS27A RPL11 RPS15A TMSB10 RPL18A RPL30 S100A4 RPL32 RPS18 RPL34 RPS4X RPS23 MT-CO1 RPS19 ACTB RPS28 TMSB4X RPS8 HLA-B RPL29 RPL18 RPS15 RPL37 RPS14 RPLP0 RPL8 RPS3A RPL26 FAU RPS21 TPT1 RPL3 MT-CO2 RPS13 RPS3 RPL12 RPS2 RPLP2 RPS25 RPL6 MT-CO3 RPL17 RPL36 HLA-C RPS6 RPL7A RPL14 LTB RPL23A RPS24 RPL39 RPS5 HLA-A RPS7 RPL35A TRBV5-6 RPSA VIM PFN1 IL7R RACK1 RPL22 KLF2 PPIA PTMA RPL35 NACA MT-ND4L RPS16 RPL4 RPL10A CD52 RPS29 IL32 E…",37
"""GRCh38""","""v98""","""DRAGEN RNA v3.8.4""","""no""","""5491dd9a-e8f1-4788-8495-3deaf7b3fa4d""","""EFO:0009900""","""5 prime tag""","""40000 cells""","""Samsung Genome Institute""",True,"""CL:0000939""","""CD16+_NK""","""1e8c5574-340b-4b57-ad22-5eb768024adb""","""other""","""UBERON:0000178""","""HsapDv:0000142""","""blood draw""",,"""tissue""","""density gradient centrifugation""","""peripheral blood mononuclear cell""",96.8,"""8c10b066-3f9a-46b1-b412-84189f5fa646""","""cell""","""KR_SGI_H033""","""HANCESTRO:0022""","""True""","""PATO:0000461""","""PATO:0000383""",4794.0,2117,0.041927,2117,4794,"""KR""","""NK""","""CD16+_NK""","""CD16+_NK""","""CD16+_NK_FCER1Ghi_KLRC2lo""","""0""","""CD16-positive, CD56-dim natural killer cell, human""","""10x 5' v2""","""normal""","""female""","""blood""","""Korean""","""48-year-old stage""","""a9BU%y_Jg5""","""MALAT1 B2M MT-CO1 HLA-B ACTB NKG7 MT-CO2 HLA-A TMSB4X GNLY RPLP1 TMSB10 HLA-C MT-CO3 RPL10 TPT1 CCL5 RPS27 EEF1A1 CTSW RPS12 ZFP36 PFN1 FOS RPL13 RPS27A RPS6 RPS3A RPS15A PTMA KLF2 RPS18 RPL28 RPL23A RPS14 RPS3 RPS25 MT-ATP6 JUN FLNA RPL18 RPL7A RPL37 SH3BGRL3 RPL18A RPLP2 EIF1 MT-CYB RPL34 RPS19 MT-ND5 DDX5 IFITM1 GZMB FTH1 RPL19 JUNB RPS28 RPS4X CYBA RPS2 IL32 CFL1 FAU RPS21 MYH9 FGFBP2 RAC2 RPL13A RPS8 FTL UBC RPSA RPL29 RPL32 RPL27 RPL11 EEF1B2 RPL14 RPS5 BTG1 RPS15 RUNX3 IER2 CYFIP2 RPL17 T…",48
"""GRCh38""","""v98""","""DRAGEN RNA v4""","""yes""","""f22affcb-67e9-458e-81e8-4f7c6fc2bcf1""","""EFO:0009900""","""5 prime tag""","""40000 cells""","""Genome Institute of Singapore""",True,"""CL:0000904""","""CD4+_T_cm""","""d8e5835f-da4d-4ae9-a748-bf9d853a836e""","""other""","""UBERON:0000178""","""HsapDv:0000117""","""blood draw""",30.0,"""tissue""","""leukapheresis""","""peripheral blood mononuclear cell""",67.0,"""ffbdc85b-f97c-4793-bf57-795f560b2ec6""","""cell""","""LONZA3038099""","""unknown""","""True""","""PATO:0000461""","""PATO:0000384""",7741.0,2759,0.039142,2759,7741,"""SG""","""T""","""CD4+_T""","""CD4+_T_cm""","""CD4+_T_cm""","""0""","""central memory CD4-positive, alpha-beta T cell""","""10x 5' v2""","""normal""","""male""","""blood""","""unknown""","""23-year-old stage""","""p46&6>BNkm""","""MALAT1 RPLP1 RPS12 RPL10 EEF1A1 RPL13 RPS18 TPT1 RPS27A RPL32 TMSB4X RPS8 MT-CO1 B2M RPS27 ACTB MT-CYB RPS3 RPL19 RPS6 RPS23 RPL11 RPS14 RPS19 RPL41 HLA-B RPS13 RPS15A RPL3 RPS3A RPS7 MT-CO3 RPL28 RPS25 MT-CO2 RPL29 RPS2 RPS24 RPL37 RPL7A RPL18A RPS4X RPL6 RPL8 RPL30 RPL23A RPL14 IL7R RPL4 RPS15 RPL34 RPL10A RPS21 RPL26 MT-ATP6 RPS28 RPSA RPL18 MT-ND5 FAU RPL17 EIF1 RPL37A RPS5 PTMA RPL35A EEF2 RPS9 LTB HLA-C RPL12 RPLP0 RPS29 RPL15 RPL13A RPL9 MT-ND4L RPL36 RPL5 EEF1B2 RPLP2 TMSB10 JUN HLA-A RA…",23
"""GRCh38""","""v98""","""DRAGEN RNA v3.8.4""","""no""","""5b6184ee-60b3-4f4e-ad21-c8424f7a4a8b""","""EFO:0009900""","""5 prime tag""","""40000 cells""","""Samsung Genome Institute""",True,"""CL:0000785""","""atypical_B""","""734c5d77-a70d-45b3-84e3-f7d479371499""","""other""","""UBERON:0000178""","""HsapDv:0000132""","""blood draw""",,"""tissue""","""density gradient centrifugation""","""peripheral blood mononuclear cell""",97.5,"""137a3537-0baf-4162-ac55-bef90b9f8824""","""cell""","""KR_SGI_H067""","""HANCESTRO:0022""","""True""","""PATO:0000461""","""PATO:0000383""",4203.0,1448,0.033547,1448,4203,"""KR""","""B""","""atypical_B""","""atypical_B""","""atypical_B_FCRL3hi""","""0""","""mature B cell""","""10x 5' v2""","""normal""","""female""","""blood""","""Korean""","""38-year-old stage""","""4kp;Y&NE~c""","""CD74 RPS27 MALAT1 RPS8 EEF1A1 RPL41 RPL32 RPL10 RPL13 B2M RPS18 RPLP1 RPL30 RPL34 RPS15A RPS2 MT-CO1 RPL28 RPL12 RPS14 RPS6 RPS12 RPS4X RPS23 HLA-DRA MT-CO2 RPL11 RPS27A RPL37 RPS19 RPS15 JUNB RPS3A RPL18 RPS7 FAU TPT1 RPS3 RPL8 RPS25 ACTB RPL7A RPL26 RPL19 HLA-B RPL18A RPL35A RPL29 RPL39 RPL23A RPS28 CD37 PTMA HLA-DRB1 RPS13 BTG1 NACA IGKV1-39 RPLP2 RPS29 RPL15 RPL6 HLA-A CD52 RPS24 RPS21 FOS MT-CYB DUSP1 TMSB10 HLA-C MT-CO3 MT-ND3 RPS9 EIF1 RPL36 RPL9 RPS5 RPSA RPL5 RPL17 HLA-DQB1 IGKV1D-39 KL…",38
"""GRCh38""","""v98""","""DRAGEN RNA v4""","""yes""","""de842ea6-cb26-47c3-8957-5e58c178c2a1""","""EFO:0009900""","""5 prime tag""","""40000 cells""","""Genome Institute of Singapore""",True,"""CL:0000624""","""CD4+_T_unknown""","""ad14f15d-7e83-45c3-b644-f9042573e654""","""other""","""UBERON:0000178""","""HsapDv:0000152""","""blood draw""",30.1,"""tissue""","""density gradient centrifugation""","""peripheral blood mononuclear cell""",90.0,"""f5690b74-4a38-47ec-b9df-6c530a58340d""","""cell""","""SG_HEL_H353""","""HANCESTRO:0598""","""True""","""PATO:0000461""","""PATO:0000384""",6736.0,2218,0.038895,2218,6736,"""SG""","""T""","""CD4+_T""","""CD4+_T""","""CD4+_T_memory""","""1""","""CD4-positive, alpha-beta T cell""","""10x 5' v2""","""normal""","""male""","""blood""","""Singaporean Indian""","""58-year-old stage""","""gR+OeT|N5K""","""MALAT1 B2M EEF1A1 RPS12 JUNB RPL10 FOS RPL13 RPL41 RPL30 RPL19 RPS27 RPL32 MT-CO1 RPL28 RPS18 RPL11 RPS15A RPS3 RPS23 RPS19 RPS4X RPLP1 RPS14 RPL34 HLA-B RPL6 RPL18A RPS27A JUN RPS3A TPT1 RPS25 TMSB4X MT-CYB MT-CO2 RPL3 RPS8 RPS28 MT-ATP6 RPS13 ACTB TMSB10 RPS2 RPS6 RPLP0 RPL18 RPL29 VIM HLA-C RPL26 RPL7A RPL36 RPL14 TSC22D3 RPL37 RPS5 RPL17 ZFP36 RPS7 RPL12 RPL8 RPL35A ZFP36L2 DUSP1 RPS21 RPL22 RPS24 RPL39 RPS9 MT-CO3 RPL23A RPS26 HLA-A RPL5 RPS15 KLF2 RPLP2 RPL9 PTMA FAU PFN1 RPL10A RPS29 MT-N…",58


In [None]:
# Check for null values
print("\nNull counts per column:")
train_lf.select([pl.col(col).null_count().alias(col) for col in schema.keys()]).collect()


## Load Test Data (Lazy)with

In [None]:
# Use scan_parquet to efficiently load all test chunks
test_dir = latest_test / "test"
test_lf = pl.scan_parquet(test_dir / "*.parquet")

print("Test Data Schema:")
print(test_lf.collect_schema())


In [None]:
# Get metadata without loading all data into memory
print("\n=== Test Data Metadata ===")
print(f"\nTotal rows: {test_lf.select(pl.len()).collect().item()}")
print(f"Number of columns: {len(test_lf.collect_schema())}")


In [None]:
# Show column details
print("\nColumn Information:")
schema = test_lf.collect_schema()
for col_name, dtype in schema.items():
    print(f"  - {col_name}: {dtype}")


In [None]:
# Get basic statistics for numeric columns
print("\nBasic Statistics:")
test_lf.describe()


In [None]:
# Show first few rows
print("\nFirst 5 rows of test data:")
test_lf.head(5).collect()


In [None]:
# Check for null values
print("\nNull counts per column:")
test_lf.select([pl.col(col).null_count().alias(col) for col in schema.keys()]).collect()


In [None]:
print("=== Train vs Test Comparison ===")
train_count = train_lf.select(pl.len()).collect().item()
test_count = test_lf.select(pl.len()).collect().item()
total_count = train_count + test_count

print(f"\nTrain rows: {train_count:,} ({train_count/total_count*100:.2f}%)")
print(f"Test rows: {test_count:,} ({test_count/total_count*100:.2f}%)")
print(f"Total rows: {total_count:,}")


## Explore Specific Columns (if they exist)


In [None]:
# Check if common metadata columns exist
train_schema = train_lf.collect_schema()
metadata_cols = ["cell_id", "age", "tissue", "cell_type", "dataset_id", "organism"]
available_metadata = [col for col in metadata_cols if col in train_schema]

print("Available metadata columns:")
print(available_metadata)


In [None]:
# Show value counts for categorical columns (train)
if available_metadata:
    print("\n=== Train Data - Value Counts ===")
    for col in available_metadata[:5]:  # Limit to first 5 metadata columns
        print(f"\n{col}:")
        value_counts = train_lf.select(pl.col(col)).collect().get_column(col).value_counts()
        print(value_counts.head(10))


In [None]:
# Show value counts for categorical columns (test)
if available_metadata:
    print("\n=== Test Data - Value Counts ===")
    for col in available_metadata[:5]:  # Limit to first 5 metadata columns
        print(f"\n{col}:")
        value_counts = test_lf.select(pl.col(col)).collect().get_column(col).value_counts()
        print(value_counts.head(10))


## Validate Top Genes for Specific Cell


In [24]:
import anndata as ad
import numpy as np
import pickle

# Load h5ad in backed mode (memory efficient)
h5ad_path = Path("/home/antonkulaga/sources/cell2sentence4longevity/data/input/test_dataset.h5ad")
adata = ad.read_h5ad(h5ad_path, backed='r')

print(f"Loaded h5ad with {adata.n_obs} cells and {adata.n_vars} genes")

# Find the cell index for the specific observation_joinid
target_cell_id = "v-w>{Qh~}1"
print(f"Looking for cell ID: '{target_cell_id}'")


Loaded h5ad with 1265624 cells and 35477 genes
Looking for cell ID: 'v-w>{Qh~}1'


In [25]:
# Try to find cell by observation_joinid
try:
    cell_idx = adata.obs_names.get_loc(target_cell_id)
    print(f"Found cell at index: {cell_idx}")
except KeyError:
    print(f"Cell ID '{target_cell_id}' not found in obs_names, checking observation_joinid column...")
    if 'observation_joinid' in adata.obs.columns:
        mask = adata.obs['observation_joinid'] == target_cell_id
        if mask.sum() > 0:
            # Get the integer position, not the label
            cell_idx = mask.to_numpy().argmax()
            print(f"Found cell at index: {cell_idx}")
        else:
            print(f"Cell ID '{target_cell_id}' not found in observation_joinid column")
            cell_idx = None
    else:
        print("observation_joinid column not found")
        cell_idx = None


Cell ID 'v-w>{Qh~}1' not found in obs_names, checking observation_joinid column...
Found cell at index: 785790


In [26]:
if cell_idx is not None:
    # Load gene mappers
    mappers_path = Path("/home/antonkulaga/sources/cell2sentence4longevity/data/input/mappers.pkl")
    
    if mappers_path.exists():
        with open(mappers_path, 'rb') as f:
            mappers = pickle.load(f)
        ensembl_to_symbol = mappers['ensembl_to_symbol']
        print(f"Loaded {len(ensembl_to_symbol)} gene mappings")
    else:
        print("Mappers file not found, will use feature_name from h5ad")
        ensembl_to_symbol = {}
    
    # Map all genes to symbols (memory efficient)
    print("Mapping genes to symbols...")
    gene_symbols = []
    for i, ens_id in enumerate(adata.var_names):
        if ens_id in ensembl_to_symbol:
            gene_symbols.append(ensembl_to_symbol[ens_id])
        elif 'feature_name' in adata.var.columns:
            gene_symbols.append(adata.var['feature_name'].iloc[i])
        else:
            gene_symbols.append(ens_id)
    
    gene_symbols = np.array(gene_symbols, dtype=object)
    print(f"Mapped {len(gene_symbols)} genes")


Mappers file not found, will use feature_name from h5ad
Mapping genes to symbols...
Mapped 35477 genes


In [27]:
if cell_idx is not None:
    # Get expression values for the target cell (only loads one row)
    cell_expr = adata.X[cell_idx].toarray().flatten()
    
    print(f"\nExpression vector shape: {cell_expr.shape}")
    print(f"Non-zero genes: {np.count_nonzero(cell_expr)}")
    
    # Get top 3 genes
    top_3_indices = np.argsort(cell_expr)[-3:][::-1]
    top_3_genes = gene_symbols[top_3_indices]
    top_3_values = cell_expr[top_3_indices]
    
    print(f"\n=== Top 3 Expressed Genes ===")
    for i, (gene, value) in enumerate(zip(top_3_genes, top_3_values), 1):
        print(f"{i}. {gene}: {value:.4f}")
    
    # Validate against expected
    expected_genes = ["RPLP1", "B2M", "MALAT1"]
    print(f"\n=== Validation ===")
    print(f"Expected top 3: {expected_genes}")
    print(f"Actual top 3: {list(top_3_genes)}")
    
    if list(top_3_genes) == expected_genes:
        print("✅ VALIDATION PASSED: Top 3 genes match!")
    else:
        print("❌ VALIDATION FAILED: Top 3 genes do not match")
        
    # Close the file
    adata.file.close()



Expression vector shape: (35477,)
Non-zero genes: 2310

=== Top 3 Expressed Genes ===
1. RPLP1: 5.1400
2. B2M: 4.9453
3. MALAT1: 4.8142

=== Validation ===
Expected top 3: ['RPLP1', 'B2M', 'MALAT1']
Actual top 3: ['RPLP1', 'B2M', 'MALAT1']
✅ VALIDATION PASSED: Top 3 genes match!


## Explore Column Names in Detail


In [None]:
# Identify gene expression columns (typically start with a specific pattern)

all_cols = list(train_schema.keys())

print(f"\nTotal columns: {len(all_cols)}")

print(f"\nFirst 20 column names:")

for col in all_cols[:20]:
    print(f"  {col}")


In [17]:
# Identify gene expression columns (typically start with a specific pattern)
all_cols = list(train_schema.keys())
print(f"\nTotal columns: {len(all_cols)}")
print(f"\nFirst 20 column names:")
for col in all_cols[:20]:
    print(f"  {col}")


NameError: name 'train_schema' is not defined

In [23]:
# Count non-zero values for first few gene columns (if they exist)
gene_cols = [col for col in all_cols if col not in metadata_cols][:10]

if gene_cols:
    print("\nNon-zero counts for sample gene columns (train):")
    for col in gene_cols:
        non_zero = train_lf.select((pl.col(col) != 0).sum()).collect().item()
        total = train_lf.select(pl.len()).collect().item()
        print(f"  {col}: {non_zero}/{total} ({non_zero/total*100:.2f}% non-zero)")


NameError: name 'all_cols' is not defined

## File Size Analysis


In [None]:
import os

def get_dir_size(path):
    total = 0
    for entry in Path(path).rglob('*.parquet'):
        if entry.is_file():
            total += entry.stat().st_size
    return total

train_size = get_dir_size(train_dir)
test_size = get_dir_size(test_dir)

print("=== Storage Information ===")
print(f"\nTrain data size: {train_size / 1024**2:.2f} MB ({train_size / 1024**3:.2f} GB)")
print(f"Test data size: {test_size / 1024**2:.2f} MB ({test_size / 1024**3:.2f} GB)")
print(f"Total size: {(train_size + test_size) / 1024**2:.2f} MB ({(train_size + test_size) / 1024**3:.2f} GB)")

print(f"\nNumber of chunks:")
print(f"  Train: {len(list(train_dir.glob('*.parquet')))} files")
print(f"  Test: {len(list(test_dir.glob('*.parquet')))} files")
