In [1]:
import hail as hl
import pandas as pd
# Parameters
log_path = '../logs/hail'
file_in = '../data/multisample_espanha.vcf.gz'
file_out = '../data/multisample_espanha.mt'
reference_genome = 'GRCh37' #'GRCh38'


In [2]:
hl.init(app_name = 'gnomAD', log = log_path)

25/11/04 08:29:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.5.7
SparkUI available at http://pimenta:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.136-c32f88309ab0
LOGGING: writing to ../logs/hail


In [4]:
# Import and write matrix table
mt = hl.import_vcf(file_in, force_bgz=True, reference_genome=reference_genome)
mt.write(file_out, overwrite=True)

2025-11-04 08:30:12.094 Hail: INFO: scanning VCF for sortedness...
2025-11-04 08:30:16.663 Hail: INFO: Coerced sorted VCF - no additional import work to do
2025-11-04 08:30:36.566 Hail: INFO: wrote matrix table with 708424 rows and 267 columns in 2 partitions to ../data/multisample_espanha.mt


In [29]:
mt = hl.read_matrix_table(file_out)

In [30]:
# Here need to be changed in the real dataset. The current dataset is just a mockup.
# For example, adding annotations from gnomAD, filtering samples, etc.
# # Calculate basic allele statistics (similar to gnomAD)
# mt = hl.variant_qc(mt)
print("Matrix table schema:")
mt.describe()

Matrix table schema:
----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        DP: int32, 
        DS: bool, 
        Dels: float64, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        RPA: array<int32>, 
        RU: str, 
        ReadPosRankSum: float64, 
        STR: bool
    }
----------------------------------------
Entry fields:
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'GT': call
    'PL': a

In [31]:
# Get available info fields
info_fields = list(mt.info.dtype.fields)
print(f"\n=== Available INFO fields ===")
print(info_fields)


=== Available INFO fields ===
['AC', 'AF', 'AN', 'BaseQRankSum', 'DP', 'DS', 'Dels', 'FS', 'HaplotypeScore', 'InbreedingCoeff', 'MLEAC', 'MLEAF', 'MQ', 'MQ0', 'MQRankSum', 'QD', 'RPA', 'RU', 'ReadPosRankSum', 'STR']


In [32]:
# Extract gnomAD statistics from info field
variants = mt.rows()

In [33]:
def get_info_field(field_name, default_value, is_array=False, index=0):
    """
    Safely extract info field, handling missing fields and missing arrays
    by coalescing to a default value.
    """
    if field_name not in info_fields:
        return hl.literal(default_value)  # Field doesn't exist in header
    
    field = variants.info[field_name]
    
    if is_array:
        # 1. Check if field is defined.
        # 2. If yes, check if length is > index.
        # 3. If yes, get element.
        # 4. If no to 1 or 2, return default.
        return hl.if_else(
            hl.is_defined(field) & (hl.len(field) > index),
            field[index],
            hl.literal(default_value)
        )
    else:
        # Coalesce returns the field if it's not missing,
        # otherwise it returns the default value.
        return hl.coalesce(field, hl.literal(default_value))

In [35]:
# Build selection dict dynamically
selection = {
    # Locus information (always present)
    'chrom': variants.locus.contig,
    'pos': variants.locus.position,
    'ref': variants.alleles[0],
    'alt': hl.delimit(variants.alleles[1:], ','),
    'rsid': hl.coalesce(variants.rsid, hl.missing(hl.tstr)), # rsid can also be missing
    'qual': variants.qual,
    'filters': hl.delimit(variants.filters, ','),
}

In [36]:
# gnomAD-specific fields (may not be present)
gnomad_fields = {
    'grpmax': ('grpmax', '', True),
    'faf95_max': ('fafmax_faf95_max', 0.0, True),
    'faf95_max_gen_anc': ('fafmax_faf95_max_gen_anc', '', True),
    'nhomalt': ('nhomalt', 0, True),
    
    # Sex-specific
    'AC_XX': ('AC_XX', 0, True),
    'AN_XX': ('AN_XX', 0, False),
    'AF_XX': ('AF_XX', 0.0, True),
    'nhomalt_XX': ('nhomalt_XX', 0, True),
    'AC_XY': ('AC_XY', 0, True),
    'AN_XY': ('AN_XY', 0, False),
    'AF_XY': ('AF_XY', 0.0, True),
    'nhomalt_XY': ('nhomalt_XY', 0, True),
}
for new_name, (field, default, is_array) in gnomad_fields.items():
    selection[new_name] = get_info_field(field, default, is_array)

In [37]:
# Population-specific fields
populations = ['afr', 'amr', 'asj', 'eas', 'fin', 'nfe', 'sas', 'mid']
for pop in populations:
    gnomad_fields[f'AC_{pop}'] = (f'AC_{pop}', 0, True)
    gnomad_fields[f'AF_{pop}'] = (f'AF_{pop}', 0.0, True)
    gnomad_fields[f'AN_{pop}'] = (f'AN_{pop}', 0, False)


In [38]:
# Prediction scores
prediction_fields = {
    'cadd_phred': ('cadd_phred', hl.float64(0.0), False),
    'cadd_raw_score': ('cadd_raw_score', hl.float64(0.0), False),
    'revel_max': ('revel_max', hl.float64(0.0), False),
    'spliceai_ds_max': ('spliceai_ds_max', hl.float64(0.0), False),
    'phylop': ('phylop', hl.float64(0.0), False),
    'sift_max': ('sift_max', hl.float64(0.0), False),
    'polyphen_max': ('polyphen_max', hl.float64(0.0), False),
}
for new_name, (field, default, is_array) in prediction_fields.items():
    selection[new_name] = get_info_field(field, default, is_array)

In [39]:
# Quality metrics (common in VCFs)
quality_fields = {
    'DP': 0, 'AN': 0, # Integers
    'FS': 0.0, 'MQ': 0.0, 'MQRankSum': 0.0, 'QD': 0.0, 
    'ReadPosRankSum': 0.0, 'SOR': 0.0, 'BaseQRankSum': 0.0, 
    'InbreedingCoeff': 0.0 # Floats
}
for field, default in quality_fields.items():
    selection[field] = get_info_field(field, default, is_array=False)

In [None]:
# # Add gnomAD fields if they exist
# for new_name, (field_name, default, is_array) in gnomad_fields.items():
#     if field_name in info_fields:
#         selection[new_name] = get_info_field(variants, field_name, default, is_array)

# # Add prediction fields if they exist
# for new_name, (field_name, default, is_array) in prediction_fields.items():
#     if field_name in info_fields:
#         selection[new_name] = get_info_field(variants, field_name, default, is_array)


In [40]:
# Quality metrics (common in VCFs)
quality_fields = ['DP', 'FS', 'MQ', 'MQRankSum', 'QD', 'ReadPosRankSum', 'SOR', 
                  'BaseQRankSum', 'InbreedingCoeff']
for field in quality_fields:
    if field in info_fields:
        selection[field] = variants.info[field]



In [41]:
# VEP annotation if present
if 'vep' in info_fields:
    selection['vep'] = variants.info.vep # VEP is complex, handle separately

# Core allele statistics (now using our robust function)
# Note: AN is already handled by quality_fields, but this is fine
selection['AC'] = get_info_field('AC', 0, is_array=True, index=0)
selection['AN'] = get_info_field('AN', 0, is_array=False)
selection['AF'] = get_info_field('AF', 0.0, is_array=True, index=0)

In [43]:
# Apply selection
variants = variants.select(**selection)

# Get field list for display
selected_fields = list(selection.keys())
print(f"\n=== Selected fields ({len(selected_fields)}) ===")
print(selected_fields)


=== Selected fields (38) ===
['chrom', 'pos', 'ref', 'alt', 'rsid', 'qual', 'filters', 'grpmax', 'faf95_max', 'faf95_max_gen_anc', 'nhomalt', 'AC_XX', 'AN_XX', 'AF_XX', 'nhomalt_XX', 'AC_XY', 'AN_XY', 'AF_XY', 'nhomalt_XY', 'cadd_phred', 'cadd_raw_score', 'revel_max', 'spliceai_ds_max', 'phylop', 'sift_max', 'polyphen_max', 'DP', 'AN', 'FS', 'MQ', 'MQRankSum', 'QD', 'ReadPosRankSum', 'SOR', 'BaseQRankSum', 'InbreedingCoeff', 'AC', 'AF']


In [None]:
# # helper function to extract info field
# def first_or_default(expr, default):
#     return hl.if_else(hl.len(expr) > 0, expr[0], default)


In [24]:
# # Select relevant fields from the info struct
# variants = variants.select(
#     # Locus information
#     chrom=variants.locus.contig,
#     pos=variants.locus.position,
#     ref=variants.alleles[0],
#     alt=hl.delimit(variants.alleles[1:], ','),  # Join alt alleles if multiple
#     rsid=variants.rsid,

#     # Basic allele statistics (from gnomAD info field)
#     AC=first_or_default(variants.info.AC, 0),
#     AN=variants.info.AN,
#     AF=first_or_default(variants.info.AF, 0.0),

#     # Genetic ancestry group with max AF
#     grpmax=first_or_default(variants.info.grpmax, ''),

#     # Filtering Allele Frequency (95% confidence)
#     faf95_max=first_or_default(variants.info.fafmax_faf95_max, 0.0),
#     faf95_max_gen_anc=first_or_default(variants.info.fafmax_faf95_max_gen_anc, ''),

#     # Number of homozygotes
#     nhomalt=first_or_default(variants.info.nhomalt, 0),

#     # Sex-specific counts
#     AC_XX=first_or_default(variants.info.AC_XX, 0),
#     AN_XX=variants.info.AN_XX,
#     AF_XX=first_or_default(variants.info.AF_XX, 0.0),
#     nhomalt_XX=first_or_default(variants.info.nhomalt_XX, 0),

#     AC_XY=first_or_default(variants.info.AC_XY, 0),
#     AN_XY=variants.info.AN_XY,
#     AF_XY=first_or_default(variants.info.AF_XY, 0.0),
#     nhomalt_XY=first_or_default(variants.info.nhomalt_XY, 0),

#     # Population-specific (examples)
#     AC_afr=first_or_default(variants.info.AC_afr, 0),
#     AF_afr=first_or_default(variants.info.AF_afr, 0.0),
#     AN_afr=variants.info.AN_afr,

#     AC_amr=first_or_default(variants.info.AC_amr, 0),
#     AF_amr=first_or_default(variants.info.AF_amr, 0.0),
#     AN_amr=variants.info.AN_amr,

#     AC_eas=first_or_default(variants.info.AC_eas, 0),
#     AF_eas=first_or_default(variants.info.AF_eas, 0.0),
#     AN_eas=variants.info.AN_eas,

#     AC_nfe=first_or_default(variants.info.AC_nfe, 0),
#     AF_nfe=first_or_default(variants.info.AF_nfe, 0.0),
#     AN_nfe=variants.info.AN_nfe,

#     AC_sas=first_or_default(variants.info.AC_sas, 0),
#     AF_sas=first_or_default(variants.info.AF_sas, 0.0),
#     AN_sas=variants.info.AN_sas,

#     # Prediction scores
#     cadd_phred=variants.info.cadd_phred,
#     revel_max=variants.info.revel_max,
#     spliceai_ds_max=variants.info.spliceai_ds_max,

#     # Quality metrics
#     qual=variants.qual,
#     filters=hl.delimit(variants.filters, ','),
# )


In [44]:
# Convert to pandas
df = variants.to_pandas()

print("\n=== Dataset Summary ===")
print(f"Total variants: {len(df):,}")
print(f"Chromosomes: {sorted(df['chrom'].unique())}")


# Show basic statistics for available fields
print("\n=== Basic Statistics ===")
if 'AC' in df.columns:
    print(f"AC - Mean: {df['AC'].mean():.2f}, Median: {df['AC'].median():.0f}, Max: {df['AC'].max()}")
if 'AF' in df.columns:
    print(f"AF - Mean: {df['AF'].mean():.6f}, Median: {df['AF'].median():.6f}, Max: {df['AF'].max():.6f}")
if 'AN' in df.columns:
    print(f"AN - Mean: {df['AN'].mean():.2f}, Median: {df['AN'].median():.0f}")
if 'nhomalt' in df.columns:
    print(f"Total Homozygotes: {df['nhomalt'].sum():,}")

print("\nFirst 10 variants:")
print(df.head(10))

# Export to TSV for Streamlit
output_tsv = '../data/variant_stats.tsv'
variants.export(output_tsv)
print(f"\nExported to: {output_tsv}")


[Stage 7:>                                                          (0 + 2) / 2]

FatalError: SparkException: Job aborted due to stage failure: Task 1 in stage 7.0 failed 1 times, most recent failure: Lost task 1.0 in stage 7.0 (TID 15) (pimenta executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 7.0 failed 1 times, most recent failure: Lost task 1.0 in stage 7.0 (TID 15) (pimenta executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2898)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2834)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2833)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2833)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1253)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1253)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3102)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3036)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3025)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:995)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at is.hail.backend.spark.SparkBackend.$anonfun$parallelizeAndComputeWithIndex$4(SparkBackend.scala:344)
	at is.hail.backend.spark.SparkBackend.$anonfun$parallelizeAndComputeWithIndex$4$adapted(SparkBackend.scala:343)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at is.hail.backend.spark.SparkBackend.parallelizeAndComputeWithIndex(SparkBackend.scala:343)
	at is.hail.backend.BackendUtils.collectDArray(BackendUtils.scala:85)
	at __C4691Compile.apply_region242_242(Emit.scala)
	at __C4691Compile.apply_region240_242(Emit.scala)
	at __C4691Compile.apply(Emit.scala)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$5(CompileAndEvaluate.scala:93)
	at scala.runtime.java8.JFunction0$mcJ$sp.apply(JFunction0$mcJ$sp.java:23)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$4(CompileAndEvaluate.scala:93)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$4$adapted(CompileAndEvaluate.scala:91)
	at is.hail.backend.ExecuteContext.$anonfun$scopedExecution$2(ExecuteContext.scala:153)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.backend.ExecuteContext.$anonfun$scopedExecution$1(ExecuteContext.scala:152)
	at is.hail.utils.package$.using(package.scala:682)
	at is.hail.backend.ExecuteContext.scopedExecution(ExecuteContext.scala:151)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$1(CompileAndEvaluate.scala:91)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:56)
	at is.hail.expr.ir.CompileAndEvaluate$.evalToIR(CompileAndEvaluate.scala:37)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.evaluate$1(LowerOrInterpretNonCompilable.scala:31)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.rewrite$1(LowerOrInterpretNonCompilable.scala:60)
	at is.hail.expr.ir.LowerOrInterpretNonCompilable$.apply(LowerOrInterpretNonCompilable.scala:65)
	at is.hail.expr.ir.lowering.LowerOrInterpretNonCompilablePass$.transform(LoweringPass.scala:88)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:39)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:37)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$2(LoweringPipeline.scala:21)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$2$adapted(LoweringPipeline.scala:19)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:19)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:10)
	at is.hail.expr.ir.lowering.EvalRelationalLets$.$anonfun$apply$2(EvalRelationalLets.scala:15)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.lowering.EvalRelationalLets$.execute$1(EvalRelationalLets.scala:14)
	at is.hail.expr.ir.lowering.EvalRelationalLets$.lower$1(EvalRelationalLets.scala:23)
	at is.hail.expr.ir.lowering.EvalRelationalLets$.$anonfun$apply$1(EvalRelationalLets.scala:37)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.lowering.EvalRelationalLets$.apply(EvalRelationalLets.scala:12)
	at is.hail.expr.ir.lowering.EvalRelationalLetsPass.transform(LoweringPass.scala:175)
	at is.hail.expr.ir.lowering.LoweringPass.$anonfun$apply$1(LoweringPass.scala:39)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.lowering.LoweringPass.apply(LoweringPass.scala:37)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$2(LoweringPipeline.scala:21)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$2$adapted(LoweringPipeline.scala:19)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at is.hail.expr.ir.lowering.LoweringPipeline.$anonfun$apply$1(LoweringPipeline.scala:19)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:10)
	at is.hail.expr.ir.CompileAndEvaluate$.$anonfun$_apply$1(CompileAndEvaluate.scala:57)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:56)
	at is.hail.backend.spark.SparkBackend.$anonfun$execute$1(SparkBackend.scala:430)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:99)
	at is.hail.backend.ExecuteContext.time(ExecuteContext.scala:174)
	at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:419)
	at is.hail.backend.driver.BackendRpc.$anonfun$runRpc$2(BackendRpc.scala:96)
	at is.hail.backend.driver.BackendRpc.withRegisterSerializedFns(BackendRpc.scala:172)
	at is.hail.backend.driver.BackendRpc.$anonfun$runRpc$1(BackendRpc.scala:94)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$3(ExecuteContext.scala:94)
	at is.hail.utils.package$.using(package.scala:682)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$2(ExecuteContext.scala:94)
	at is.hail.utils.package$.using(package.scala:682)
	at is.hail.annotations.RegionPool.scopedRegion(RegionPool.scala:166)
	at is.hail.backend.ExecuteContext$.$anonfun$scoped$1(ExecuteContext.scala:77)
	at is.hail.utils.package$.using(package.scala:682)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:13)
	at is.hail.backend.ExecuteContext$.scoped(ExecuteContext.scala:76)
	at is.hail.backend.driver.Py4JQueryDriver.$anonfun$withExecuteContext$1(Py4JQueryDriver.scala:308)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:16)
	at is.hail.backend.driver.Py4JQueryDriver.is$hail$backend$driver$Py4JQueryDriver$$withExecuteContext(Py4JQueryDriver.scala:290)
	at is.hail.backend.driver.Py4JQueryDriver$$anon$1$Context$.scoped(Py4JQueryDriver.scala:398)
	at is.hail.backend.driver.Py4JQueryDriver$$anon$1$Context$.scoped(Py4JQueryDriver.scala:396)
	at is.hail.backend.driver.BackendRpc.runRpc(BackendRpc.scala:80)
	at is.hail.backend.driver.BackendRpc.runRpc$(BackendRpc.scala:76)
	at is.hail.backend.driver.Py4JQueryDriver$$anon$1.runRpc(Py4JQueryDriver.scala:347)
	at is.hail.backend.driver.Py4JQueryDriver$$anon$1.$anonfun$new$1(Py4JQueryDriver.scala:406)
	at jdk.httpserver/com.sun.net.httpserver.Filter$Chain.doFilter(Filter.java:95)
	at jdk.httpserver/sun.net.httpserver.AuthFilter.doFilter(AuthFilter.java:82)
	at jdk.httpserver/com.sun.net.httpserver.Filter$Chain.doFilter(Filter.java:98)
	at jdk.httpserver/sun.net.httpserver.ServerImpl$Exchange$LinkHandler.handle(ServerImpl.java:855)
	at jdk.httpserver/com.sun.net.httpserver.Filter$Chain.doFilter(Filter.java:95)
	at jdk.httpserver/sun.net.httpserver.ServerImpl$Exchange.run(ServerImpl.java:831)
	at jdk.httpserver/sun.net.httpserver.ServerImpl$DefaultExecutor.execute(ServerImpl.java:201)
	at jdk.httpserver/sun.net.httpserver.ServerImpl$Dispatcher.handle(ServerImpl.java:561)
	at jdk.httpserver/sun.net.httpserver.ServerImpl$Dispatcher.run(ServerImpl.java:526)
	at java.base/java.lang.Thread.run(Thread.java:840)



Hail version: 0.2.136-c32f88309ab0
Error summary: SparkException: Job aborted due to stage failure: Task 1 in stage 7.0 failed 1 times, most recent failure: Lost task 1.0 in stage 7.0 (TID 15) (pimenta executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:

[Stage 7:>                                                          (0 + 1) / 2]

In [45]:
# Check for population data
pop_fields_present = [col for col in df.columns if any(col.startswith(f'AF_{pop}') for pop in populations)]
if pop_fields_present:
    print(f"\n=== Population-specific data available ===")
    print(f"Populations: {pop_fields_present}")
else:
    print("\n⚠️  No population-specific data in this VCF")

# Check for prediction scores
pred_fields_present = [col for col in df.columns if col in prediction_fields.keys()]
if pred_fields_present:
    print(f"\n=== Prediction scores available ===")
    print(f"Scores: {pred_fields_present}")
else:
    print("\n⚠️  No prediction scores in this VCF")


⚠️  No population-specific data in this VCF

⚠️  No prediction scores in this VCF
