In [1]:
import hail as hl
import numpy as np
# Start the Hail session; the Hail log is written to /tmp/hail.log.
hl.init(log='/tmp/hail.log')

Running on Apache Spark version 2.4.4
SparkUI available at http://ukbb-nb-m.c.ukbb-round2.internal:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.23-aaf52cafe5ef
LOGGING: writing to /tmp/hail.log


In [3]:
def get_mt(chr_ls):
    '''
    Import imputed UKB genotypes for the given chromosomes, restricted to the
    HapMap3 variant list.

    Parameters
    ----------
    chr_ls : iterable of int or str, or a single int/str
        Chromosome identifiers. They are de-duplicated (first occurrence wins)
        and inserted into the BGEN path as a brace pattern, e.g. ``{1,2,22}``.

    Returns
    -------
    MatrixTable produced by `hl.import_bgen` with the `GT` entry field only,
    filtered to the variants in `hapmap3_variants.tsv`.

    Raises
    ------
    ValueError
        If `chr_ls` is empty.
    '''
    # A bare string/int would otherwise be iterated character-wise (or fail).
    if isinstance(chr_ls, (str, int)):
        chr_ls = [chr_ls]
    # Build the pattern from str(c) directly: str(set(...)) leaves quotes around
    # string chromosomes ("{'1','X'}") and yields "set()" for empty input.
    chroms = list(dict.fromkeys(str(c) for c in chr_ls))
    if not chroms:
        raise ValueError('`chr_ls` must contain at least one chromosome')
    chr_pattern = '{' + ','.join(chroms) + '}'

    # Variant whitelist: parse the `v` column into `locus`/`alleles` and key by them,
    # as required by the `variants` argument of `import_bgen`.
    variants = hl.import_table('gs://nbaya/split/hapmap3_variants.tsv')
    variants = variants.annotate(**hl.parse_variant(variants.v))
    variants = variants.key_by('locus','alleles')

    gt0 = hl.import_bgen(path='gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr'+chr_pattern+'_v3.bgen',
                         entry_fields=['GT'],
                         n_partitions = 1000,
                         sample_file = 'gs://ukb31063/ukb31063.autosomes.sample',
                         variants=variants)
    return gt0
def ls_sim(ref_mt, Nsim, mu=0.01, Nref=None, recomb_ls=None):
    '''
    Simulates `Nsim` new individuals based on the genotypes in `ref_mt`.

    Each SNP is marked as a recombination breakpoint with probability `mu`
    (the first and last SNP are always breakpoints). Consecutive breakpoints
    define intervals; within every interval each simulated individual copies
    the first allele of one randomly drawn reference column and the second
    allele of another randomly drawn reference column.

    Parameters
    ----------
    ref_mt : MatrixTable
        Reference genotypes; must have a `locus` row field and a `GT` entry field.
    Nsim : int
        Number of individuals to simulate.
    mu : float
        Per-SNP breakpoint probability, strictly between 0 and 1.
    Nref : int, optional
        Number of reference columns to sample from. Defaults to
        `ref_mt.count_cols()`.
    recomb_ls : ignored
        Kept for backward compatibility only; the breakpoints are always
        re-sampled and this argument is overwritten.

    Returns
    -------
    MatrixTable with `Nsim` columns keyed by `col_idx`, a `GT` entry field and
    a `recomb_ls` global holding the breakpoint loci (excluding the first SNP).
    '''
    assert (mu > 0) and (mu < 1), 'Mutation rate (recombination rate) must be in (0,1)'
    assert 'locus' in ref_mt.row, '`locus` row field must be in `ref_mt`'

    Nref = ref_mt.count_cols() if Nref is None else Nref

    ## sample breakpoints and build the intervals between them
    it0 = ref_mt.rows().key_by('locus')
    Nsnps = it0.count()
    it0 = it0.add_index('tmp_idx')
    it0 = it0.annotate(recomb = (hl.rand_bool(mu)|(it0.tmp_idx==0)|(it0.tmp_idx==Nsnps-1)))
    it0 = it0.filter(it0.recomb)
    prev_locus = hl.scan._prev_nonnull(it0.locus)
    # Intervals are right-open by default, so consecutive intervals do not
    # overlap at a shared breakpoint. Only the final interval is closed on the
    # right; otherwise the last SNP would belong to no interval.
    it0 = it0.annotate(interval = hl.interval(prev_locus, it0.locus,
                                              includes_end=(it0.tmp_idx==Nsnps-1)))
    # The first SNP has no preceding breakpoint, hence no interval of its own.
    it0 = it0.filter(it0.tmp_idx==0,keep=False)
    recomb_ls = it0.locus.collect()
    # Key by the interval so that `it[ht.locus]` below is an interval lookup.
    # While keyed by `locus` it was an exact-locus join and only breakpoint
    # SNPs received haplotype assignments.
    it0 = it0.key_by('interval')

    ## simulation
    # For every interval, draw a pair of reference column indices per simulated individual.
    it = it0.annotate(hap = hl.range(Nsim).map(lambda i: [hl.int(hl.rand_unif(0, Nref)),
                                                          hl.int(hl.rand_unif(0, Nref))]))
    ht = ref_mt.localize_entries(entries_array_field_name='entries')
    ht = ht.annotate(it = it[ht.locus])
    # Combine allele 0 of the first drawn column with allele 1 of the second into a phased call.
    ht = ht.annotate(sim_gt = ht.it.hap.map(lambda i: hl.parse_call(
            hl.str(ht.entries[i[0]].GT[0])+'|'+hl.str(ht.entries[i[1]].GT[1]))))
    ht = ht.annotate_globals(cols = hl.range(Nsim).map(lambda i: hl.struct(col_idx=i)))
    ht = ht.annotate(sim_gt = ht.sim_gt.map(lambda x: hl.struct(GT=x)))
    mt_sim = ht._unlocalize_entries('sim_gt','cols',['col_idx'])
    mt_sim = mt_sim.annotate_globals(recomb_ls = recomb_ls)
    return mt_sim

In [6]:
# Synthetic reference panel drawn from the Balding-Nichols model.
ref_mt = hl.methods.balding_nichols_model(n_populations=2,
                                          n_samples=1000,
                                          n_variants=200)

# Optional subsetting of the reference to its first 200 rows (currently disabled):
# ref_mt = ref_mt.add_row_index('tmp_row_idx')
# ref_mt = ref_mt.filter_rows(ref_mt.tmp_row_idx<200)
row_ct, Nref = ref_mt.count()
print(f'... reference dataset: {row_ct} rows, {Nref} cols ...')

# Simulation settings: number of simulated individuals and per-SNP breakpoint probability.
Nsim = 10_000_000
mu = 0.1
mt_sim = ls_sim(ref_mt=ref_mt, Nsim=Nsim, mu=mu, Nref=Nref)

2019-10-04 14:15:45 Hail: INFO: balding_nichols_model: generating genotypes for 2 populations, 1000 samples, and 200 variants...


... reference dataset: 200 rows, 1000 cols ...


2019-10-04 14:15:46 Hail: INFO: Coerced sorted dataset


In [10]:
# Number of loci stored in the `recomb_ls` global that `ls_sim` annotates on its result.
len(mt_sim.recomb_ls.collect()[0])

16

In [13]:
# The output path encodes the reference size, the number of simulated individuals and the
# row count; `mt_sim.count_rows()` is evaluated while the name is being built.
sim_path = f'gs://nbaya/risk_gradients/ls_sim.Nref_{Nref}.Nsim_{Nsim}.Nsnps_{mt_sim.count_rows()}.mt'
print(f'... writing to {sim_path} ...')
# Persist the simulated matrix table at `sim_path`.
mt_sim.write(sim_path)

... writing to gs://nbaya/risk_gradients/ls_sim.Nref_1000.Nsim_10000000.Nsnps_200.mt ...


2019-10-04 14:19:18 Hail: INFO: Coerced sorted dataset
2019-10-04 14:19:19 Hail: INFO: Coerced sorted dataset


KeyboardInterrupt: 

In [31]:
# NOTE(review): `gt0` is not assigned at notebook scope in any visible cell (it is only a
# local variable inside `get_mt`); presumably an earlier `gt0 = get_mt(...)` was run — confirm.
mt = gt0
# Collect every row's locus into a local Python list.
snps = mt.locus.collect()

In [32]:
def ls_sim(ref_mt, Nsim, mu=0.01, Nref=None, snps=None):
    '''
    Simulates `Nsim` new individuals based on the genotypes in `ref_mt`.

    NOTE: this redefines `ls_sim` and shadows the earlier definition in this
    notebook (which takes `recomb_ls` instead of `snps`).

    Breakpoints are drawn locally with NumPy: every interior SNP is a
    breakpoint with probability `mu`, and the first and last SNP always are.
    Consecutive breakpoints define intervals; within every interval each
    simulated individual copies the first allele of one randomly drawn
    reference column and the second allele of another.

    Parameters
    ----------
    ref_mt : MatrixTable
        Reference genotypes; must have a `locus` row field and a `GT` entry field.
    Nsim : int
        Number of individuals to simulate.
    mu : float
        Per-SNP breakpoint probability, strictly between 0 and 1.
    Nref : int, optional
        Number of reference columns to sample from. Defaults to
        `ref_mt.count_cols()`.
    snps : sequence of loci, optional
        Loci of `ref_mt` in row order. Defaults to `ref_mt.locus.collect()`.

    Returns
    -------
    MatrixTable with `Nsim` columns keyed by `col_idx` and a `GT` entry field.

    Raises
    ------
    ValueError
        If fewer than two SNPs are available (no interval can be formed).
    '''
    assert (mu > 0) and (mu < 1), 'Mutation rate (recombination rate) must be in (0,1)'
    assert 'locus' in ref_mt.row, '`locus` row field must be in `ref_mt`'

    Nref = ref_mt.count_cols() if Nref is None else Nref
    # The default must be a concrete list: a Hail expression has no len().
    snps = ref_mt.locus.collect() if snps is None else list(snps)
    if len(snps) < 2:
        raise ValueError('at least two SNPs are required to simulate recombination intervals')

    ## Get intervals of non-recombination
    # Only interior SNPs are sampled; the first and last SNP are forced breakpoints.
    interior_hits = np.random.binomial(1, mu, size=len(snps)-2) == 1
    is_breakpoint = [True, *interior_hits, True]
    recomb_ls = [snp for snp, hit in zip(snps, is_breakpoint) if hit]
    interval_ls = list(zip(recomb_ls[:-1], recomb_ls[1:]))
    print(f'number of recombination events: {len(interval_ls)-1}')

    ## get table of intervals
    it0 = hl.utils.range_table(len(interval_ls))
    intervals = hl.literal(interval_ls)
    last_idx = len(interval_ls) - 1
    # Intervals are right-open by default; only the final one is closed on the
    # right so that the last SNP is covered by an interval.
    it0 = it0.key_by(intervals = hl.interval(intervals[it0.idx][0],
                                             intervals[it0.idx][1],
                                             includes_end=(it0.idx == last_idx)))
    it0.describe()

    ## simulation
    # For every interval, draw a pair of reference column indices per simulated individual.
    it = it0.annotate(hap = hl.range(Nsim).map(lambda i: [hl.int(hl.rand_unif(0, Nref)),
                                                          hl.int(hl.rand_unif(0, Nref))]))
    # Use the `ref_mt` argument, not the notebook-level `mt` global.
    ht = ref_mt.localize_entries(entries_array_field_name='entries')
    ht = ht.annotate(it = it[ht.locus])
    # Combine allele 0 of the first drawn column with allele 1 of the second into a phased call.
    ht = ht.annotate(sim_gt = ht.it.hap.map(lambda i: hl.parse_call(
            hl.str(ht.entries[i[0]].GT[0])+'|'+hl.str(ht.entries[i[1]].GT[1]))))
    ht = ht.annotate_globals(cols = hl.range(Nsim).map(lambda i: hl.struct(col_idx=i)))
    ht = ht.annotate(sim_gt = ht.sim_gt.map(lambda x: hl.struct(GT=x)))
    mt_sim = ht._unlocalize_entries('sim_gt','cols',['col_idx'])
    # Previously the result was built and then dropped; hand it back to the caller.
    return mt_sim
    
# Number of individuals to simulate (10 million), used as the `Nsim` argument of `ls_sim`.
Nsim = int(1e7)

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'idx': int32 
    'intervals': interval<locus<GRCh37>> 
----------------------------------------
Key: ['intervals']
----------------------------------------


In [34]:
# Dimensions of the simulated matrix table as (rows, columns).
mt_sim.count()

(15913, 10000000)

In [35]:
# Total number of simulated entries: the row count reported by `mt_sim.count()` times `Nsim`.
15913*Nsim

159130000000

In [40]:
# NOTE(review): `recomb_rate` is not defined in any visible cell; other cells name the
# recombination probability `mu` — confirm which value is meant to appear in the path.
sim_path = f'gs://nbaya/risk_gradients/ls_sim.ukb.Nref_{Nref}.Nsnps_{len(snps)}.r_{recomb_rate}.Nsim_{Nsim}.mt'
# Persist the simulated matrix table at `sim_path`.
mt_sim.write(sim_path)

2019-10-03 12:59:59 Hail: INFO: Coerced sorted dataset


FatalError: SparkException: Job aborted due to stage failure: Task 1 in stage 42.0 failed 20 times, most recent failure: Lost task 1.19 in stage 42.0 (TID 3576, ukbb-nb-sw-77cj.c.ukbb-round2.internal, executor 167): ExecutorLostFailure (executor 167 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  12.0 GB of 12 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.
Driver stacktrace:

Java stack trace:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 42.0 failed 20 times, most recent failure: Lost task 1.19 in stage 42.0 (TID 3576, ukbb-nb-sw-77cj.c.ukbb-round2.internal, executor 167): ExecutorLostFailure (executor 167 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  12.0 GB of 12 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at is.hail.sparkextras.ContextRDD.collect(ContextRDD.scala:222)
	at is.hail.rvd.RVD.writeRowsSplit(RVD.scala:850)
	at is.hail.expr.ir.MatrixValue.write(MatrixValue.scala:222)
	at is.hail.expr.ir.MatrixNativeWriter.apply(MatrixWriter.scala:39)
	at is.hail.expr.ir.WrappedMatrixWriter.apply(MatrixWriter.scala:24)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:742)
	at is.hail.expr.ir.Interpret$.apply(Interpret.scala:91)
	at is.hail.expr.ir.CompileAndEvaluate$$anonfun$1.apply(CompileAndEvaluate.scala:33)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:24)
	at is.hail.expr.ir.CompileAndEvaluate$.apply(CompileAndEvaluate.scala:33)
	at is.hail.backend.Backend$$anonfun$execute$1.apply(Backend.scala:56)
	at is.hail.backend.Backend$$anonfun$execute$1.apply(Backend.scala:56)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:8)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:7)
	at is.hail.utils.package$.using(package.scala:596)
	at is.hail.annotations.Region$.scoped(Region.scala:18)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:7)
	at is.hail.backend.Backend.execute(Backend.scala:56)
	at is.hail.backend.Backend.executeJSON(Backend.scala:62)
	at sun.reflect.GeneratedMethodAccessor47.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.23-aaf52cafe5ef
Error summary: SparkException: Job aborted due to stage failure: Task 1 in stage 42.0 failed 20 times, most recent failure: Lost task 1.19 in stage 42.0 (TID 3576, ukbb-nb-sw-77cj.c.ukbb-round2.internal, executor 167): ExecutorLostFailure (executor 167 exited caused by one of the running tasks) Reason: Container killed by YARN for exceeding memory limits.  12.0 GB of 12 GB physical memory used. Consider boosting spark.yarn.executor.memoryOverhead or disabling yarn.nodemanager.vmem-check-enabled because of YARN-4714.
Driver stacktrace: