# Convert VCF to MatrixTable

In [1]:
import hail as hl
# Start the Hail session with GRCh38 as the default reference genome.
# The Spark overrides set the driver memory and the Spark local directory.
hl.init(
    default_reference='GRCh38',
    spark_conf={
        'spark.driver.memory': '10g',
        'spark.local.dir': '/home/olavur/tmp',
    },
)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-z7fmq:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20210301-1352-0.2.61-3c86d3ba497a.log


In [8]:
import shutil, os

In [20]:
# Root directory of this experiment; output paths below are built from it.
BASE_DIR = (
    '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
)

In [17]:
# Location of the gzip-compressed multi-sample VCF that is imported below.
vcf_gz_path = (
    '/fargen/data/multi_sample_data/joint_genotyping/outs/variants/variants.vcf.gz'
)

In [18]:
# Load the VCF as a Hail MatrixTable on the GRCh38 reference.
#   force_bgz=True                -> read the .gz file as block-gzipped (bgzip)
#   array_elements_required=False -> tolerate missing elements inside array fields
mt = hl.import_vcf(
    vcf_gz_path,
    reference_genome='GRCh38',
    force_bgz=True,
    array_elements_required=False,
)

In [21]:
# Persist the imported dataset as a MatrixTable under BASE_DIR, replacing any
# existing copy at that path.
mt.write(
    BASE_DIR + '/data/mt/variants.mt',
    overwrite=True,
)

2021-03-01 14:08:49 Hail: INFO: Coerced sorted dataset
2021-03-01 14:11:14 Hail: INFO: wrote matrix table with 3110759 rows and 474 columns in 96 partitions to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/variants.mt
    Total size: 12.67 GiB
    * Rows/entries: 12.67 GiB
    * Columns: 1.87 KiB
    * Globals: 11.00 B
    * Smallest partition: 30958 rows (134.19 MiB)
    * Largest partition:  37610 rows (137.39 MiB)


## Filter variants

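Keep only the variants whose FILTER field assigns them to the VQSR tranche 99.90–100.00, i.e. rows labelled `VQSRTrancheSNP99.90to100.00` (SNPs) or `VQSRTrancheINDEL99.90to100.00` (indels).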

In [11]:
# Keep a row when its FILTER set carries either the SNP or the INDEL label for
# the VQSR tranche 99.90-100.00; all other rows are dropped.
# NOTE(review): in GATK VQSR output these labels normally mark variants that
# fell outside the sensitivity threshold (i.e. did not PASS) - confirm that
# keeping them, rather than removing them, is the intent here.
mt_filtered = mt.filter_rows(
    mt.filters.contains('VQSRTrancheSNP99.90to100.00')
    | mt.filters.contains('VQSRTrancheINDEL99.90to100.00')
)

In [12]:
# Size of the filtered dataset, shown as (rows, columns).
mt_filtered.count()

2021-03-01 14:01:34 Hail: INFO: Coerced sorted dataset


(617463, 474)

In [13]:
# Size of the unfiltered dataset, shown as (rows, columns), for comparison
# with the filtered count.
mt.count()

2021-03-01 14:02:21 Hail: INFO: Coerced sorted dataset


(3110759, 474)

Write the filtered variants to disk as a MatrixTable.

In [14]:
# Save the filtered MatrixTable alongside variants.mt, replacing any existing
# copy. The destination is derived from BASE_DIR (it resolves to the same
# absolute path as before) so the project root is defined in one place only.
mt_filtered.write(BASE_DIR + '/data/mt/filtered.mt', overwrite=True)

2020-12-03 14:40:51 Hail: INFO: Coerced sorted dataset
2020-12-03 14:42:18 Hail: INFO: wrote matrix table with 185771 rows and 48 columns in 5 partitions to data/mt/filtered.mt
    Total size: 110.51 MiB
    * Rows/entries: 110.51 MiB
    * Columns: 221.00 B
    * Globals: 11.00 B
    * Smallest partition: 36459 rows (21.72 MiB)
    * Largest partition:  37814 rows (22.68 MiB)
