In [1]:
from tables import *
from gzip import *

In [10]:
# Relative path to the input variants file (a .vcf.gz, i.e. gzip-compressed VCF
# judging by the extension) that the notebook loads into the HDF5 table.
REFERENCE_VARIANTS_FILE_PATH = './Homo_sapiens.vcf.gz'

In [2]:
class ReferenceVariants(IsDescription):
    """
    PyTables row layout for a single reference variant.

    Each class attribute below declares one column of the table.
    """

    # TODO: Match with VCF specification

    # Chromosome name, fixed-width string of 16 bytes
    chrom = StringCol(itemsize=16)

    # Variant start and end positions, stored as 32-bit integers
    start = Int32Col()
    end = Int32Col()

    # Variant identifier (VCF ID column), fixed-width string of 16 bytes
    id_ = StringCol(itemsize=16)

    # Reference and alternate alleles, fixed-width strings of 256 bytes each
    # NOTE(review): longer alleles presumably get truncated to 256 bytes - confirm
    ref = StringCol(itemsize=256)
    alt = StringCol(itemsize=256)

    
def get_variant_start_and_end_positions(pos, ref, alt):
    """
    Compute the start and end positions of a variant (@pos: ref ==> alt).

    The result depends only on how the lengths of ref and alt compare:
      - same length:        (pos, pos + len(alt) - 1)
      - alt longer than ref: (pos, pos + 1)
      - alt shorter than ref: (pos + 1, pos + len(ref) - len(alt))

    :param pos: int; variant position
    :param ref: str; reference allele
    :param alt: str; alternate allele
    :return: (int, int); start and end positions
    """

    # TODO: Check accuracy
    # NOTE(review): alt is measured as a single string; a comma-separated
    # multi-allelic ALT would be measured including the commas - confirm intent.

    ref_length = len(ref)
    alt_length = len(alt)

    # alt longer than ref
    if ref_length < alt_length:
        return pos, pos + 1

    # alt shorter than ref
    if alt_length < ref_length:
        return pos + 1, pos + ref_length - alt_length

    # Equal lengths
    return pos, pos + alt_length - 1

In [3]:
# Create the HDF5 file 'table.hdf5' in write mode. Per the PyTables docs,
# mode='w' creates a new file and replaces any existing file of the same name.
# The handle stays open for the following cells until hdf5.close() is called.
hdf5 = open_file('table.hdf5', mode='w', title='Reference Variants')

In [4]:
# Create chromosome group named 'chromosome_1' directly under the root node ('/')
group = hdf5.create_group('/', 'chromosome_1', title='Chromosome 1')

# Create chromosome table 'reference_variants' inside that group; its columns
# are taken from the ReferenceVariants description class
table = hdf5.create_table(group, 'reference_variants', ReferenceVariants, title='Chromosome 1 Reference Variants')
# Print the file's text summary to confirm the group/table layout
print(hdf5)

In [8]:
# Populate chromosome table
#
# NOTE: `open` here is gzip.open (it is brought into scope by
# `from gzip import *`), so the file is read in binary mode and every line
# arrives as bytes; hence the explicit decode() below.
# NOTE(review): every non-header line is written to this one table, whatever
# its chromosome - confirm that is intended for the 'chromosome_1' group.
variant = table.row
with open(REFERENCE_VARIANTS_FILE_PATH) as f:
    i = 0
    for row in f:

        row = row.decode()

        # Skip VCF meta-information and header lines
        if row.startswith('#'):
            continue

        # Progress report every million data lines
        i += 1
        if i % 1000000 == 0:
            print('Processing variant {} ...'.format(i))

        # First 8 tab-separated columns of the VCF data line
        chrom, pos, id_, ref, alt, qual, filter_, info = row.split('\t')[:8]
        start, end = get_variant_start_and_end_positions(int(pos), ref, alt)

        # Fill the reusable row buffer, then queue it for writing
        variant['chrom'] = chrom
        variant['start'] = start
        variant['end'] = end
        variant['id_'] = id_
        variant['ref'] = ref
        variant['alt'] = alt
        variant.append()

# Write any rows still buffered in memory to the file
table.flush()

Processing variant 1000000 ...
Processing variant 2000000 ...
Processing variant 3000000 ...
Processing variant 4000000 ...
Processing variant 5000000 ...
Processing variant 6000000 ...
Processing variant 7000000 ...
Processing variant 8000000 ...
Processing variant 9000000 ...
Processing variant 10000000 ...
Processing variant 11000000 ...
Processing variant 12000000 ...
Processing variant 13000000 ...
Processing variant 14000000 ...
Processing variant 15000000 ...
Processing variant 16000000 ...
Processing variant 17000000 ...
Processing variant 18000000 ...
Processing variant 19000000 ...
Processing variant 20000000 ...
Processing variant 21000000 ...
Processing variant 22000000 ...
Processing variant 23000000 ...
Processing variant 24000000 ...
Processing variant 25000000 ...
Processing variant 26000000 ...
Processing variant 27000000 ...
Processing variant 28000000 ...
Processing variant 29000000 ...
Processing variant 30000000 ...
Processing variant 31000000 ...
Processing varian

In [14]:
# Close the HDF5 file handle created by open_file() earlier in the notebook
hdf5.close()