In [1]:
from typing import Sequence, Tuple

from pysam import FastaFile

In [2]:
# Handle to the reference FASTA passed to the normalize() example calls further down.
# The path is relative, so it resolves against the kernel's working directory.
REFERENCE = FastaFile('../ref/fasta.gz/human_g1k_v37.fasta')

In [3]:
def truncate_right(alleles: Sequence[str]) -> Tuple[int, Sequence[str]]:
    """Strip the suffix that every allele has in common.

    Alleles are compared character by character starting from their last
    position; the scan stops at the first position where they disagree or
    when the shortest allele is exhausted.

    Args:
        alleles: Non-empty sequence of allele strings.

    Returns:
        ``(n_trunc, trimmed)`` where ``n_trunc`` is the number of trailing
        characters removed from each allele and ``trimmed`` is a tuple of the
        shortened alleles (an allele may end up empty).

    Raises:
        ValueError: If ``alleles`` is empty (``min`` of an empty iterable).
    """
    shortest = min(len(allele) for allele in alleles)

    n_trunc = 0
    while n_trunc < shortest:
        # Characters sitting (n_trunc + 1) positions from the right end.
        tail_chars = {allele[-(n_trunc + 1)] for allele in alleles}
        if len(tail_chars) != 1:
            break
        n_trunc += 1

    if n_trunc == 0:
        # Nothing shared: hand the alleles back untouched (``a[:-0]`` would be empty).
        return n_trunc, tuple(alleles)
    return n_trunc, tuple(allele[:-n_trunc] for allele in alleles)

In [4]:
# truncate_right(('ABCD', 'ABCD'))

In [5]:
def truncate_left(alleles: Sequence[str], min_len=0) -> Tuple[int, Sequence[str]]:
    """Strip the prefix that every allele has in common.

    Alleles are compared character by character from the left. At most
    ``min(len(allele)) - min_len`` characters are removed, so the shortest
    allele keeps at least ``min_len`` characters.

    Args:
        alleles: Non-empty sequence of allele strings.
        min_len: Number of characters the shortest allele must retain.

    Returns:
        ``(n_trunc, trimmed)`` where ``n_trunc`` is the number of leading
        characters removed from each allele and ``trimmed`` is a tuple of the
        shortened alleles.

    Raises:
        ValueError: If ``alleles`` is empty (``min`` of an empty iterable).
    """
    # Upper bound on how many leading characters may be dropped; may be <= 0.
    budget = min(len(allele) for allele in alleles) - min_len

    n_trunc = 0
    while n_trunc < budget:
        head_chars = {allele[n_trunc] for allele in alleles}
        if len(head_chars) != 1:
            break
        n_trunc += 1

    if n_trunc == 0:
        return n_trunc, tuple(alleles)
    return n_trunc, tuple(allele[n_trunc:] for allele in alleles)

In [6]:
# truncate_left(('ABCD', 'ABCD'), 2)

In [7]:
def extend_left(alleles: Sequence[str], reference: FastaFile, contig: str, start: int) -> Sequence[str]:
    """Prepend one reference base to every allele.

    The base is read from ``reference`` on ``contig`` using the interval
    ``start - 1`` .. ``start`` (pysam ``fetch`` coordinates are 0-based,
    half-open), i.e. the single base at 0-based index ``start - 1``.

    Args:
        alleles: Allele strings to pad; empty strings are allowed.
        reference: Object exposing a pysam-style ``fetch(reference=, start=, end=)``.
        contig: Name of the contig to read from.
        start: End coordinate of the one-base fetch window (see above).

    Returns:
        Tuple of alleles, each prefixed with the fetched base.
    """
    # Pass the contig via ``reference=``: ``region=`` is meant for samtools-style
    # region strings, and explicit start/end belong with ``reference``.
    ref_left = reference.fetch(reference=contig, start=start - 1, end=start)
    # Iterate over the ``alleles`` argument (the original referenced an undefined name).
    return tuple(''.join((ref_left, allele)) for allele in alleles)

In [8]:
def normalize(contig: str, pos: int, alleles: Tuple[str], reference: FastaFile) -> Tuple[str, int, Tuple[str, ...]]:
    """Left-align a variant and trim bases shared by all of its alleles.

    Procedure:
      1. Repeatedly drop the suffix common to all alleles. Whenever that
         leaves an allele empty, pad every allele on the left with one
         reference base and move ``pos`` one to the left. Stop once a pass
         neither trims nor pads.
      2. Drop the prefix common to all alleles while keeping at least one
         base in the shortest allele, moving ``pos`` right by the number of
         bases removed.

    Args:
        contig: Contig name, forwarded to ``extend_left``.
        pos: Position of the first base of the alleles.
            NOTE(review): the padding lookup is ``extend_left(..., pos - 1)``,
            which reads the base at 0-based index ``pos - 2``; that matches a
            1-based ``pos``. ``exac`` adds 1 to the position it is given, so
            confirm which convention callers use.
        alleles: Allele strings for the variant; at least two distinct values.
        reference: Reference sequence handle, forwarded to ``extend_left``.

    Returns:
        ``(contig, pos, alleles)`` with the adjusted position and a tuple of
        trimmed alleles.

    Raises:
        ValueError: If fewer than two distinct alleles are supplied. Such
            input trims to all-empty alleles on every pass and would keep
            padding leftwards without ever converging.
    """
    if len(set(alleles)) < 2:
        raise ValueError(f'need at least two distinct alleles, got {tuple(alleles)!r}')

    while True:
        n_right, alleles = truncate_right(alleles)

        needs_padding = any(len(allele) == 0 for allele in alleles)
        if needs_padding:
            # extend_left already prepends the fetched base to every allele.
            alleles = extend_left(alleles, reference, contig, pos - 1)
            pos -= 1

        # Decide convergence from what this pass did, not by comparing allele
        # tuples: inside a repeat, trimming and re-padding can reproduce the
        # same strings at a shifted position.
        if n_right == 0 and not needs_padding:
            break

    # Keep one base in the shortest allele; min_len=2 would leave a redundant
    # shared leading base, e.g. ('GCA', 'GCT') -> ('CA', 'CT').
    n_left, alleles = truncate_left(alleles, min_len=1)
    # Dropping leading bases moves the variant start to the right.
    pos += n_left

    return contig, pos, alleles

In [9]:
# Single-base substitution: the alleles share no leading or trailing bases,
# so the variant is returned unchanged (see the output below).
normalize(contig='1', pos=931393, alleles=('G', 'T'), reference=REFERENCE)

('1', 931393, ('G', 'T'))

In [10]:
# One-base length difference inside a run of A's: the shared suffix is trimmed,
# leaving ('TA', 'T') at the same position (see the output below).
normalize(contig='1', pos=1654129, alleles=('TAAAAAAAT', 'TAAAAAAT'), reference=REFERENCE)

('1', 1654129, ('TA', 'T'))

In [11]:
import requests

In [12]:
def exac(contig, pos, alleles, timeout=30):
    """Fetch variant consequences from the ExAC REST endpoint.

    Builds the variant id ``<contig>-<pos + 1>-<allele>-<allele>...``, prints
    the request URL, performs a GET and returns the decoded JSON body.

    Args:
        contig: Contig name used as the first component of the variant id.
        pos: Position; 1 is added before it is placed in the URL.
            NOTE(review): the ``+ 1`` suggests a 0-based input here — confirm
            it agrees with the positions produced by ``normalize``.
        alleles: Iterable of allele strings, joined with ``-``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely and stall the kernel.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    var = f'{contig}-{pos+1}-{"-".join(alleles)}'
    url = f'http://exac.hms.harvard.edu/rest/variant/consequences/{var}'
    print(url)
    return requests.get(url, timeout=timeout).json()

In [13]:
# Direct lookup: `exac` adds 1 to the position, so 931392 is requested as 931393
# (see the printed URL below).
exac(*('1', 931392, ('G', 'T')))

http://exac.hms.harvard.edu/rest/variant/consequences/1-931393-G-T


In [14]:
# Normalize the indel first, then look up the normalized representation;
# the (contig, pos, alleles) tuple returned by normalize is unpacked into exac.
exac(*normalize('1', 6209058, ('GTCCTCCTCCTCCT', 'GTCCTCCTCCT'), REFERENCE))

http://exac.hms.harvard.edu/rest/variant/consequences/1-6209059-GTCC-G
