In [2]:
import re
from collections import Counter
import pandas as pd
import math
import pysam
import operator

class PileupRecord:
    '''One tab-separated pileup line parsed into its first six columns.

    Attributes (taken from the columns in order):
        seq    -- column 1, kept as text
        pos    -- column 2, converted to int
        ref    -- column 3, kept as text
        rCount -- column 4, converted to int
        rRes   -- column 5, kept as text (the per-read results string)
        qual   -- column 6, kept as text

    Raises IndexError when the line has fewer than six columns and
    ValueError when column 2 or 4 is not an integer.
    '''

    def __init__(self,line):
        # Strip the line terminator before splitting instead of blindly
        # chopping the last character of column 6: a final line without a
        # newline must keep its full last column, and a "\r\n" ending must
        # not leave a stray "\r" behind.
        fields = line.rstrip("\r\n").split("\t")
        self.seq = fields[0]
        self.pos = int(fields[1])
        self.ref = fields[2]
        self.rCount = int(fields[3])
        self.rRes = fields[4]
        self.qual = fields[5]

    def print(self):
        '''Write the six parsed columns to stdout, separated by four spaces.'''
        print('{}    {}    {}    {}    {}    {}'.format(self.seq, self.pos, self.ref, self.rCount, self.rRes, self.qual))

In [226]:
def DetermineIndelString(readResult):
    '''Expand the compact indel notation into the plain base string.

    The first two characters of ``readResult`` (e.g. ".+" or ".-") are
    skipped. In the remainder, any bases before the first digit are copied
    once, and every number is a repeat count for the run of non-digit
    characters that follows it: ".+TC3AC2TT" -> "TCACACACTTTT".

    input: string, one indel token
    output: string, the expanded bases
    '''
    body = readResult[2:]

    first_digit = re.search(r'[0-9]', body)
    if first_digit is None:
        # No repeat counts at all: the bases are already spelled out.
        return body

    pieces = [body[:first_digit.start()]]
    # Consume whole numbers ([0-9]+), not single digits, so that a count
    # such as "12" repeats the following unit twelve times.
    for count, unit in re.findall(r'([0-9]+)([^0-9]*)', body[first_digit.start():]):
        pieces.append(int(count) * unit)
    return ''.join(pieces)

# Inline checks for DetermineIndelString: a digit repeats the bases that
# follow it, bases before the first digit are copied once, and a token
# without digits is returned with only its two-character prefix removed.
assert DetermineIndelString(".+5AC3TG") == "ACACACACACTGTGTG"
assert DetermineIndelString(".+2AT") == "ATAT"
assert DetermineIndelString(".+TC3AC2TT") == "TCACACACTTTT"
assert DetermineIndelString(".+GACT") == "GACT"

In [227]:
def DetectPolymorphicSite(readResult):
    
    '''Detects variants at particular location, returns counts and types of them.
       input: string, info about particular location alignment results from all reads
       output: list of at most two [variant, count, type] lists, most frequent
               first; type is 'indel', 'SNV' or 'match' (variant '.' for match).
               Entries with equal counts keep the order indels, SNVs, match,
               and within a type the order of first appearance in the input.'''
    
    indel_pattern = r'[\.][+-][ACGT]*[0-9]*[ACGT]*[0-9]*[ACGT]*'

    # Strand/case is ignored: lower case becomes upper case and ',' becomes '.'.
    readResult = readResult.upper().replace(',','.')

    # Drop every '^' + non-word character + non-'.' character triple.
    # A single regex pass removes exactly the matched spans, so the outcome
    # cannot depend on the order in which distinct triples are processed.
    readResult = re.sub(r'\^[\W][^\.]', '', readResult)

    # Count the indel tokens, then remove them in one pass. Removing them one
    # distinct token at a time with str.replace is order-dependent: stripping
    # ".+A" out of ".+AC" would leave a stray "C" to be counted as an SNV.
    indel_counts = Counter(re.findall(indel_pattern, readResult))
    readResult = re.sub(indel_pattern, '', readResult)

    # Whatever bases remain are mismatches; every remaining '.' is a match.
    snv_counts = Counter(re.findall(r'[AGCT]', readResult))

    # Counter keeps first-insertion order, so the candidate list (and with
    # the stable sort below, the tie-breaking) is reproducible between runs.
    var = [[indel, count, 'indel'] for indel, count in indel_counts.items()]
    var += [[base, count, 'SNV'] for base, count in snv_counts.items()]
    var.append(['.', readResult.count('.'), 'match'])

    var.sort(key = lambda x: x[1], reverse = True)

    # Only the two most frequent observations are reported.
    return var[:2]
    
# Inline checks for DetectPolymorphicSite.
# '^' followed by a non-word character and a non-'.' character is discarded.
assert DetectPolymorphicSite("^!^E") == [['.', 0, 'match']]
assert DetectPolymorphicSite(".,.,.,....,,,.,.,.,.,^!A.,.,.") == [['.', 26, 'match']]
# ',' counts as '.', and lower-case bases count as their upper-case form.
assert DetectPolymorphicSite(".,..,.,.,.,.A,.,a,.^H.a.") == [['.', 19, 'match'], ['A', 3, 'SNV']]
# An indel token takes its leading '.' with it, so that '.' is not a match.
assert DetectPolymorphicSite(".+3AC..") == [['.', 2, 'match'], [".+3AC", 1, 'indel']]
assert DetectPolymorphicSite(",+3ac..") == [['.', 2, 'match'], [".+3AC", 1, 'indel']]
assert DetectPolymorphicSite('..,.+G3AC') == [['.', 3, 'match'], [".+G3AC", 1, 'indel']]
assert DetectPolymorphicSite('..,.+G3AC2T') == [['.', 3, 'match'], [".+G3AC2T", 1, 'indel']]
assert DetectPolymorphicSite('..,.-G3AC2T') == [['.', 3, 'match'], [".-G3AC2T", 1, 'indel']]
# Results are ordered by descending count and truncated to two entries.
assert DetectPolymorphicSite('..,.-G3AC2T.-G3AC2T.-G3AC2T.-G3AC2T') == [[".-G3AC2T", 4, 'indel'], ['.', 3, 'match']]
assert DetectPolymorphicSite('a.A.a,a.-G3AC2T.-G3AC2T.a.-G3AC2T.-G3AC2T') == [['A', 5, 'SNV'], [".-G3AC2T",4, 'indel']]
assert DetectPolymorphicSite('TC,aTtcTc') == [['T', 4, 'SNV'], ["C", 3, 'SNV']]


    

In [274]:
def Genotyping(var):
    
    '''Determines genotype.

       input: list of one or two [variant, count, type] lists, most frequent
              first (the shape returned by DetectPolymorphicSite)
       output: tuple of two allele indices; 0 stands for the 'match' entry,
               1 and 2 for the non-match entries in the order given
       raises: ValueError when var does not hold exactly one or two entries

       With two entries a1 (count k1) and a2 (count k2), three scores are
       compared, each using the fixed probability 0.8:
           a1a1: C(k1+k2, k1) * 0.8**k1 * 0.2**k2
           a1a2: 0.8**(k1+k2)
           a2a2: C(k1+k2, k1) * 0.8**k2 * 0.2**k1
       and the highest normalised score wins (the first one on a tie).'''
    
    if len(var) == 1:
        # A single kind of observation is called homozygous for it.
        if var[0][2] == 'match':
            return (0,0)
        return (1,1)

    if len(var) != 2:
        # Fail with a clear message instead of an UnboundLocalError at the
        # return statement.
        raise ValueError('Genotyping expects 1 or 2 entries, got {}'.format(len(var)))

    pr = [0]*3
    k1 = var[0][1]
    k2 = var[1][1]
    p0 = 0.8
    p1 = 0.8
    p2 = 0.8

    # NOTE: the arithmetic below is kept in this exact order on purpose.
    # Some inputs (e.g. counts 3 and 1) are mathematically tied between
    # a1a1 and a1a2 and are decided by floating-point rounding only.
    # NOTE(review): the exact binomial coefficient is a Python int; for very
    # deep coverage int*float can raise OverflowError -- confirm the expected
    # depth range.
    pr[0] = math.factorial(k1+k2)//math.factorial(k1)//math.factorial(k2)*(p0**k1)*(1-p0)**(k2) # a1a1
    pr[1] = math.factorial(k1+k2)//math.factorial(k1+k2)//math.factorial(0)*(p2**(k1+k2))*(1-p2)**0 # a1a2
    pr[2] = math.factorial(k1+k2)//math.factorial(k2)//math.factorial(k1)*p1**k2*(1-p1)**(k1) # a2a2

    P = [pr[0]/(pr[0]+pr[1]+pr[2]), pr[1]/(pr[0]+pr[1]+pr[2]), pr[2]/(pr[0]+pr[1]+pr[2])]
    index, value = max(enumerate(P), key=operator.itemgetter(1))

    # Translate (a1a1, a1a2, a2a2) into allele indices, depending on which
    # of the two entries, if any, is the 'match' one.
    if var[0][2] == 'match':
        calls = ((0,0), (0,1), (1,1))
    elif var[1][2] == 'match':
        calls = ((1,1), (0,1), (0,0))
    else:
        calls = ((1,1), (1,2), (2,2))

    return calls[index]

# Inline checks for Genotyping: allele index 0 stands for the 'match'
# entry, 1 and 2 for the non-match entries in the order they are given.
assert Genotyping([['.', 26, 'match']]) == (0,0)
assert Genotyping([['.', 19, 'match'], ['A', 3, 'SNV']]) == (0,0)
assert Genotyping([['.', 2, 'match'], [".+3AC", 2, 'indel']]) == (0,1)
assert Genotyping([['.', 3, 'match'], [".+G3AC2T", 1, 'indel']]) == (0,1)
assert Genotyping([['.', 50, 'match'], [".+G3AC2T", 5, 'indel']]) == (0,0)
assert Genotyping([['T', 4, 'SNV'], ["C", 3, 'SNV']]) == (1,2)
assert Genotyping([['T', 4, 'SNV']]) == (1,1)
assert Genotyping([['T', 4, 'SNV'], [".", 2, 'match']]) == (0,1)

In [280]:
def DetermineAltsField(polymorphic_site):
    '''Collect the variant strings of every entry that is not the '.' one.

       input: list of [variant, count, type] lists
       output: list of variant strings in input order, or ['.'] when no
               entry other than '.' is present'''

    observed = [entry[0] for entry in polymorphic_site if entry[0] != '.']
    # An empty selection is reported as the single placeholder '.'.
    return observed if observed else ['.']

# Inline checks for DetermineAltsField: the '.' entry is never listed as an
# alternative, and '.' alone is returned when nothing else was observed.
assert DetermineAltsField([['.', 26, 'match']]) == ['.']
assert DetermineAltsField(([['.', 19, 'match'], ['A', 3, 'SNV']])) == ['A']
assert DetermineAltsField([['.', 2, 'match'], [".+3AC", 2, 'indel']]) == ['.+3AC']
assert DetermineAltsField([['T', 4, 'SNV'], ["C", 3, 'SNV']]) == ['T','C']
