In [19]:
import numpy as np
import sys
import os
import re

In [2]:
def isRevComp(seq1: 'str', seq2: 'str') -> 'bool':
    ''' Checks if seq is a reverse complement '''
    return seq1 == compliment(seq2)[::-1]

In [3]:
def compliment(seq1: 'str') -> 'str':
    ''' Returns complement of a seq '''
    seq1 = seq1.upper()
    baseMap = dict(A='T', T='A', G='C', C='G')
    compliment = ''
    for base in seq1:
        compliment += baseMap[base]
    return compliment

In [4]:
def isPal(seq1: 'str') -> 'bool':
    ''' Checks if seq is a palindrome '''
    iterLen = len(seq1)//2
    revHalf = seq1[-iterLen:][::-1]
    for idx in range(0,iterLen):
        if seq1[idx] != revHalf[idx]:
            return False
    return True

In [5]:
class K_mer():
    def __init__(self, text:'str', k:'int'):
        self.text = text
        self.k = k
        self.oriDict = self.FreqWord()
        self.maxFreqMer = self.MaxFreqMer()
    
    def FreqWord(self) -> 'dict of freq patterns':
        oriDict = dict()
        for i in range(len(self.text)-self.k):
            pattern = self.text[i:i+self.k]
            if pattern in oriDict:
                oriDict[pattern] += 1
            else:
                oriDict[pattern] = 1
        return oriDict
    
    def MaxFreqMer(self) -> 'tuple of most frequent k-mer':
        return max(self.oriDict.items(), key=lambda x:x[1])
    
def RangeMer(text:'str', min_k:'int', max_k:'int') -> 'list of most frequent k-mers in given k-range':
    '''Returns dict of most frequent k-mers'''
    kmerDict = dict()
    for k in range(min_k,max_k+1):
        kmerDict[f'{k}-mer'] = K_mer(text,k).MaxFreqMer()
    return kmerDict

In [6]:
# Vibrio cholera chrosome oriC
VCoriC = 'atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaac\
ctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgacca\
cggaaagatgatcaagagaggatgatttcttggccatatcgcaatgaatacttgtgactt\
gtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggatt\
acgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttagga\
tagacggtttttcatcactgactagccaaagccttactctgcctgacatcgaccgtaaat\
tgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaag\
atcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtt\
tccttaaccctctattttttacggaagaatgatcaagctgctgctcttgatcatcgtttc'

# Thermotoga Petrophila chromosome oriC
TPoriC = 'aactctatacctcctttttgtcgaatttgtgtgatttatagagaaaatcttattaactga\
aactaaaatggtaggtttggtggtaggttttgtgtacattttgtagtatctgatttttaa\
ttacataccgtatattgtattaaattgacgaacaattgcatggaattgaatatatgcaaa\
acaaacctaccaccaaactctgtattgaccattttaggacaacttcagggtggtaggttt\
ctgaagctctcatcaatagactattttagtctttacaaacaatattaccgttcagattca\
agattctacaacgctgttttaatgggcgttgcagaaaacttaccacctaaaatccagtat\
ccaagccgatttcagagaaacctaccacttacctaccacttacctaccacccgggtggta\
agttgcagacattattaaaaacctcatcagaagcttgttcaaaaatttcaatactcgaaa\
cctaccacctgcgtcccctattatttactactactaataatagcagtataattgatctga'

# No frequent 9-mers? Page 26
NOoriC = 'aatgatgatgacgtcaaaaggatccggataaaacatggtgattgcctcgcataacgcggt\
atgaaaatggattgaagcccgggccgtggattctactcaactttgtcggcttgagaaaga\
cctgggatcctgggtattaaaaagaagatctatttatttagagatctgttctattgtgat\
ctcttattaggatcgcactgccctgtggataacaaggatccggcttttaagatcaacaac\
ctggaaaggatcattaactgtgaatgatcggtgatcctggaccgtataagctgggatcag\
aatgaggggttatacacaactcaaaaactgaacaacagttgttctttggataactaccgg\
ttgatccaagcttcctgacagagttatccacagtagatcgcacgatctgtatacttattt\
gagtaaattaacccacgatcccagccattcttctgccggatcttccggaatgtcgtgatc\
aagaatgttgatcttcagtg'

In [7]:
Cholera = K_mer(VCoriC,9)
Thermo = K_mer(TPoriC,9)
No = K_mer(NOoriC,9)

# Finds top 10 most frequent k_mers
c9 = sorted(list(Cholera.FreqWord().items()), key=lambda x:x[1])[::-1][:11] # top 10 most frequent 9-mers
t9 = sorted(list(Thermo.FreqWord().items()), key=lambda x:x[1])[::-1][:11] # top 10 most frequent 9-mers
n9 = sorted(list(No.FreqWord().items()), key=lambda x:x[1])[::-1][:11] # top 10 most frequent 9-mers
print(c9,'\n\n',t9, '\n\n', n9)

[('cttgatcat', 3), ('tcttgatca', 3), ('ctcttgatc', 3), ('atgatcaag', 3), ('gctcttgat', 2), ('tgatcatcg', 2), ('ttgatcatc', 2), ('gcatgatca', 2), ('agcatgatc', 2), ('aagcatgat', 2), ('aatgatcaa', 2)] 

 [('acctaccac', 5), ('cctaccacc', 3), ('aacctacca', 3), ('aaacctacc', 3), ('ggtaggttt', 3), ('tggtaggtt', 3), ('tacctacca', 2), ('ttacctacc', 2), ('cttacctac', 2), ('acttaccta', 2), ('cacttacct', 2)] 

 [('aggatccgg', 2), ('aaggatccg', 2), ('atcttcagt', 1), ('gatcttcag', 1), ('tgatcttca', 1), ('ttgatcttc', 1), ('gttgatctt', 1), ('tgttgatct', 1), ('atgttgatc', 1), ('aatgttgat', 1), ('gaatgttga', 1)]


In [8]:
len('CTGCAATGCATGACAAGCCTGCAGT')
# (L,t)-Clump, where:
# L is length of typical length of oriC/text/read
# t is the number of times a k-mer occurs
# k will just be the k-mer seq length
# To find a clump, look for repeating "k-mers" in a region of length "L" while recording "t"

25

In [9]:
def gcSkew(seq: 'str') -> 'list':
    skewDict = dict(A=0, T=0, G=1, C=-1)
    skewList = []
    skewFactor = 0
    for base in seq:
        skewFactor += skewDict[base]
        skewList.append(skewFactor) 
    return skewList

In [10]:
se = 'CATGGGCATCGGCCATACGCC'
skewList = gcSkew(se)
print(skewList)
print(len(skewList))
minGC = min(gcSkew(se))
skewList.index(minGC)

[-1, -1, -1, 0, 1, 2, 1, 1, 1, 0, 1, 2, 1, 0, 0, 0, 0, -1, 0, -1, -2]
21


20

In [15]:
def hamDist(pattern:'str', text:'str') -> 'int':
    ''' Returns hamdist between a reference and target seq of same length'''
    assert(len(pattern) == len(text))
    pattern = pattern.upper()
    text = text.upper()
    hamDist = 0
    for base in range(len(text)):
        if pattern[base] != text[base]:
            hamDist += 1
    return hamDist

def hamDict(pattern: 'str', text: 'str', hamdist: 'int', r=False) -> 'dict':
    ''' finds all k-mers within allowed hamdist (inclusive) and returns a dict '''
    pattern = pattern.upper()
    reference = text.upper()
    targetDict = dict()
    oriD = K_mer(text, len(pattern)).oriDict
    for seq,freq in oriD.items():
        if r:
            if (hamDist(pattern, seq) <= hamdist) or (hamDist(compliment(pattern)[::-1], seq) <= hamdist):
                targetDict[seq] = freq
        else:
            if hamDist(pattern, seq) <= hamdist:
                targetDict[seq] = freq
    return targetDict

In [18]:
hamDict('ATGATCAAG',VCoriC,1,r=True)

{'atgatcaac': 1,
 'atgatcaag': 3,
 'catgatcat': 1,
 'atgatcatg': 1,
 'cttgatcat': 3}

In [None]:
hamDict('AAAAA','AACAAGCATAAACATTAAAGAG', 1)

In [26]:
for i in range(20):
    print(np.random.randint(1,11))

1
4
4
10
1
3
6
10
7
2
6
4
4
6
4
7
5
9
1
8


In [43]:
probs = [.2,.1,0,.7]
same = [.25]*4
def sEntropy(probList: 'list'):
    sumParts = []
    for p in probList:
        if p > 0:
            sumPart = p*(np.log2(p))
        else:
            sumPart = 0
        sumParts.append(sumPart)
    print(sumParts)
    return -sum(sumParts)
    

In [45]:
probsE = sEntropy(probs)
sameE = sEntropy(same)

[-0.46438561897747244, -0.33219280948873625, 0, -0.3602012209808308]
[-0.5, -0.5, -0.5, -0.5]


[0.25, 0.25, 0.25, 0.25]

In [16]:
def pal(word:"str", b:"int", e:"int"):
    if word[b] != word[e]:
        return False
    elif b >= e:
        return True
    elif word[b] == word[e]:
        return pal(word, b+1, e-1)

def palindrome(word:"str"):
    return pal(word, 0, len(word)-1)

In [17]:
c= "cat"
d = "dd"
x = "x"
r = "racecar"

In [18]:
palindrome(d)

TypeError: pal() takes 1 positional argument but 3 were given