# Notes

### NULL Model
- For a Markov(1) Model, we consider a preceding base
    - Ex:  
    
        $P(A|T_{prev}) = \frac{P(TA)}{P(T)} = \frac{\frac{c(TA)}{N}}{\frac{c(T)}{N}} = \frac{c(TA)}{c(T)}$  
        
        - N is not the same in the cases of TA and T, but the difference is negligible for now
- Increasing the order of a Markov Model makes our model more informed
- Check Week 2 Overview Video 3 for more details on detailed Markov Model for 4-mer described by $k_{1}k_{2}k_{3}k_{4}$

In [3]:
import sys
import os 
import numpy as np
from fastaReader import FastAreader

# holds files
dataList = os.listdir('mdata')
print(f"{os.listdir()} \n{os.listdir('mdata')}")
print('mdata/'+dataList[0])

### SEQUENCE LIST ###
seqList = []
readData = FastAreader('mdata/'+dataList[0]).readFasta()
for line in readData:
    seqList.append(line[1])

### REVERESE COMPLIMENT FUNCTION ###
def revComp(seqStr):
    newStr = ''
    baseDict = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
    for char in seqStr:
        newStr += baseDict[char]
    return newStr[::-1]

['.ipynb_checkpoints', 'Chan_Nicholas_serachForTheMissing.ipynb', 'data', 'dummm.txt', 'fastaReader.py', 'mdata', 'notes.md', 'pasgn2.ipynb', 'program.py', 'testing.ipynb', '__pycache__'] 
['Arthrospira-platensis-NIES-39.fna', 'Ecoli-UMN026.fa', 'Synechococcus7002.fna', 'Zm4-genomic.fna']
mdata/Arthrospira-platensis-NIES-39.fna


In [20]:
class SearchForMissing:
    def __init__(self, seqList, minMotif, maxMotif, zCut):
        self.seqList = seqList
        self.minMotif = minMotif
        self.maxMotif = maxMotif
        self.zCut = zCut
        
        self.N = self.genomeLen() # - self.maxMotif
        self.ctDict = self.ctDictMake()
        self.prDict = self.prDictMake()
        self.output = self.outputMake()

    def genomeLen(self):
        '''
        Takes the sum of sequence lengths from the list of sequences, seqList.
        seqList is made with the FastAreader class and has 1 fasta seq per entry.
        '''
        totLen = 0
        for seq in seqList:
            totLen += len(seq)
        return totLen

    def ctDictMake(self):
        '''
        Stores a dict of dicts
        {1:{'A':25, 'C':33,...}, ... ,8:{'AAAAGGGG':12, 'AAAATTTT':11, ...}}
        Reads over each sequence in the fasta. For each sequence, a sliding window
        moves over its respective sequence length and k-mers are counted.
        '''
        
        massDict = dict() # dictionary that holds dictionaries which hold k-mers and their respective counts
        for k in range(1, self.maxMotif+1): # Initialize dictionaries for min to max motif sizes within massDict; MAYBE START WITH 1 AND NOT MIN MOTIF
            massDict[k] = dict()
        print(massDict)
        for seq in self.seqList:
            for k in range(1, self.maxMotif+1):
                for kmer_idx in range((len(seq)-k)):
                    kmer = seq[kmer_idx:kmer_idx+k]
                    if 'N' not in kmer:
                        if kmer in massDict[k]:
                            massDict[k][kmer] += 1
                        else:
                            massDict[k][kmer] = 1
        return massDict
                                
    def prDictMake(self): # VERY DEPENDENT ON ctDict
        massDict = dict() # dictionary that holds dictionaries which hold k-mers and their respective counts
        for k in range(self.minMotif, self.maxMotif+1): # Initialize dictionaries for min to max motif sizes within massDict
            massDict[k] = dict()
#         print(massDict)
        for k in range(self.minMotif, self.maxMotif+1):
            for kmer in self.ctDict[k].keys():
                if (kmer[:-1] not in self.ctDict[k-1]) or (kmer[1:] not in self.ctDict[k-1]) or (kmer[1:-1] not in self.ctDict[k-2]): # If sub seq is not in sequence count 
                    massDict[k][kmer] = 0
                else:
                    massDict[k][kmer] = (1/self.N) * ((self.ctDict[k-1][kmer[:-1]] * self.ctDict[k-1][kmer[1:]]) / self.ctDict[k-2][kmer[1:-1]])
        return massDict

    def outputMake(self):
        finalList = []
        for k in range(self.minMotif, self.maxMotif+1):
            banned = set()
            for kmer in self.ctDict[k].keys():
                if kmer not in banned: # interesting condition, is it implied that the reverse complement of the kmer is excluded?
                    # sequenceReverse entry
                    reverseCompKmer = revComp(kmer) # K-mer reverse compliment variable
                    tmpList = sorted([kmer,reverseCompKmer]) # Temporary list used for sorting kmer and reverse kmer for output
                    sequenceReverse = f"{tmpList[0]}:{tmpList[1]}" # sequence: reverse entry for output
                    # count entry
                    if kmer != reverseCompKmer: # Case 1: k-mer is not a palindrome
                        if reverseCompKmer in self.ctDict[k]:
                            count = self.ctDict[k][kmer] + self.ctDict[k][reverseCompKmer] 
                        else:
                            count = self.ctDict[k][kmer]
                    else: # Case 2: k-mer is a palindrome
                        count = self.ctDict[k][kmer] # interesting change
#                         if kmer == "GCGATCGC":
#                             print(f'ct of GCGATCGC: {count}')
                    # expectation entry
                    if reverseCompKmer in self.ctDict[k]:
                        Pr = self.prDict[k][kmer] + self.prDict[k][reverseCompKmer] 
#                         if kmer == "GCGATCGC":
#                             print(f'pr of GCGATCGC: {Pr}')
                    else: 
                        Pr = self.prDict[k][kmer]
                    expectation = Pr * (self.N) # look at this
                    # zscore entry
                    numerator = count - expectation
                    sd = (expectation * (1 - Pr))**0.5
                    zscore = numerator/sd
                    entry = (sequenceReverse, count, expectation, zscore, k)
                    banned.add(kmer) # Mark
                    banned.add(revComp(kmer))
                    finalList.append(entry)
#         finalList.sort(key=lambda x:x[-2]) # sort by z-score
#         finalList.sort(reverse=True, key=lambda x:x[-1]) # sort by motif length
        
        return finalList

In [21]:
sfm = SearchForMissing(seqList, 3, 8, 0)

l = sfm.output
l.sort(key=lambda x:x[-2])
l.sort(reverse=True, key=lambda x:x[-1])

{1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}, 7: {}, 8: {}}


In [22]:
print(f'N = {sfm.N}')
for line in l:
    if line[-1] < 8:
        break

    print(line)
# expectation and z score for non palindromes are doubled :-(
'''
Expectation = Prob * Genome Length

Prob = Pr(GCGATCGC) = (1/N) * (count(GCGATCG) * count(CGATCGC))/count(CGATCG)

OR if kmer's reverse complement is different

Prob = Pr(CGATCGCC) + Pr(GGCGATCG)

Z = (observed - expectation)/sd

sd = sqrt(N*P * (1-P))
'''

N = 6788435
('GCGATCGC:GCGATCGC', 7683, 15876.382478446401, -65.10223493467082, 8)
('AATTAATT:AATTAATT', 558, 1290.1022353145038, -20.38454117072713, 8)
('AAAATTTT:AAAATTTT', 501, 1092.5187558906694, -17.89736186597799, 8)
('TTAATTAA:TTAATTAA', 537, 1102.20874134225, -17.02598335646839, 8)
('AAATATTT:AAATATTT', 365, 852.584289496911, -16.699686369886066, 8)
('ATAATTAT:ATAATTAT', 364, 840.3257702412229, -16.432631605479443, 8)
('CAAATTTG:CAAATTTG', 245, 579.1215834118756, -13.88475372293913, 8)
('ACAATTGT:ACAATTGT', 115, 375.3628180039139, -13.438950223510378, 8)
('TTTTAAAA:TTTTAAAA', 298, 625.0773652694612, -13.082887292564482, 8)
('TAATATTA:TAATATTA', 296, 611.332744924978, -12.754099420638923, 8)
('ATTTAAAT:ATTTAAAT', 195, 472.0488622754492, -12.751990888788157, 8)
('AAACGTTT:AAACGTTT', 89, 305.1719352351581, -12.374759766064596, 8)
('TGATATCA:TGATATCA', 300, 592.1850101282917, -12.007377655226804, 8)
('CCCTAGGG:CCCTAGGG', 232, 498.0984490896831, -11.923415610357333, 8)
('AATATATT:AA

('TCTAGTCA:TGACTAGA', 127, 166.30078883329233, -3.0476102988353855, 8)
('GAGTATCC:GGATACTC', 112, 149.22035031974863, -3.046991215037936, 8)
('CATTCATC:GATGAATG', 238, 289.87123456048505, -3.04672528616986, 8)
('AAACACCT:AGGTGTTT', 234, 285.47454913271173, -3.0466178715991377, 8)
('CCATAATC:GATTATGG', 452, 521.5258400127466, -3.044563329715412, 8)
('AAGATATA:TATATCTT', 183, 229.0745140801343, -3.0442457219122803, 8)
('AGGCGGAT:ATCCGCCT', 144, 185.43203108367044, -3.042634447312455, 8)
('ATGGGACA:TGTCCCAT', 112, 149.14920184573174, -3.0418920058039864, 8)
('CCACTAGC:GCTAGTGG', 192, 239.00545869537925, -3.040549166264462, 8)
('GAATACTC:GAGTATTC', 111, 147.98022138801608, -3.0399917433458126, 8)
('CGTACCCC:GGGGTACG', 72, 102.81105765175714, -3.0387150854799856, 8)
('CGGGGAAG:CTTCCCCG', 245, 297.3390397576638, -3.035353626027701, 8)
('AACCATAT:ATATGGTT', 250, 302.81508315077457, -3.035141219094244, 8)
('AATTGAGT:ACTCAATT', 398, 463.3248031496064, -3.034940324246597, 8)
('CCGTTTAG:CTAAACGG'

('ATGAGCAT:ATGCTCAT', 124, 151.99955037246175, -2.271092211323397, 8)
('AGCAACTG:CAGTTGCT', 232, 269.25455572058087, -2.2704213996021485, 8)
('CAAGCATC:GATGCTTG', 181, 214.2273981867168, -2.2702081689672284, 8)
('CGTTCAAG:CTTGAACG', 92, 116.50200487538734, -2.2700668622569484, 8)
('CGGTGTGC:GCACACCG', 41, 58.335989974339505, -2.269771962363637, 8)
('AATTGTTA:TAACAATT', 474, 526.0573175519381, -2.2697711977513313, 8)
('CGCCTGAA:TTCAGGCG', 191, 225.04698806400194, -2.2695998543329146, 8)
('CTAAAGTA:TACTTTAG', 191, 225.04274056537966, -2.269338127570031, 8)
('GAAACCCC:GGGGTTTC', 459, 510.24412337001723, -2.268670294560692, 8)
('GTCGTCCA:TGGACGAC', 57, 76.88850174216029, -2.2681607357706706, 8)
('ATAGAAGA:TCTTCTAT', 202, 236.9057398897369, -2.267862055535011, 8)
('AGGAAGTA:TACTTCCT', 136, 165.1411904724654, -2.267696170204318, 8)
('GAGGAGGC:GCCTCCTC', 124, 151.95200216069824, -2.267590184635454, 8)
('GATAGGCC:GGCCTATC', 24, 37.97237005911146, -2.2674485519771124, 8)
('GGTAATTA:TAATTACC', 4

('GAATTATA:TATAATTC', 243, 272.03459887070113, -1.760404461663202, 8)
('CAAAAACC:GGTTTTTG', 648, 694.3631897536558, -1.7595522792985012, 8)
('GAAACGGC:GCCGTTTC', 223, 250.86147928064338, -1.759118947386416, 8)
('GTGCCGTA:TACGGCAC', 32, 43.615356422943016, -1.75879095347408, 8)
('CCCCTTCG:CGAAGGGG', 107, 126.80347477784247, -1.7586518712139019, 8)
('TGATTAAA:TTTAATCA', 843, 895.6221053490693, -1.7584679958222225, 8)
('GGGATCCC:GGGATCCC', 6, 12.122448979591839, -1.7584514412115093, 8)
('AGGCCACG:CGTGGCCT', 8, 14.754312270980426, -1.7584180047286444, 8)
('CCTTTATA:TATAAAGG', 184, 209.4457299428322, -1.758271182310302, 8)
('CACAACTG:CAGTTGTG', 224, 251.90506961929393, -1.7582179700598914, 8)
('AGCAGGAG:CTCCTGCT', 148, 170.98516621634337, -1.7578179733859012, 8)
('AAGCGTAG:CTACGCTT', 33, 44.75990039117636, -1.7577638123919792, 8)
('AAGGTACA:TGTACCTT', 143, 165.6175171863079, -1.7575071152578126, 8)
('TCAGAGCA:TGCTCTGA', 129, 150.55965704231448, -1.7570833191794217, 8)
('TGATCGCA:TGCGATCA', 

('GAATAAAA:TTTTATTC', 300, 323.32177459513395, -1.297043254095478, 8)
('ATGGCCGG:CCGGCCAT', 69, 80.64733129966775, -1.2969821773610994, 8)
('CCAGCAAG:CTTGCTGG', 159, 176.21563720265164, -1.2969000780440698, 8)
('TTCTTAAA:TTTAAGAA', 241, 261.9898669763826, -1.2968084141506393, 8)
('CGGTAGAG:CTCTACCG', 119, 134.0050038181283, -1.296223495057408, 8)
('CTGAGGAA:TTCCTCAG', 307, 330.56581075425487, -1.2961760515726048, 8)
('GTAAGCCA:TGGCTTAC', 170, 187.7594124359444, -1.29608576430839, 8)
('CCATTGTA:TACAATGG', 229, 249.4702789625433, -1.2960517265470817, 8)
('CTCGGGCC:GGCCCGAG', 1, 3.383808095952024, -1.2958911861239981, 8)
('ATCGAAAT:ATTTCGAT', 243, 264.051422145148, -1.2955245028664493, 8)
('ATCTTGGG:CCCAAGAT', 314, 337.80979052675883, -1.2954786633569069, 8)
('GGGAGTGA:TCACTCCC', 179, 197.19011850270573, -1.295386188123652, 8)
('GCAGCTCC:GGAGCTGC', 33, 41.32717076184593, -1.2953315846824218, 8)
('ATATCGAG:CTCGATAT', 217, 236.93794359650775, -1.2953008034086972, 8)
('GTTAGTCA:TGACTAAC', 20

('AGTAATTG:CAATTACT', 390, 407.95911456487477, -0.889179928459961, 8)
('CACACACG:CGTGTGTG', 15, 18.86122381814335, -0.8890796000797185, 8)
('ATTCAGCA:TGCTGAAT', 334, 350.64583731245096, -0.8889606119166267, 8)
('ATGACCCG:CGGGTCAT', 194, 206.78118631300833, -0.8888372948889408, 8)
('CGGGGACC:GGTCCCCG', 38, 43.88762237568295, -0.8887309900774093, 8)
('GGCCGACA:TGTCGGCC', 30, 35.27769099138092, -0.8885768844660411, 8)
('ACGCTACA:TGTAGCGT', 75, 83.09838155305634, -0.888391940600923, 8)
('AAGAGCAG:CTGCTCTT', 92, 100.92430796995674, -0.8883413692434324, 8)
('AATCGACA:TGTCGATT', 259, 273.69014710831215, -0.8879843633544363, 8)
('GTACTTCA:TGAAGTAC', 69, 76.7784396960725, -0.8877185682999669, 8)
('CTGACTTC:GAAGTCAG', 149, 160.23627311978345, -0.8876607052509142, 8)
('CCGTGGCG:CGCCACGG', 130, 140.52064596691594, -0.8875173105771293, 8)
('GACTAAAC:GTTTAGTC', 246, 260.3177697138967, -0.8874252728158509, 8)
('GGAACCGA:TCGGTTCC', 277, 292.1674438135897, -0.8873722235860801, 8)
('CGGGTTCC:GGAACCCG', 

('ACGACCTA:TAGGTCGT', 84, 89.50757095821474, -0.5821475561524828, 8)
('CGTTAGCC:GGCTAACG', 112, 118.33120681878191, -0.5820236591210766, 8)
('AGTAAGGA:TCCTTACT', 97, 102.90386888664823, -0.5820015538510861, 8)
('AAAACGTT:AACGTTTT', 252, 261.4073731343284, -0.5818590797939122, 8)
('ACTTTGCA:TGCAAAGT', 114, 120.38340881524594, -0.5817992960873252, 8)
('GAACTCCC:GGGAGTTC', 248, 257.33116966207456, -0.5816987366332025, 8)
('CACACGGG:CCCGTGTG', 69, 74.00391816920944, -0.5816814430234503, 8)
('ATCAACGG:CCGTTGAT', 296, 306.1757262081905, -0.5815537077919413, 8)
('CATAATCC:GGATTATG', 365, 376.2795472374461, -0.5814982307070427, 8)
('CATTTGAG:CTCAAATG', 237, 246.11822442169927, -0.5812276460853453, 8)
('AATCAGGC:GCCTGATT', 287, 297.0160348065353, -0.581186313211459, 8)
('TAGGACAA:TTGTCCTA', 268, 277.6844514855186, -0.5811766124277277, 8)
('ATACGCAC:GTGCGTAT', 28, 31.246786632390748, -0.5808340498840775, 8)
('CAGCTCCC:GGGAGCTG', 59, 63.6330655872494, -0.5808032803867079, 8)
('AGGCCGTG:CACGGCCT',

('AAATACAG:CTGTATTT', 347, 350.882426719179, -0.20726868041109817, 8)
('AAACCCGG:CCGGGTTT', 505, 509.67570477307044, -0.20711732085576684, 8)
('ACCCTCGA:TCGAGGGT', 160, 162.63985929486321, -0.20700102077971286, 8)
('AGGTGTCG:CGACACCT', 102, 104.11049226695553, -0.20684253204416098, 8)
('AGGAAGCG:CGCTTCCT', 230, 233.15786648714544, -0.2068122151543682, 8)
('TGAGAGAA:TTCTCTCA', 166, 168.68507583348816, -0.20673957606505505, 8)
('AGCTCACT:AGTGAGCT', 40, 41.32682781743457, -0.20639532436223063, 8)
('AATGTCCA:TGGACATT', 143, 145.488935807268, -0.2063494739611937, 8)
('AGAAAGCC:GGCTTTCT', 259, 262.33608304547596, -0.20597581496777667, 8)
('AGGGGTTC:GAACCCCT', 310, 313.64727038465753, -0.20594783801922714, 8)
('CACGAACC:GGTTCGTG', 64, 65.66772268566251, -0.20580217452167873, 8)
('TCGACAGA:TCTGTCGA', 93, 95.0035207593362, -0.20555452213897643, 8)
('CACAATAC:GTATTGTG', 126, 128.32845993940646, -0.20554711437523926, 8)
('CGTACCGA:TCGGTACG', 33, 34.19989544045708, -0.20517862159980912, 8)
('ATACG

('GACACCCA:TGGGTGTC', 194, 193.21277617813828, 0.05663520643082427, 8)
('CGGTTGGC:GCCAACCG', 149, 148.30823521942477, 0.05680421438413765, 8)
('GCCACGGA:TCCGTGGC', 81, 80.48967413864675, 0.056882682306655385, 8)
('ACTTAGGC:GCCTAAGT', 92, 91.45288483551354, 0.05721145237489497, 8)
('GCGTAGAC:GTCTACGC', 47, 46.608991838490006, 0.05727333318570333, 8)
('AACGGACA:TGTCCGTT', 129, 128.35033126832735, 0.05734526626072995, 8)
('GACATGAC:GTCATGTC', 91, 90.45292309170391, 0.05752279423773498, 8)
('AGATCATC:GATGATCT', 175, 174.24048817466317, 0.05753943248700516, 8)
('CAATCACC:GGTGATTG', 485, 483.73384030418254, 0.057570595810108684, 8)
('CCGACCAC:GTGGTCGG', 147, 146.3014277889505, 0.05775528126950976, 8)
('GGTGTATA:TATACACC', 140, 139.31749802590684, 0.05782365439549029, 8)
('AGCAATTC:GAATTGCT', 422, 420.80543114493366, 0.05823498667546952, 8)
('TGTGCCAA:TTGGCACA', 171, 170.2381015542101, 0.058394796527042506, 8)
('AAAGGTAA:TTACCTTT', 379, 377.86056441803385, 0.05861868615017884, 8)
('ACAAGTTC:G

('GATGATCA:TGATCATC', 337, 331.2292239057875, 0.31708850331301347, 8)
('GAGGGGCC:GGCCCCTC', 21, 19.595427365080443, 0.3172980914816529, 8)
('CAACAGTG:CACTGTTG', 238, 233.15195924354342, 0.31750764421524746, 8)
('CCAGTCGC:GCGACTGG', 192, 187.64835680243024, 0.31767782372659426, 8)
('AAGAACGT:ACGTTCTT', 151, 147.1436865676793, 0.3179114834720008, 8)
('GAGACGCC:GGCGTCTC', 5, 4.337783455764932, 0.31795541910473346, 8)
('GATCATGC:GCATGATC', 87, 84.08265095676984, 0.31815436406555536, 8)
('ATTCCTTA:TAAGGAAT', 264, 258.8806889887504, 0.3181778685128589, 8)
('CAACTAGC:GCTAGTTG', 235, 230.16994914055206, 0.3183719193845404, 8)
('GACTCACC:GGTGAGTC', 175, 170.8383103827222, 0.3184069348562055, 8)
('GCCTGAAA:TTTCAGGC', 299, 293.54451187287003, 0.3184241677620002, 8)
('ACTGGCGG:CCGCCAGT', 295, 289.5803447297489, 0.31849042909977043, 8)
('CAATAAAG:CTTTATTG', 363, 356.97698027114564, 0.3187906676143301, 8)
('GTACGGAC:GTCCGTAC', 41, 39.0079476861167, 0.31895207832699096, 8)
('CAGTCTAA:TTAGACTG', 185, 

('ATGAGCAG:CTGCTCAT', 145, 137.15581619550292, 0.6697998321680735, 8)
('GTCCGTGA:TCACGGAC', 71, 65.57447848101266, 0.6700021021101988, 8)
('CACTGGAA:TTCCAGTG', 180, 171.230588741236, 0.670170396902546, 8)
('CTGTTGTA:TACAACAG', 280, 268.9998292402463, 0.6707057501057048, 8)
('CACATGTA:TACATGTG', 2, 1.25, 0.6708204550112713, 8)
('GCAAATGC:GCATTTGC', 191, 181.95111851242885, 0.670846805095312, 8)
('AATATCTC:GAGATATT', 451, 436.96994608587505, 0.6711931785563302, 8)
('GTGCGCGA:TCGCGCAC', 99, 92.53920450224352, 0.6716237941749499, 8)
('ATTTTTGA:TCAAAAAT', 1003, 981.9523520408206, 0.6717223158490411, 8)
('TAAAGTCA:TGACTTTA', 325, 313.1124966963876, 0.6718161906746782, 8)
('TGCGTAAA:TTTACGCA', 137, 129.35875222445426, 0.6718471998339746, 8)
('CGGAGGGA:TCCCTCCG', 92, 85.77721705490488, 0.6718950176332413, 8)
('AAGCCATA:TATGGCTT', 508, 493.0750773366425, 0.6721579453307628, 8)
('AACTGTGA:TCACAGTT', 243, 232.7456550034454, 0.6721629632764304, 8)
('CCCCTGAA:TTCAGGGG', 283, 271.91251769075666, 0.6

('ACCCTAGA:TCTAGGGT', 361, 342.7950253730034, 0.9832949033174069, 8)
('ATCGAAGC:GCTTCGAT', 202, 188.498379188784, 0.9834179394653347, 8)
('CTCCAAAG:CTTTGGAG', 233, 218.46321455179273, 0.98352682396762, 8)
('CTCAGCGC:GCGCTGAG', 103, 93.48757898466566, 0.983823286011094, 8)
('CAGCAACG:CGTTGCTG', 163, 150.9098968290627, 0.9841832584817355, 8)
('ACATCATC:GATGATGT', 459, 438.3866190088211, 0.9845428191534541, 8)
('CACTAGGA:TCCTAGTG', 165, 152.8277748522982, 0.9846314889051485, 8)
('GTCGACCA:TGGTCGAC', 2, 1.0101252219569687, 0.9849012378982102, 8)
('GACACGTC:GACGTGTC', 1, 0.38709677419354843, 0.9851041380806895, 8)
('AAGGCACC:GGTGCCTT', 30, 25.067407139120288, 0.985193106729359, 8)
('GAAGAAAA:TTTTCTTC', 434, 413.9480918725781, 0.9855893575620478, 8)
('AAGGTGTC:GACACCTT', 108, 98.2306758056758, 0.9856985055965554, 8)
('GCCACGTA:TACGTGGC', 17, 13.392230854605996, 0.9858542315820874, 8)
('ATCGTAGA:TCTACGAT', 87, 78.27721459417562, 0.9859162176537533, 8)
('CTACTCGC:GCGAGTAG', 126, 115.4074652560

('CACCGTAA:TTACGGTG', 117, 102.89153120999342, 1.390891835736611, 8)
('ATTTGCCA:TGGCAAAT', 539, 507.662397052753, 1.3908955423256806, 8)
('AGTCAGGG:CCCTGACT', 237, 216.53245789140922, 1.3909487105418084, 8)
('ATTAACCG:CGGTTAAT', 525, 494.0828141758237, 1.390963991849713, 8)
('CTCGGGAA:TTCCCGAG', 89, 76.804607301374, 1.391568565342655, 8)
('ATATGTTC:GAACATAT', 220, 200.3040198518443, 1.391679316455598, 8)
('CCACGATA:TATCGTGG', 124, 109.43833141498578, 1.391970116902622, 8)
('AGGCAATC:GATTGCCT', 165, 148.06213609336996, 1.3920068420928886, 8)
('CACTAATC:GATTAGTG', 230, 209.83434058268608, 1.392133415617581, 8)
('ATAGCGAT:ATCGCTAT', 769, 731.3533761063792, 1.3921492432042155, 8)
('CCCGGGAA:TTCCCGGG', 109, 95.40216923393665, 1.392174128137719, 8)
('CTTAGAAA:TTTCTAAG', 413, 385.657800170198, 1.3923381722132862, 8)
('GATCTTGA:TCAAGATC', 160, 143.32606225023625, 1.3927724768191483, 8)
('GGGGGACA:TGTCCCCC', 214, 194.57059655545387, 1.3929229216042396, 8)
('GGAGCATA:TATGCTCC', 116, 101.93335409

('CTCCGGTC:GACCGGAG', 150, 129.532199255331, 1.7984006643071597, 8)
('GATGCAAA:TTTGCATC', 272, 243.9076436409376, 1.7988016027431475, 8)
('ACACTGGG:CCCAGTGT', 139, 119.34832847535458, 1.798850799843937, 8)
('TGAGAGCA:TGCTCTCA', 145, 124.89469338040236, 1.7990478013225513, 8)
('ACCCTCAA:TTGAGGGT', 591, 548.8498572541946, 1.7992430308334715, 8)
('AGAGCGGG:CCCGCTCT', 64, 51.132580095960385, 1.7994685285151515, 8)
('ATGCGTGG:CCACGCAT', 38, 28.407624633431084, 1.7997393548251992, 8)
('CATTGACG:CGTCAATG', 216, 191.11637499758197, 1.7999929814198958, 8)
('AATGAAAG:CTTTCATT', 242, 215.57120929044748, 1.8000675733276914, 8)
('CTTGTGTA:TACACAAG', 83, 68.13953399331804, 1.8002591572654532, 8)
('ATTTTGCC:GGCAAAAT', 690, 644.3001198428727, 1.8004935331621243, 8)
('AAAGGGGT:ACCCCTTT', 369, 335.996625726649, 1.8005352765694556, 8)
('ACTCCACG:CGTGGAGT', 75, 60.94314163888894, 1.800643754621747, 8)
('ATTCAAAC:GTTTGAAT', 339, 307.422754491018, 1.8010097874422812, 8)
('ACCGAGCC:GGCTCGGT', 113, 95.4050174

('AAGATGCG:CGCATCTT', 324, 286.15786339362023, 2.2370822949165126, 8)
('AGTCCTGA:TCAGGACT', 326, 288.0328420941562, 2.237156145733174, 8)
('CATATGCG:CGCATATG', 25, 16.03915290008005, 2.23747847635796, 8)
('ATCGGTGC:GCACCGAT', 175, 147.79426822890383, 2.2378776750742344, 8)
('CGTAATAC:GTATTACG', 84, 65.83993171319543, 2.238076347097666, 8)
('AGAGAAAG:CTTTCTCT', 157, 131.34990398571506, 2.238093968923946, 8)
('GACCTGAA:TTCAGGTC', 267, 232.84855692511331, 2.238101264814891, 8)
('ACGAGGTA:TACCTCGT', 102, 81.7623979549765, 2.2381276727767343, 8)
('TCACCAGA:TCTGGTGA', 362, 321.84491189540694, 2.238347725119957, 8)
('GGGCAACC:GGTTGCCC', 195, 166.14741315594978, 2.2384287732934802, 8)
('TGCGGCCA:TGGCCGCA', 99, 79.09191176470588, 2.2385461400707722, 8)
('AAATGCCG:CGGCATTT', 307, 270.2022576876254, 2.23864552041336, 8)
('CCCCTCGG:CCGAGGGG', 173, 145.95158561800164, 2.23893801944998, 8)
('CGGGCTCA:TGAGCCCG', 64, 48.420095799152136, 2.2389969892771857, 8)
('TCACCCCA:TGGGGTGA', 570, 518.99290747345

('CGTCGCAG:CTGCGACG', 142, 110.44689034029454, 3.0024027792867107, 8)
('AAACGCAA:TTGCGTTT', 196, 158.23227842820992, 3.002467440504764, 8)
('ATATGTGA:TCACATAT', 104, 77.55365900902648, 3.003080179928857, 8)
('AGGCTAAC:GTTAGCCT', 224, 183.33199995877226, 3.0035823368868684, 8)
('AAAATGTC:GACATTTT', 413, 356.28743407489986, 3.0046261420223863, 8)
('ACGTCAGC:GCTGACGT', 54, 35.9767644054991, 3.004850430555415, 8)
('CCCAAAAC:GTTTTGGG', 988, 897.9617869650231, 3.0048767615184326, 8)
('AAATGGTC:GACCATTT', 241, 198.6425618841625, 3.0053834147223513, 8)
('TATACGAA:TTCGTATA', 57, 38.37956253019475, 3.0056686237080803, 8)
('ACAGTAGA:TCTACTGT', 226, 185.10710157465286, 3.0056767692811444, 8)
('GACGCAGA:TCTGCGTC', 131, 100.81775972306028, 3.005980616173087, 8)
('TATTCTCA:TGAGAATA', 412, 355.33544727730174, 3.006100978203156, 8)
('AATCTATC:GATAGATT', 289, 242.21519214787864, 3.0061587316841947, 8)
('AACATCAC:GTGATGTT', 277, 231.2721767843536, 3.006950176234509, 8)
('CAGGGTGA:TCACCCTG', 296, 248.5762

('TCCCGCCA:TGGCGGGA', 410, 309.26138089461114, 5.728528008599062, 8)
('ACTCAACC:GGTTGAGT', 450, 343.7738223750033, 5.729360501938033, 8)
('CACTGTTC:GAACAGTG', 232, 159.59995285242812, 5.730962609074836, 8)
('TAATCCTA:TAGGATTA', 562, 441.520812680224, 5.733905919934052, 8)
('CTGCGATA:TATCGCAG', 105, 60.41204860988317, 5.736640965015449, 8)
('CCGGAAAA:TTTTCCGG', 628, 499.735625920976, 5.737885486777376, 8)
('ATTCATTG:CAATGAAT', 362, 267.9801545257462, 5.743502160050024, 8)
('ATTGGCTC:GAGCCAAT', 280, 198.955980531595, 5.745777935941546, 8)
('ACAAATGG:CCATTTGT', 387, 289.2602101880544, 5.746935263349442, 8)
('AGGGGAAA:TTTCCCCT', 827, 677.1931236318775, 5.757012094778121, 8)
('CCGGTGGA:TCCACCGG', 335, 244.89053834853058, 5.758272665730026, 8)
('AAACCTAA:TTAGGTTT', 399, 299.3700635573671, 5.7583126966265565, 8)
('CAGGGACA:TGTCCCTG', 243, 168.29684955441596, 5.758459384683433, 8)
('GATTCGAA:TTCGAATC', 27, 9.370162590311846, 5.759370842433017, 8)
('AACATTTG:CAAATGTT', 388, 289.7815884146277, 5

"\nExpectation = Prob * Genome Length\n\nProb = Pr(GCGATCGC) = (1/N) * (count(GCGATCG) * count(CGATCGC))/count(CGATCG)\n\nOR if kmer's reverse complement is different\n\nProb = Pr(CGATCGCC) + Pr(GGCGATCG)\n\nZ = (observed - expectation)/sd\n\nsd = sqrt(N*P * (1-P))\n"

In [58]:
s = sfm.prDict[8]['GCGATCGC'] +sfm.prDict[8][revComp('GCGATCGC')]


0.004677479412691261

In [71]:
k = 8
kmer = 'GCGATCGC'

num = sfm.ctDict[k-1][kmer[:-1]] * sfm.ctDict[k-1][kmer[1:]]
den = sfm.ctDict[k-2][kmer[1:-1]]*
pr = num/den
print(f'pr = {pr}')

pr = 7938.1912392232


In [67]:
numm = sfm.ctDict[7]['GCGATCG'] * sfm.ctDict[7]['CGATCGC']
denn = sfm.ctDict[6]['CGATCG'] * sfm.N
prr = numm/denn
print(f'prr = {prr}')


prr = 0.0011693698531728151


In [8]:
ct = sfm.ctDict[8]['GCGATCGC']
print(ct)

7683


In [77]:
sfm.ctDict[7]['CGATCGC']

9570

In [78]:
sfm.ctDict[7][revComp('GCGATCG')]

9570

In [79]:
sfm.ctDict[7][revComp('CGATCGC')]

9525