# Notes

### NULL Model
- For a Markov(1) Model, we consider a preceding base
    - Ex:  
    
        $P(A|T_{prev}) = \frac{P(TA)}{P(T)} = \frac{\frac{c(TA)}{N}}{\frac{c(T)}{N}} = \frac{c(TA)}{c(T)}$  
        
        - Strictly, N is not the same for TA and T (a sequence has one fewer 2-mer window than 1-mer windows), but the difference is negligible for now
- Increasing the order of a Markov Model makes our model more informed
- See Week 2 Overview Video 3 for details on the Markov Model for a 4-mer described by $k_{1}k_{2}k_{3}k_{4}$

In [170]:
import sys
import os 
import numpy as np
from fastaReader import FastAreader

# Names of the entries found in the 'mdata' directory
dataList = os.listdir('mdata')
print(f"{os.listdir()} \n{os.listdir('mdata')}")
print('mdata/'+dataList[0])

### SEQUENCE LIST ###
# Read the first file listed in 'mdata' and keep element [1] of every record
# (presumably the sequence string of a (header, sequence) pair -- confirm
# against fastaReader.py).
readData = FastAreader('mdata/'+dataList[0]).readFasta()
seqList = [record[1] for record in readData]
    
### SEQUENCE STRING ###
# seqString = ''.join(seqList)

### SEQUENCE LENGTH ###
# seqLength = len(seqList[0])

### REVERSE COMPLEMENT FUNCTION ###
def revComp(seqStr):
    '''
    Return the reverse complement of seqStr.
    Each base is swapped for its partner (A<->T, C<->G, N stays N) and the
    result is read backwards. Any other character raises KeyError.
    '''
    partner = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
    complemented = ''.join([partner[base] for base in seqStr])
    return complemented[::-1]

### PALINDROME LIST ###
# BAD: relies on seqString (all records joined together), which is wrong when the FASTA file holds more than one sequence
# palList = []
# for i in range(seqLength - 8):
#     palList.append(f"{seqString[i:8+i]}:{revComp(seqString[i:8+i])}")

['.ipynb_checkpoints', 'data', 'dummm.txt', 'fastaReader.py', 'mdata', 'notes.md', 'program.py', 'testing.ipynb', '__pycache__'] 
['Arthrospira-platensis-NIES-39.fna', 'Ecoli-UMN026.fa', 'Synechococcus7002.fna', 'Zm4-genomic.fna']
mdata/Arthrospira-platensis-NIES-39.fna


In [203]:
class SearchForMissing:
    '''
    Counts k-mers over a list of sequences and derives, for every k-mer of
    length minMotif..maxMotif, a probability from the counts of its shorter
    sub-words (see prDictMake).

    Attributes set by __init__:
        seqList  - list of sequence strings, one entry per fasta record
        minMotif - smallest k-mer length that gets a probability
        maxMotif - largest k-mer length that is counted
        zCut     - stored as given; no method of this class reads it yet
        N        - total sequence length minus maxMotif
        ctDict   - {k: {kmer: count}} for k = 1..maxMotif
        prDict   - {k: {kmer: probability}} for k = minMotif..maxMotif
    '''

    def __init__(self, seqList, minMotif, maxMotif, zCut):
        self.seqList = seqList
        self.minMotif = minMotif
        self.maxMotif = maxMotif
        self.zCut = zCut

        # The counts must exist before the probabilities can be derived from them.
        self.N = self.genomeLen() - self.maxMotif
        self.ctDict = self.ctDictMake()
        self.prDict = self.prDictMake()

    def genomeLen(self):
        '''
        Returns the sum of the lengths of the sequences held by this instance.
        '''
        # Read the instance's own list, not a module-level variable of the same name.
        return sum(len(seq) for seq in self.seqList)

    def ctDictMake(self):
        '''
        Builds and returns a dict of dicts
        {1:{'A':25, 'C':33,...}, ... ,8:{'AAAAGGGG':12, 'AAAATTTT':11, ...}}
        For each sequence and each k in 1..maxMotif a sliding window moves over
        the sequence and every k-mer is counted. Windows containing 'N' are
        skipped. Sequences are processed separately, so no k-mer spans two records.

        An earlier variant also incremented the reverse complement of every
        window; it was dropped because it was slow.
        '''
        # One inner dictionary per k-mer length, starting at 1 because
        # prDictMake needs the counts of the shorter sub-words.
        massDict = {k: dict() for k in range(1, self.maxMotif + 1)}
        print(massDict)
        for seq in self.seqList:
            for k in range(1, self.maxMotif + 1):
                counts = massDict[k]
                # +1 so the final window (ending on the last base) is included.
                for start in range(len(seq) - k + 1):
                    kmer = seq[start:start + k]
                    if 'N' not in kmer:
                        counts[kmer] = counts.get(kmer, 0) + 1
        return massDict

    # TODO: this method is slated for removal.
    def finalCtDictMake(self):
        '''
        Merges every k-mer with its reverse complement and returns
        {k: {'first:second': count}} where first/second are the k-mer and its
        reverse complement in alphabetical order.

        Case 1: both k-mer and reverse complement were counted -> counts are summed.
        Case 2: the k-mer is its own reverse complement -> its count is used once.
        Case 3: the reverse complement was never seen -> it contributes 0.
        '''
        finalDict = {k: dict() for k in range(1, self.maxMotif + 1)}
        banned = set()  # k-mers already merged through their partner
        for k in range(1, self.maxMotif + 1):
            counts = self.ctDict[k]
            for kmer in counts:
                if kmer in banned:
                    continue
                partner = revComp(kmer)
                first, second = sorted([kmer, partner])
                if kmer == partner:
                    total = counts[kmer]
                else:
                    total = counts[kmer] + counts.get(partner, 0)
                finalDict[k][f'{first}:{second}'] = total
                banned.add(kmer)
                banned.add(partner)
        return finalDict

    def meanAndSD(self):
        '''
        Placeholder: the computation has not been written yet.
        '''
        raise NotImplementedError('meanAndSD is not implemented yet')

    def prDictMake(self): # VERY DEPENDENT ON ctDict
        '''
        Builds and returns {k: {kmer: probability}} for k = minMotif..maxMotif.
        For a k-mer the value is
            (1/N) * count(kmer without last base) * count(kmer without first base)
                  / count(kmer without first and last base)
        The middle word has length k-2, and ctDict starts at length 1, so
        minMotif must be at least 3 or the lookup raises KeyError.
        '''
        massDict = {k: dict() for k in range(self.minMotif, self.maxMotif + 1)}
        for k in range(self.minMotif, self.maxMotif + 1):
            shorter = self.ctDict[k-1]
            middle = self.ctDict[k-2]
            for kmer in self.ctDict[k]:
                massDict[k][kmer] = (1/self.N) * ((shorter[kmer[:-1]] * shorter[kmer[1:]]) / middle[kmer[1:-1]])
        return massDict
    

In [205]:
# Count k-mers up to length 8 and compute probabilities for lengths 3..8 (zCut = 0).
sfm = SearchForMissing(seqList, 3, 8, 0)
# Shorthand for the {k: {kmer: count}} dictionary built by the constructor.
countDict = sfm.ctDict

{1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}, 7: {}, 8: {}}


In [216]:
# AAAATTTA:TAAATTTT
# Compare the count of one 8-mer with the count of its reverse complement.
a = countDict[8]['AAAATTTA']
b = countDict[8][revComp('AAAATTTA')]
print(a, b)

Genome Len N: 6788427
418 416


In [319]:
# Scratch calculation of an expected count and Z-score for one 8-mer pair.
# Basically Expected count = Pr(K) * N
# The extra factor of 2 below is presumably meant to account for both strands -- unconfirmed.
pra = sfm.prDict[8]['AAAATTTA']
prb = sfm.prDict[8][revComp('AAAATTTA')]
prab = pra + prb  # combined probability of the k-mer and its reverse complement
N = sfm.N # GOOD
exp = (prab * sfm.N*2)  # Expected counts had to be combined for the pair to match the reference answers (Harrison's)   # GOOD
obs = (sfm.ctDict[8]['AAAATTTA'] + sfm.ctDict[8][revComp('AAAATTTA')]) # GOOD
print('Pr(AAAATTTA) = {}'.format(str(pra)))
print('Pr(TAAATTTT) = {}'.format(str(prb)))
# N is the total sequence length minus maxMotif, i.e. only approximately the genome length
print(f'Genome Len N = {N}')
print(f"Expect (np) = {exp}") # Composed from: rev comp counts from BOTH strands
print(f"count = {obs}")

# Pr(a:b) to use for z is calculated by summing Pr(a) and Pr(b) where a and b are rev comps
# NOTE(review): pr is assigned here but never read in this cell.
pr = exp/(N)

# NOTE(review): sd is built from the observed count (2*obs*(1-prab)), not from the expected count -- confirm this is intended.
sd = (2*(obs)*(1-prab))**.5
print(f"sd = {sd}")
# NOTE(review): ad-hoc Z-score; the trailing +1 has no justification visible in this file.
Z = ((obs)-exp)/sd +1
Z

Pr(AAAATTTA) = 6.44041436287385e-05
Pr(TAAATTTT) = 6.350870716171334e-05
Genome Len N = 6788427
Expect (np) = 1736.6540999057495
count = 834
sd = 40.83854357546167


-21.10299439885315

In [248]:
mean = 

SyntaxError: invalid syntax (<ipython-input-248-e908386a6b4f>, line 1)

In [208]:
# Hard-coded copy of the "Expect (np)" value printed by the Z-score cell, presumably kept for experimentation.
fakeExpect = 1736.6540999057495


6788427

KeyError: 'AAAATTTT:TTTTAAAA'

In [105]:
# Aliasing demo: assignment binds a second name to the same list object,
# so an append through one name is visible through the other.
aaa = []
ttt = aaa  # no copy is made here
ttt.append('apple')
print(aaa, ttt)  # both names show the appended element

['apple'] ['apple']


In [47]:
# Independent cross-check: count every 8-mer in the file without going through the class.
with open('mdata/Synechococcus7002.fna', 'r') as myfile:
    # Join the sequence lines only; FASTA header lines (starting with '>')
    # must not become part of the sequence text.
    # NOTE: all records are joined into one string, so with a multi-record
    # file some windows span the boundary between two records.
    st = ''.join(line.rstrip() for line in myfile if not line.startswith('>'))
dd = dict()
rn = len(st)
# +1 so the final window (ending on the last base) is counted too.
for i in range(rn - 8 + 1):
    window = st[i:i+8]
    dd[window] = dd.get(window, 0) + 1
        

In [48]:
# Display the count recorded for the 8-mer 'AAAATTTT' in dd.
dd['AAAATTTT']

418

In [85]:
# Might be simpler to keep the counts of reverse complements separate (combining them is not required yet)
# Merge every 8-mer in x[8] with its reverse complement under the key 'kmer:revComp'.
newDict = dict()
exclude = set()  # k-mers already merged through their partner
for key in x[8]:
    partner = revComp(key)
    if key not in exclude:
        if partner == key:
            # A k-mer that is its own reverse complement is a single entry;
            # adding x[8][key] to itself would double its count.
            newDict[f"{key}:{partner}"] = x[8][key]
        else:
            # A partner that was never observed contributes 0.
            newDict[f"{key}:{partner}"] = x[8][key] + x[8].get(partner, 0)
    exclude.add(key)
    exclude.add(partner)
newDict

{'AGCGTTTA:TAAACGCT': 261,
 'GCGTTTAA:TTAAACGC': 288,
 'CGTTTAAC:GTTAAACG': 256,
 'GTTTAACC:GGTTAAAC': 495,
 'TTTAACCG:CGGTTAAA': 456,
 'TTAACCGA:TCGGTTAA': 394,
 'TAACCGAG:CTCGGTTA': 286,
 'AACCGAGA:TCTCGGTT': 321,
 'ACCGAGAG:CTCTCGGT': 168,
 'CCGAGAGG:CCTCTCGG': 112,
 'CGAGAGGT:ACCTCTCG': 82,
 'GAGAGGTC:GACCTCTC': 118,
 'AGAGGTCG:CGACCTCT': 95,
 'GAGGTCGC:GCGACCTC': 106,
 'AGGTCGCC:GGCGACCT': 237,
 'GGTCGCCG:CGGCGACC': 186,
 'GTCGCCGT:ACGGCGAC': 192,
 'TCGCCGTG:CACGGCGA': 118,
 'CGCCGTGA:TCACGGCG': 113,
 'GCCGTGAA:TTCACGGC': 97,
 'CCGTGAAA:TTTCACGG': 139,
 'CGTGAAAT:ATTTCACG': 99,
 'GTGAAATT:AATTTCAC': 398,
 'TGAAATTC:GAATTTCA': 465,
 'GAAATTCA:TGAATTTC': 544,
 'AAATTCAT:ATGAATTT': 637,
 'AATTCATT:AATGAATT': 432,
 'ATTCATTG:CAATGAAT': 362,
 'TTCATTGA:TCAATGAA': 326,
 'TCATTGAC:GTCAATGA': 255,
 'CATTGACA:TGTCAATG': 319,
 'ATTGACAG:CTGTCAAT': 439,
 'TTGACAGC:GCTGTCAA': 309,
 'TGACAGCA:TGCTGTCA': 277,
 'GACAGCAA:TTGCTGTC': 245,
 'ACAGCAAT:ATTGCTGT': 387,
 'CAGCAATT:AATTGCTG': 815,
 'AGC