# Notes

### NULL Model
- For a Markov(1) model, the probability of each base is conditioned on the single preceding base
    - Ex:  
    
        $P(A|T_{prev}) = \frac{P(TA)}{P(T)} = \frac{\frac{c(TA)}{N}}{\frac{c(T)}{N}} = \frac{c(TA)}{c(T)}$  
        
        - Strictly, N differs between the two cases (a sequence of length N has N single-base positions but only N-1 dinucleotide positions), but the difference is negligible for now
- Increasing the order of a Markov Model makes our model more informed
- See Week 2 Overview Video 3 for a detailed walkthrough of the Markov model for a 4-mer written as $k_{1}k_{2}k_{3}k_{4}$

In [4]:
import sys
import os 
import numpy as np
from fastaReader import FastAreader

# Names of the genome files found in the mdata/ directory
dataList = os.listdir('mdata')
print("{} \n{}".format(os.listdir(), os.listdir('mdata')))
print(f"mdata/{dataList[0]}")

### SEQUENCE LIST ###
# One entry per record of the first genome file. Element [1] of each record is kept;
# presumably records are (header, sequence) pairs - confirm against fastaReader.py.
seqList = []
readData = FastAreader(f"mdata/{dataList[0]}").readFasta()
seqList.extend(record[1] for record in readData)
    
### SEQUENCE STRING ###
# seqString = ''.join(seqList)

### SEQUENCE LENGTH ###
# seqLength = len(seqList[0])

### REVERSE COMPLEMENT FUNCTION ###
def revComp(seqStr):
    '''
    Return the reverse complement of a DNA string.

    Every base is replaced by its pairing partner (A<->T, C<->G, N stays N)
    and the resulting string is reversed. A character outside A/C/G/T/N
    raises KeyError.
    '''
    partner = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    complemented = [partner[base] for base in seqStr]
    return ''.join(reversed(complemented))

### PALINDROME LIST ###
# BAD: relies on seqString, which should not be used because the input may contain multiple FASTA records
# palList = []
# for i in range(seqLength - 8):
#     palList.append(f"{seqString[i:8+i]}:{revComp(seqString[i:8+i])}")

['.ipynb_checkpoints', 'Chan_Nicholas_serachForTheMissing.ipynb', 'data', 'dummm.txt', 'fastaReader.py', 'mdata', 'notes.md', 'pasgn2.ipynb', 'program.py', 'testing.ipynb', '__pycache__'] 
['Arthrospira-platensis-NIES-39.fna', 'Ecoli-UMN026.fa', 'Synechococcus7002.fna', 'Zm4-genomic.fna']
mdata/Arthrospira-platensis-NIES-39.fna


In [186]:
class SearchForMissing:
    '''
    Scores every k-mer (minMotif <= k <= maxMotif) found in a list of sequences
    by comparing its observed count with the count expected from shorter k-mers.

    Constructing an instance runs the whole pipeline and leaves the results on
    the instance:
        N      - total number of characters across seqList
        ctDict - {k: {kmer: count}} for k = 1 .. maxMotif
        prDict - {k: {kmer: probability}} for k = minMotif .. maxMotif
        output - list of (kmer:revComp, count, expectation, zscore, k) tuples

    Relies on the module-level revComp() function and on numpy imported as np.
    '''

    def __init__(self, seqList, minMotif, maxMotif, zCut):
        '''
        seqList  - list of sequence strings, one per fasta record
        minMotif - smallest k-mer length to score; must be at least 3 because the
                   probability of a k-mer is built from (k-1)-mer and (k-2)-mer counts
        maxMotif - largest k-mer length to count and score
        zCut     - z-score cutoff; stored on the instance but not applied by this class

        Raises ValueError when minMotif is smaller than 3.
        '''
        if minMotif < 3:
            raise ValueError(f"minMotif must be at least 3 (got {minMotif}): "
                             "scoring a k-mer needs (k-1)-mer and (k-2)-mer counts")
        self.seqList = seqList
        self.minMotif = minMotif
        self.maxMotif = maxMotif
        self.zCut = zCut

        # Order matters: each step reads what the previous one stored.
        self.N = self.genomeLen()
        self.ctDict = self.ctDictMake()
        self.prDict = self.prDictMake()
        self.output = self.outputMake()

    def genomeLen(self):
        '''
        Returns the sum of the lengths of the sequences in self.seqList.
        '''
        # Uses the instance's own list, not whatever global happens to be named seqList.
        return sum(len(seq) for seq in self.seqList)

    def ctDictMake(self):
        '''
        Returns a dict of dicts
        {1:{'A':25, 'C':33,...}, ... ,8:{'AAAAGGGG':12, 'AAAATTTT':11, ...}}
        holding, for every k from 1 to maxMotif, the count of each k-mer.
        A sliding window moves over each sequence separately, so no k-mer spans
        two sequences. Windows containing 'N' are not counted.

        Only the given strand is counted here; outputMake() adds the count of
        the reverse complement when it builds the report. (An earlier variant
        also incremented the reverse complement during counting and was dropped
        for being slow.)
        '''
        massDict = {k: dict() for k in range(1, self.maxMotif+1)}
        for seq in self.seqList:
            for k in range(1, self.maxMotif+1):
                kCounts = massDict[k]
                # +1 so that the window ending on the last base is included;
                # the range is empty when the sequence is shorter than k.
                for kmer_idx in range(len(seq)-k+1):
                    kmer = seq[kmer_idx:kmer_idx+k]
                    if 'N' not in kmer:
                        kCounts[kmer] = kCounts.get(kmer, 0) + 1
        return massDict

    def prDictMake(self): # VERY DEPENDENT ON ctDict
        '''
        Returns {k: {kmer: probability}} for k from minMotif to maxMotif, with
        one entry per k-mer present in self.ctDict[k].

        probability = (1/N) * count(prefix) * count(suffix) / count(middle)
        where prefix = kmer[:-1], suffix = kmer[1:] and middle = kmer[1:-1].
        If any of the three sub-sequences has no recorded count the probability
        is set to 0.
        '''
        massDict = {k: dict() for k in range(self.minMotif, self.maxMotif+1)}
        for k in range(self.minMotif, self.maxMotif+1):
            shorter = self.ctDict[k-1]  # counts of (k-1)-mers: prefix and suffix
            middle = self.ctDict[k-2]   # counts of (k-2)-mers: shared middle
            for kmer in self.ctDict[k]:
                if (kmer[:-1] not in shorter) or (kmer[1:] not in shorter) or (kmer[1:-1] not in middle):
                    massDict[k][kmer] = 0
                else:
                    massDict[k][kmer] = (1/self.N) * ((shorter[kmer[:-1]] * shorter[kmer[1:]]) / middle[kmer[1:-1]])
        return massDict

    def outputMake(self):
        '''
        Returns a list of tuples (sequenceReverse, count, expectation, zscore, k),
        one per k-mer / reverse-complement pair, for k from minMotif to maxMotif.

        sequenceReverse - "kmer:revComp" with the two strings in sorted order
        count           - count of the k-mer plus the count of its reverse
                          complement (a palindrome is counted once)
        expectation     - Pr * (N * 2), where Pr is the k-mer's probability plus
                          the probability of its reverse complement
        zscore          - (count - expectation) / sqrt(expectation * (1 - Pr))

        The list is returned in discovery order; it is not sorted and zCut is
        not applied.
        '''
        finalList = []
        for k in range(self.minMotif, self.maxMotif+1):
            counts = self.ctDict[k]
            probs = self.prDict[k]
            banned = set() # k-mers already reported, directly or as someone's reverse complement
            for kmer in counts:
                if kmer in banned:
                    continue
                # sequenceReverse entry
                reverseCompKmer = revComp(kmer)
                tmpList = sorted([kmer, reverseCompKmer])
                sequenceReverse = f"{tmpList[0]}:{tmpList[1]}"
                # count entry: a palindrome is its own reverse complement, so it is not added twice
                if kmer != reverseCompKmer and reverseCompKmer in counts:
                    count = counts[kmer] + counts[reverseCompKmer]
                else:
                    count = counts[kmer]
                # expectation entry
                # NOTE(review): for a palindrome reverseCompKmer == kmer, so Pr is twice the
                # k-mer's own probability. Behaviour kept exactly as it was.
                if reverseCompKmer in counts:
                    Pr = probs[kmer] + probs[reverseCompKmer]
                else:
                    Pr = probs[kmer]
                expectation = Pr * (self.N * 2)
                # zscore entry
                numerator = count - expectation
                sd = np.sqrt(expectation * (1 - Pr))
                zscore = numerator/sd
                entry = (sequenceReverse, count, expectation, zscore, k)
                banned.add(kmer)
                banned.add(reverseCompKmer)
                finalList.append(entry)
        return finalList
                
                

    

In [187]:
# Run the analysis for motif lengths 3 through 8 with a z cutoff of 0,
# and keep a direct handle on the raw k-mer count tables.
sfm = SearchForMissing(seqList, minMotif=3, maxMotif=8, zCut=0)
countDict = sfm.ctDict

{1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}, 7: {}, 8: {}}


In [171]:
# Spot check: look up the stored count for the 8-mer 'CGATCGCC' in the k=8 count table
sfm.ctDict[8]['CGATCGCC']

3678

In [172]:
# `l` is the same list object as sfm.output, so the in-place sorts below reorder sfm.output too.
l = sfm.output
# Pass 1: ascending by z-score (second-to-last field of each entry).
l.sort(key=lambda x:x[-2])
# Pass 2: descending by motif length (last field). list.sort is stable, so entries of
# equal length keep the z-score order from pass 1.
l.sort(reverse=True, key=lambda x:x[-1])
print(f'N = {sfm.N}')
# Bare expression so the notebook displays the sorted list
l

N = 6788435


[('GCGATCGC:GCGATCGC', 7683, 31752.764956892803, -135.23512934204487, 8),
 ('CGATCGCC:GGCGATCG', 7306, 14524.351606899156, -59.926937124085015, 8),
 ('AGCGATCG:CGATCGCT', 5297, 10388.884581782975, -49.97586418575263, 8),
 ('CGATCGCA:TGCGATCG', 4648, 9161.715996622794, -47.17288073608652, 8),
 ('AATTAATT:AATTAATT', 558, 2580.2044706290076, -39.81431755038995, 8),
 ('AAAATTTT:AAAATTTT', 501, 2185.0375117813387, -36.02941931529665, 8),
 ('GATCGCCA:TGGCGATC', 2235, 4701.747420803496, -35.98075297342726, 8),
 ('TTAATTAA:TTAATTAA', 537, 2204.4174826845, -35.51670658988039, 8),
 ('CGATCGCG:CGCGATCG', 1844, 4115.047814695081, -35.40826612573001, 8),
 ('AAATATTT:AAATATTT', 365, 1705.168578993822, -32.456601611388336, 8),
 ('ATAATTAT:ATAATTAT', 364, 1680.6515404824459, -32.11876918943631, 8),
 ('CGGCGATC:GATCGCCG', 998, 2587.5670894526033, -31.251757258912278, 8),
 ('AGGCGATC:GATCGCCT', 3104, 5292.8355989804595, -30.092177217217355, 8),
 ('AAGCGATC:GATCGCTT', 2262, 4131.711876713841, -29.0921464

In [112]:
# Hand check of one 8-mer ('GCGATCGC') against the values produced by SearchForMissing.
# Expected count is Pr(k-mer) * N, doubled here - the author's guess is that the
# factor 2 accounts for both strands.

N = sfm.N + 8  # only used by the first line of the printed report
# Keep as the sum of two probabilities: the k-mer and its reverse complement
prab = sfm.prDict[8]['GCGATCGC'] + sfm.prDict[8][revComp('GCGATCGC')]

# Observed count of the k-mer alone; palindromes and non-palindromes will need separate handling
obs = sfm.ctDict[8]['GCGATCGC']
# Combining counts was needed to match Harrison's answers. Author's note: since prab already
# doubles for a palindrome, palindromes end up with an overall factor of 4, others with 2.
exp = (prab * sfm.N) * 2

sd = np.sqrt(exp * (1 - prab))

num, den = obs - exp, sd
z = num / den

print(
    f'Genome Len N = {N}',
    f'Pr(a)+Pr(b) = {prab}',
    '',
    f"Observed (s) = {obs}",
    f"Expect (np) = {exp}",  # composed from reverse-complement counts from BOTH strands
    f"Z-score = {z}",
    sep='\n',
)

Genome Len N = 6788435
Pr(a)+Pr(b) = 0.002338742462494831

Observed (s) = 7683
Expect (np) = 31752.764956892795
Z-score = -135.23512952884582


In [5]:
print("{} \n{}".format(os.listdir(), os.listdir('mdata')))

### SEQUENCE LIST ###
# Path of the first genome file listed in mdata/
dat = f"mdata/{dataList[0]}"
print(dat)

    

['.ipynb_checkpoints', 'Chan_Nicholas_serachForTheMissing.ipynb', 'data', 'dummm.txt', 'fastaReader.py', 'mdata', 'notes.md', 'pasgn2.ipynb', 'program.py', 'testing.ipynb', '__pycache__'] 
['Arthrospira-platensis-NIES-39.fna', 'Ecoli-UMN026.fa', 'Synechococcus7002.fna', 'Zm4-genomic.fna']
mdata/Arthrospira-platensis-NIES-39.fna


In [15]:
def qsearch(k, path=None):
    '''
    Quick k-mer counter used to cross-check counts straight from a FASTA file.

    k    - length of the k-mers to count
    path - FASTA file to read; defaults to the module-level variable `dat`

    Returns a dict {kmer: count}. Header lines (starting with '>') are not
    treated as sequence, each record is scanned on its own so that no k-mer
    spans two records, and the final window of every record is included.
    Unlike a filtered count, k-mers containing 'N' are kept.
    '''
    if path is None:
        path = dat

    # Collect the sequence of each record as one string.
    records = []
    pieces = []
    with open(path, 'r') as myfile:
        for line in myfile:
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if pieces:
                    records.append(''.join(pieces))
                    pieces = []
                continue
            pieces.append(line)
    if pieces:
        records.append(''.join(pieces))

    dd = dict()
    for seq in records:
        # +1 so that the window ending on the last base is counted.
        for i in range(len(seq) - k + 1):
            kmer = seq[i:i+k]
            dd[kmer] = dd.get(kmer, 0) + 1
    return dd

# Build the 7-mer count table for the file at `dat`
dd = qsearch(7)

In [20]:
# NOTE(review): 'AAAATTTA' has 8 characters, so this lookup only succeeds when dd was built
# with qsearch(8); with the qsearch(7) table from the cell above it would raise KeyError.
dd['AAAATTTA']

1727

In [202]:
#'CGATCGCC' 'GGCGATCG'
# Single-base counts. NOTE(review): the keys are 1-mers, so dd must come from qsearch(1) here.
print(f"A:{dd['A']} C: {dd['C']} T:{dd['T']} G:{dd['G']}")

A:1862755 C: 1486689 T:1866935 G:1476491
