In [1]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_missingMotif.ipynb
# Purpose: to find hidden motifs
#   main(infile='FILE_PATH',outfile='FILE_PATH', inCL=['--minMotif', '3', '--maxMotif', '8', '--cutoff', '0'])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Command Line Class
Provided by Dr. B for parsing command line arguments

In [2]:
########################################################################
# CommandLine
########################################################################


class CommandLine():
    """
    Handle the command line, usage and help requests.

    CommandLine uses argparse,
    it implements a standard command line argument parser with various argument options,
    a standard usage and help, and an error termination exception Usage.
    """

    def __init__(self, inOpts = None):
        """
        CommandLine constructor.
        
        Implement a parser to interpret the command line argv string using argparse.
        """
        import argparse
        self.parser = argparse.ArgumentParser(
            description='Program prolog - a brief description of what this thing does',
            epilog='Program epilog - some other stuff you feel compelled to say',
            add_help=True,  # default is True
            prefix_chars='-',
            usage='%(prog)s [options] -option1[default] <input >output'
            )

        self.parser.add_argument('-m', '--minMotif', type=int, default=3, action='store', help='minimum motif length to consider for Markov Model')
        self.parser.add_argument('-M', '--maxMotif', type=int, default=8, action='store', help='maximum motif length to consider for Markov Model')
        self.parser.add_argument('-c', '--cutoff', type=int, default=-4, action='store', help='Significant Z-score cutoff value')
        # Command line option to use Gibbs sampling to find the optimal consensus motif.
#         self.parser.add_argument('-g', '--gibbsampling', type=float, default=1, action='store', help='implement Gibbs sampling')

        if inOpts is None:
            self.args = self.parser.parse_args()
        else:
            self.args = self.parser.parse_args(inOpts)

# FastAreader Class
Provided by Dr. B for reading Fasta Files.

In [3]:

import sys

class FastAreader():
    """
    Read in files and preprocess.
    """
    def __init__(self, fname=''):
        """ Contructor: saves attribute fname. """
        self.fname = fname

    def doOpen(self):
        """ Open a file."""
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readFasta(self):
        """ Read in a fasta file and yield header and sequences separately"""
        header = ''
        sequence = ''

        with self.doOpen() as fileH:

            header = ''
            sequence = ''

            # skip to first fasta header
            line = fileH.readline()
            while not line.startswith('>'):
                line = fileH.readline()
            header = line[1:].rstrip()

            # Separate headers and sequences
            for line in fileH:
                if line.startswith('>'):
                    yield header, sequence
                    header = line[1:].rstrip()
                    sequence = ''
                else:
                    sequence += ''.join(line.rstrip().split()).upper()

        yield header, sequence

# Assignment 2 Notes

### NULL Model
- For a Markov(1) Model, we consider a preceding base
    - Ex:  
    
        $P(A|T_{prev}) = \frac{P(TA)}{P(T)} = \frac{\frac{c(TA)}{N}}{\frac{c(T)}{N}} = \frac{c(TA)}{c(T)}$  
        
        - N is not the same in the cases of TA and T, but the difference is negligible for now
- Increasing the order of a Markov Model makes our model more informed
- Check Week 2 Overview Video 3 for more details on detailed Markov Model for 4-mer described by $k_{1}k_{2}k_{3}k_{4}$

In [18]:
## LATEST TO WORK KINDA
import sys
import os 
import numpy as np
from fastaReader import FastAreader

class SearchForMissing:
    def __init__(self, seqList, minMotif, maxMotif, zCut):
        self.seqList = seqList
        self.minMotif = minMotif
        self.maxMotif = maxMotif
        self.zCut = zCut
        
        self.N = self.genomeLen() # - self.maxMotif
        self.ctDict = self.ctDictMake()
        self.prDict = self.prDictMake()
        self.output = self.outputMake()

    def genomeLen(self):
        '''
        Takes the sum of sequence lengths from the list of sequences, seqList.
        seqList is made with the FastAreader class and has 1 fasta seq per entry.
        '''
        totLen = 0
        for seq in self.seqList:
            totLen += len(seq)
        return totLen

    def ctDictMake(self):
        '''
        Stores a dict of dicts
        {1:{'A':25, 'C':33,...}, ... ,8:{'AAAAGGGG':12, 'AAAATTTT':11, ...}}
        Reads over each sequence in the fasta. For each sequence, a sliding window
        moves over its respective sequence length and k-mers are counted.
        '''
        
        massDict = dict() # dictionary that holds dictionaries which hold k-mers and their respective counts
        for k in range(1, self.maxMotif+1): # Initialize dictionaries for min to max motif sizes within massDict; MAYBE START WITH 1 AND NOT MIN MOTIF
            massDict[k] = dict()
        for seq in self.seqList:
            for k in range(1, self.maxMotif+1):
                for kmer_idx in range((len(seq)-k)):
                    kmer = seq[kmer_idx:kmer_idx+k]
                    if 'N' not in kmer:
                        if kmer in massDict[k]:
                            massDict[k][kmer] += 1
                        else:
                            massDict[k][kmer] = 1
        return massDict
                                
    def prDictMake(self): # VERY DEPENDENT ON ctDict
        massDict = dict() # dictionary that holds dictionaries which hold k-mers and their respective counts
        for k in range(self.minMotif, self.maxMotif+1): # Initialize dictionaries for min to max motif sizes within massDict
            massDict[k] = dict()
#         print(massDict)
        for k in range(self.minMotif, self.maxMotif+1):
            for kmer in self.ctDict[k].keys():
                if (kmer[:-1] not in self.ctDict[k-1]) or (kmer[1:] not in self.ctDict[k-1]) or (kmer[1:-1] not in self.ctDict[k-2]): # If sub seq is not in sequence count 
                    massDict[k][kmer] = 0
                else:
                    massDict[k][kmer] = (1/self.N) * ((self.ctDict[k-1][kmer[:-1]] * self.ctDict[k-1][kmer[1:]]) / self.ctDict[k-2][kmer[1:-1]])
        return massDict

    def outputMake(self):
        finalList = []
        for k in range(self.minMotif, self.maxMotif+1):
            banned = set()
            for kmer in self.ctDict[k].keys():
                if kmer not in banned: # interesting condition, is it implied that the reverse complement of the kmer is excluded?
                    # sequenceReverse entry
                    reverseCompKmer = revComp(kmer) # K-mer reverse compliment variable
                    tmpList = sorted([kmer,reverseCompKmer]) # Temporary list used for sorting kmer and reverse kmer for output
                    sequenceReverse = f"{tmpList[0]}:{tmpList[1]}" # sequence: reverse entry for output
                    # count entry
                    if kmer != reverseCompKmer: # Case 1: k-mer is not a palindrome
                        if reverseCompKmer in self.ctDict[k]:
                            count = self.ctDict[k][kmer] + self.ctDict[k][reverseCompKmer] 
                        else:
                            count = self.ctDict[k][kmer]
                    else: # Case 2: k-mer is a palindrome
                        count = count = self.ctDict[k][kmer]
                    # expectation entry
                    if reverseCompKmer in self.ctDict[k]:
#                         Pr = self.prDict[k][kmer] + self.prDict[k][reverseCompKmer] 
                        Lkmer = self.ctDict[k-1][kmer[:-1]] #+ self.ctDict[k-1][revComp(kmer[:-1])]
                        Rkmer = self.ctDict[k-1][kmer[1:]] #+ self.ctDict[k-1][revComp(kmer[:-1])]
                        Mkmer = self.ctDict[k-2][kmer[1:-1]]# + self.ctDict[k-2][revComp(kmer[1:-1])]
                    else: 
                        Pr = self.prDict[k][kmer]
                    if kmer == reverseCompKmer:    
                        expectation =  ((Lkmer+Rkmer)**2) / (Mkmer)
                    else:
                        expectation =  (Lkmer*Rkmer) / (Mkmer)
                        
                    # zscore entry
                    numerator = count - expectation
                    sd = (expectation * (1 - expectation/self.N))**0.5
                    zscore = numerator/sd
                    entry = (sequenceReverse, count, expectation, zscore, k)
                    banned.add(kmer) # Mark
                    banned.add(revComp(kmer))
                    finalList.append(entry)
        finalList.sort(key=lambda x:x[-2]) # sort by z-score
        finalList.sort(reverse=True, key=lambda x:x[-1]) # sort by motif length
        return finalList

In [19]:
### REVERESE COMPLIMENT FUNCTION ###
def revComp(seqStr):
    newStr = ''
    baseDict = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
    for char in seqStr:
        newStr += baseDict[char]
    return newStr[::-1]

def main(infile, outfile='', inCL=None):
    '''
    This is the main function. Arguments are parsed here with the CommandLine class
    provided by Dr. B. Infile and outfile are also read here, with an outfile file being 
    optional. IF no outfile is provided, output is printed. Infile is read in through the
    readData function which returns fasta file information for the missingMotif to read.
    This way, input only needs to be read once. missingMotif is called on a fasta file holding
    either one or many sequences.
    '''

    myCommandLine = CommandLine(inCL)
    minMotif = myCommandLine.args.minMotif
    maxMotif = myCommandLine.args.maxMotif
    cutoff = myCommandLine.args.cutoff
    
    def readData(infile:'str') -> 'tuple(list,str)':
        '''
        Reads in fasta file as input. 
        Returns [0]: a list of seqs [1]: a string of catted seqs.
        '''
        seqList = []
        readData = FastAreader(infile).readFasta()
        for line in readData:
            seqList.append(line[1]) # Append seq and not head to seqList
        return seqList
    
    seqList = readData(infile) # Saves output of FastAreader(infile).readFasta() as seqList
    
    sfm = SearchForMissing(seqList, minMotif, maxMotif, cutoff)
    
    print(f"N={sfm.N}")
    for line in sfm.output:
        if line[3] >= cutoff:
            break
        print('{0:17}\t{1:0d}\t{2:0.2f}\t{3:0.2f}'.format(line[0], line[1], line[2],line[3]))

In [20]:
if __name__ == "__main__":
    '''
    This is where the main function is executed.
    Usage:
        main(infile='FILE_PATH', outfile='FILE_PATH', inCL=[-ipk] )
    Arguments:
        --minMotif
            Minimum motif length to consider for Markov Model
        --maxMotif
            Maximum motif length to consider for Markov Model
        --cutoff
            Significant Z-score cutoff value
    '''
#     main(infile='mdata/Arthrospira-platensis-NIES-39.fna', inCL=['-m 3', '-M 8', '-c -4'])
    main(infile='mdata/Arthrospira-platensis-NIES-39.fna', inCL=['--minMotif', '3', '--maxMotif', '8', '--cutoff', '0'])

N=6788435
GCGATCGC:GCGATCGC	7683	31752.94	-135.39
AATTAATT:AATTAATT	558	2580.47	-39.82
AAAATTTT:AAAATTTT	501	2185.49	-36.04
TTAATTAA:TTAATTAA	537	2205.76	-35.54
AAATATTT:AAATATTT	365	1705.30	-32.46
ATAATTAT:ATAATTAT	364	1681.01	-32.13
TTTTAAAA:TTTTAAAA	298	1251.69	-26.96
CAAATTTG:CAAATTTG	245	1158.42	-26.84
TAATATTA:TAATATTA	296	1222.71	-26.50
TAAATTTA:TAAATTTA	413	1380.29	-26.04
GATTAATC:GATTAATC	390	1342.07	-25.99
TGATATCA:TGATATCA	300	1184.38	-25.70
TATTAATA:TATTAATA	385	1310.43	-25.57
ATCGCGAT:ATCGCGAT	294	1155.39	-25.34
AATATATT:AATATATT	255	1051.57	-24.57
ATTTAAAT:ATTTAAAT	195	945.67	-24.41
GTTTAAAC:GTTTAAAC	269	1057.29	-24.25
CCCTAGGG:CCCTAGGG	232	997.07	-24.23
CAATATTG:CAATATTG	230	971.23	-23.79
ACAATTGT:ACAATTGT	115	750.77	-23.20
CATTAATG:CATTAATG	165	800.99	-22.47
CTTTAAAG:CTTTAAAG	260	935.23	-22.08
TTAGCTAA:TTAGCTAA	226	865.22	-21.73
GGTTAACC:GGTTAACC	191	798.36	-21.50
TTGATCAA:TTGATCAA	225	851.57	-21.47
AGTTAACT:AGTTAACT	174	760.13	-21.26
AGATATCT:AGATATCT	186	776.94	-21.20

In [15]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Maxim Firsov
# Hsiang-Yun Lu (Eloise)
# Gabriel Aguiar

# RESPONSES
# - Markdown comments
# - Get more docstrings
# - Fix Null model
# - Finish main function

# CORRECTIONS
# - Made some markdown comments
# - Made some more docstrings and comments
# - Made main function print out correct format
# - Main function was put together for the most part