In [1]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_missingMotif.ipynb
# Purpose: to find hidden motifs
#   main(infile='FILE_PATH',outfile='FILE_PATH', inCL=['--minMotif', '3', '--maxMotif', '8', '--cutoff', '0'])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Command Line Class
Provided by Dr. B for parsing command line arguments

In [2]:
########################################################################
# CommandLine
########################################################################


class CommandLine():
    """
    Handle the command line, usage and help requests.

    CommandLine uses argparse to define the three options this program
    understands (-m/--minMotif, -M/--maxMotif, -c/--cutoff) and stores the
    parsed result in self.args.
    """

    def __init__(self, inOpts = None):
        """
        CommandLine constructor.

        Implement a parser to interpret the command line argv string using argparse.

        Parameters:
            inOpts: optional list of option strings, e.g.
                ['--minMotif', '3', '--cutoff', '-4']. When None, argparse
                reads the options from sys.argv instead.

        Attributes set:
            self.parser: the configured argparse.ArgumentParser.
            self.args: namespace with minMotif (int), maxMotif (int) and
                cutoff (float).
        """
        import argparse
        self.parser = argparse.ArgumentParser(
            description='Program prolog - a brief description of what this thing does',
            epilog='Program epilog - some other stuff you feel compelled to say',
            add_help=True,  # default is True
            prefix_chars='-',
            usage='%(prog)s [options] -option1[default] <input >output'
            )

        self.parser.add_argument('-m', '--minMotif', type=int, default=3, action='store', help='minimum motif length to consider for Markov Model')
        self.parser.add_argument('-M', '--maxMotif', type=int, default=8, action='store', help='maximum motif length to consider for Markov Model')
        # Z-scores are real-valued, so the threshold is parsed as a float;
        # integer strings such as '-4' are still accepted.
        self.parser.add_argument('-c', '--cutoff', type=float, default=-4.0, action='store', help='Significant Z-score cutoff value')

        if inOpts is None:
            self.args = self.parser.parse_args()
        else:
            self.args = self.parser.parse_args(inOpts)

# FastAreader Class
Provided by Dr. B for reading Fasta Files.

In [3]:

import sys

class FastAreader():
    """
    Read a fasta file (or standard input) and yield its records.
    """
    def __init__(self, fname=''):
        """ Constructor: saves attribute fname ('' means read from stdin). """
        self.fname = fname

    def doOpen(self):
        """ Return sys.stdin when fname is '', otherwise the opened file."""
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readFasta(self):
        """
        Read in a fasta file and yield (header, sequence) pairs.

        The header is the text after '>' with trailing whitespace removed.
        The sequence is the concatenation of the record's lines with all
        whitespace removed and letters upper-cased. Lines before the first
        '>' are skipped. Input without any '>' line (including empty input)
        yields nothing.

        Note: the handle returned by doOpen is closed when reading finishes,
        which includes sys.stdin when fname is ''.
        """
        header = ''
        sequence = ''

        with self.doOpen() as fileH:

            # skip to first fasta header; readline() returns '' at EOF, so
            # stop there instead of looping forever on header-less input
            line = fileH.readline()
            while line and not line.startswith('>'):
                line = fileH.readline()
            if not line:
                return
            header = line[1:].rstrip()

            # Separate headers and sequences
            for line in fileH:
                if line.startswith('>'):
                    yield header, sequence
                    header = line[1:].rstrip()
                    sequence = ''
                else:
                    sequence += ''.join(line.rstrip().split()).upper()

        # emit the final record once the input is exhausted
        yield header, sequence

# Assignment 2: Missing Motif
  
For this assignment, I used a Markov(K-2) Model to estimate the expected counts of K-mers (motifs of length K), and I assumed
the observed counts follow the binomial distribution. With these statistics, motifs are printed grouped by their length (descending order)
and, within each length, by order of their z-score (ascending order). Because of this ordering, we can assume that the motifs
hiding from us (the under-represented ones) are those with the lowest z-scores, i.e. closest to the top relative to motifs of the same length.

### NULL Model
- For a Markov(1) Model, we consider a preceding base
    - Ex:  
    
        $P(A|T_{prev}) = \frac{P(TA)}{P(T)} = \frac{\frac{c(TA)}{N}}{\frac{c(T)}{N}} = \frac{c(TA)}{c(T)}$  
        
        - N is not the same in the cases of TA and T, but the difference is negligible for now
- For an 8-mer $K = k_{1}k_{2}k_{3}k_{4}k_{5}k_{6}k_{7}k_{8}$, the Markov(K-2) = Markov(6) Model considers the 6 preceding bases
    - Ex:  
        
        $E(K) = N * Pr(K) = \frac{c(k_{1}k_{2}k_{3}k_{4}k_{5}k_{6}k_{7}) * c(k_{2}k_{3}k_{4}k_{5}k_{6}k_{7}k_{8})}{c(k_{2}k_{3}k_{4}k_{5}k_{6}k_{7})}$, so $Pr(K) = \frac{E(K)}{N}$  
- Increasing the order of a Markov Model makes our model more informed
- Check Week 2 Overview Video 3 for more details on detailed Markov Model for 4-mer described by $k_{1}k_{2}k_{3}k_{4}$

In [4]:
## SearchForMissing: k-mer counting and Markov(K-2) z-score statistics
import sys
import os 

class SearchForMissing:
    '''
    Compute k-mer statistics for a list of sequences.

    On construction the object stores:
        N      - total length of all sequences in seqList
        ctDict - {k: {kmer: count}} for k = 1..maxMotif, where each count is
                 the "collapsed" count of a k-mer together with its reverse
                 complement
        output - a sorted list of (pair, count, expectation, zscore, k)
                 entries for k = minMotif..maxMotif

    minMotif must be at least 3: outputMake looks up ctDict[k-2], and ctDict
    only has keys 1..maxMotif. zCut is stored but not used by this class.
    Reverse complements come from the module-level revComp function.
    '''
    def __init__(self, seqList, minMotif, maxMotif, zCut):
        self.seqList = seqList
        self.minMotif = minMotif
        self.maxMotif = maxMotif
        self.zCut = zCut

        self.N = self.genomeLen()
        self.ctDict = self.ctDictMake()
        self.output = self.outputMake()

    def genomeLen(self):
        '''
        Takes the sum of sequence lengths from the list of sequences, seqList.
        seqList has one sequence string per entry.
        '''
        return sum(len(seq) for seq in self.seqList)

    def ctDictMake(self):
        '''
        Build and return a dict of dicts
        {1:{'A':25, 'C':33,...}, ... ,8:{'AAAAGGGG':12, 'AAAATTTT':11, ...}}
        For each sequence and each k in 1..maxMotif, a sliding window counts
        every k-mer that contains no 'N'. Counts are then collapsed: a k-mer
        and its reverse complement both map to the sum of their two counts.
        A k-mer that is its own reverse complement keeps its single count.
        '''
        lengths = range(1, self.maxMotif + 1)

        # massDict: per-strand counts exactly as observed in the sequences
        massDict = {k: dict() for k in lengths}
        for seq in self.seqList:
            for k in lengths:
                # +1 so the final window, starting at len(seq)-k, is counted;
                # the range is empty when the sequence is shorter than k
                for kmer_idx in range(len(seq) - k + 1):
                    kmer = seq[kmer_idx:kmer_idx + k]
                    if 'N' not in kmer:  # Ignore motifs with N as a nucleotide
                        massDict[k][kmer] = massDict[k].get(kmer, 0) + 1

        # finalDict: collapsed counts, stored under BOTH the k-mer and its
        # reverse complement so either spelling can be looked up later
        finalDict = {k: dict() for k in lengths}
        for k in lengths:
            observed = massDict[k]
            collapsed = finalDict[k]
            for kmer in observed:
                if kmer in collapsed:  # already filled in via its reverse complement
                    continue
                partner = revComp(kmer)
                total = observed[kmer]
                if partner != kmer:  # reverse-complement palindromes are not doubled
                    total += observed.get(partner, 0)
                collapsed[kmer] = total
                collapsed[partner] = total
        return finalDict


    def outputMake(self):
        '''
        Creates the sorted list of output entries. For every k-mer pair with
        k in minMotif..maxMotif this computes the collapsed count, the
        expectation under a Markov(K-2) Model (K = motif length), and the
        z-score using a binomial standard deviation.

        Each entry is (pair, count, expectation, zscore, k), where pair is the
        sorted list [kmer, reverse complement]. Entries are ordered by motif
        length (descending) and then by z-score (ascending).
        '''
        finalList = []  # List which will hold tuples of printable output
        for k in range(self.minMotif, self.maxMotif + 1):
            banned = set()  # k-mers already reported, directly or as a reverse complement
            for kmer in self.ctDict[k]:
                if kmer in banned:
                    continue
                reverseCompKmer = revComp(kmer)
                banned.add(kmer)
                banned.add(reverseCompKmer)
                tmpList = sorted([kmer, reverseCompKmer])  # [sequence, reverse] in alphabetical order
                # Compute count entry
                count = self.ctDict[k][kmer]
                # Compute expectation entry: c(prefix) * c(suffix) / c(middle)
                Lkmer = self.ctDict[k-1][kmer[:-1]]
                Rkmer = self.ctDict[k-1][kmer[1:]]
                Mkmer = self.ctDict[k-2][kmer[1:-1]]
                expectation = (Lkmer*Rkmer)/Mkmer  # Mean following binomial distribution
                # Compute zscore entry
                # NOTE: the z-score is undefined if expectation >= self.N
                # (the term under the square root is then zero or negative).
                sd = (expectation * (1 - expectation/self.N))**0.5  # Standard deviation following binomial distribution
                zscore = (count - expectation)/sd
                # k (motif length) is included for sorting purposes
                finalList.append((tmpList, count, expectation, zscore, k))
        # Longest motifs first; within one length, lowest z-score first.
        finalList.sort(key=lambda entry: (-entry[4], entry[3]))
        return finalList


# Main function 
Main function is written here. This function handles argument parsing, input, and output.

In [5]:
### REVERSE COMPLEMENT FUNCTION ###
def revComp(seqStr):
    '''
    Return the reverse complement of seqStr.

    Each base is swapped for its partner (A<->T, C<->G, N stays N) and the
    result is reversed. Any other character raises KeyError.
    '''
    pairing = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
    complemented = ''.join(map(pairing.__getitem__, seqStr))
    return complemented[::-1]

def main(infile, outfile='', inCL=None):
    '''
    Parse the options, read the fasta input once, compute the motif
    statistics and report them.

    Parameters:
        infile:  path of the fasta file handed to FastAreader.
        outfile: path of the report file; when it is the empty string the
                 report is printed instead.
        inCL:    option list handed to CommandLine (None lets CommandLine
                 read the real command line).

    The report starts with the total sequence length N and a column header,
    followed by one line per motif pair whose z-score is below the cutoff:
    sequence, reverse complement, observed count, expected count, z-score.
    Lines appear in the order of SearchForMissing.output.
    '''

    parsedArgs = CommandLine(inCL).args
    minMotif = parsedArgs.minMotif
    maxMotif = parsedArgs.maxMotif
    cutoff = parsedArgs.cutoff

    def readData(infile:'str') -> 'list':
        '''
        Reads in fasta file as input.
        Returns a list holding the sequence (not the header) of every record.
        '''
        return [record[1] for record in FastAreader(infile).readFasta()]

    seqList = readData(infile) # Input is read exactly once

    sfm = SearchForMissing(seqList, minMotif, maxMotif, cutoff)

    def report(stream):
        ''' Print the report to stream; None means standard output. '''
        print(f'N = {sfm.N}', file=stream)
        print("{0:8}:{1:>8}\t{2:5}\t{3:>7}\t{4:1}".format("sequence","reverse","count","Expect","Zscore"), file=stream)
        for row in sfm.output:
            if row[3] < cutoff:
                print('{0:8}:{1:8}\t{2:0d}\t{3:0.2f}\t{4:0.2f}'.format(row[0][0],row[0][1], row[1], row[2],row[3]), file=stream)

    if len(outfile) > 0:
        with open(outfile, 'w') as myfile:
            report(myfile)
    else:
        report(None)

In [6]:
if __name__ == "__main__":
    # Usage:
    #     main(infile='FILE_PATH', outfile='FILE_PATH',
    #          inCL=['--minMotif', '3', '--maxMotif', '8', '--cutoff', '-4'])
    # Arguments:
    #     --minMotif
    #         Minimum motif length to consider for Markov Model
    #     --maxMotif
    #         Maximum motif length to consider for Markov Model
    #     --cutoff
    #         Significant Z-score cutoff value
    main(
        infile='mdata/Arthrospira-platensis-NIES-39.fna',
        outfile='',
        inCL=['--minMotif', '3', '--maxMotif', '8', '--cutoff', '-4'],
    )

N = 6788435
sequence: reverse	count	 Expect	Zscore
GCGATCGC:GCGATCGC	7683	31752.94	-135.39
AATTAATT:AATTAATT	558	2580.47	-39.82
AAAATTTT:AAAATTTT	501	2185.49	-36.04
TTAATTAA:TTAATTAA	537	2205.76	-35.54
AAATATTT:AAATATTT	365	1705.30	-32.46
ATAATTAT:ATAATTAT	364	1681.01	-32.13
TTTTAAAA:TTTTAAAA	298	1251.69	-26.96
CAAATTTG:CAAATTTG	245	1158.42	-26.84
TAATATTA:TAATATTA	296	1222.71	-26.50
TAAATTTA:TAAATTTA	413	1380.29	-26.04
GATTAATC:GATTAATC	390	1342.07	-25.99
TGATATCA:TGATATCA	300	1184.38	-25.70
TATTAATA:TATTAATA	385	1310.43	-25.57
ATCGCGAT:ATCGCGAT	294	1155.39	-25.34
AATATATT:AATATATT	255	1051.57	-24.57
ATTTAAAT:ATTTAAAT	195	945.67	-24.41
GTTTAAAC:GTTTAAAC	269	1057.29	-24.25
CCCTAGGG:CCCTAGGG	232	997.07	-24.23
CAATATTG:CAATATTG	230	971.23	-23.79
ACAATTGT:ACAATTGT	115	750.77	-23.20
CATTAATG:CATTAATG	165	800.99	-22.47
CTTTAAAG:CTTTAAAG	260	935.23	-22.08
TTAGCTAA:TTAGCTAA	226	865.22	-21.73
AAAATTTA:TAAATTTT	834	1736.84	-21.67
GGTTAACC:GGTTAACC	191	798.36	-21.50
TTGATCAA:TTGATCAA	225	851.57	

CTCGAC  :GTCGAG  	1740	2886.86	-21.35
ACCGGG  :CCCGGT  	3897	5359.24	-19.98
AACGTA  :TACGTT  	1188	2081.95	-19.60
AAGCTC  :GAGCTT  	1317	2241.92	-19.54
AGGCCG  :CGGCCT  	430	1067.63	-19.52
GCATGA  :TCATGC  	1758	2780.43	-19.39
AGGCCA  :TGGCCT  	712	1445.86	-19.30
AGGCCT  :AGGCCT  	241	766.73	-18.99
ACATGA  :TCATGT  	2000	2979.85	-17.95
ACATGG  :CCATGT  	2093	3063.83	-17.54
GGGCCA  :TGGCCC  	758	1406.35	-17.29
CCGCGC  :GCGCGG  	1545	2336.11	-16.37
ACGCGC  :GCGCGT  	970	1626.98	-16.29
ATGCAG  :CTGCAT  	2323	3250.96	-16.28
GTCGAA  :TTCGAC  	1962	2821.17	-16.18
GATCCC  :GGGATC  	929	1559.99	-15.98
CTCGAA  :TTCGAG  	2452	3363.73	-15.72
CCATGC  :GCATGG  	2036	2858.79	-15.39
CACGTA  :TACGTG  	214	583.47	-15.30
CGCGCC  :GGCGCG  	2347	3208.70	-15.22
CACGTG  :CACGTG  	139	464.79	-15.11
ATGCAC  :GTGCAT  	1580	2303.09	-15.07
ATTCCG  :CGGAAT  	3148	4095.08	-14.80
GCCGGA  :TCCGGC  	3345	4296.71	-14.52
ACTAGC  :GCTAGT  	2882	3764.86	-14.39
GAGCCC  :GGGCTC  	853	1376.73	-14.12
ACACGT  :ACGTGT  	69	317

In [7]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Maxim Firsov
# Hsiang-Yun Lu (Eloise)
# Gabriel Aguiar

# RESPONSES
# - Markdown comments
# - Get more docstrings
# - Fix Null model
# - Finish main function

# CORRECTIONS
# - Made markdown comments
# - Made more docstrings and comments
# - Infile is only read once in the main function
# - Main function was put together
# - Null model was fixed