In [1]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_randomizedMotifSearch.ipynb
# Purpose: to find the promoter motif
#   main(infile='FILE_PATH',outfile='FILE_PATH', inCL=['-i 1000', '-p 1', '-k 13'])
#
# Author: Nicholas Chan
# History: 10/10/2021 Created
########################################################################

# Command Line Class
Provided by Dr. B for parsing command line arguments

In [2]:
########################################################################
# CommandLine
########################################################################


class CommandLine():
    """
    Handle the command line, usage and help requests.

    CommandLine uses argparse to implement a standard command line argument
    parser with the options below plus the standard usage and help output.

    Attributes:
        parser: the configured argparse.ArgumentParser.
        args:   the parsed argparse.Namespace with
                iterations (int, default 1000),
                motifLength (int, default 13) and
                pseudocount (float, default 1.0).
    """

    def __init__(self, inOpts = None):
        """
        CommandLine constructor.

        Implement a parser to interpret the command line argv string using argparse.

        Args:
            inOpts: optional list of argument strings to parse. When None,
                    argparse reads the arguments from sys.argv instead.
        """
        import argparse
        self.parser = argparse.ArgumentParser(
            description='Program prolog - a brief description of what this thing does',
            epilog='Program epilog - some other stuff you feel compelled to say',
            add_help=True,  # default is True
            prefix_chars='-',
            usage='%(prog)s [options] -option1[default] <input >output'
            )

        self.parser.add_argument('-i', '--iterations', type=int, default=1000, action='store', help='number of times of iterations to find consensus motif')
        self.parser.add_argument('-k', '--motifLength', type=int, default=13, action='store', help='length of the target promoter motif')
        # argparse applies `type` only to string values, so the default must
        # already be a float; otherwise args.pseudocount would be an int when
        # the option is omitted and a float when it is supplied.
        self.parser.add_argument('-p', '--pseudocount', type=float, default=1.0, action='store', help='number of pseudocounts')
        # Command line option to use Gibbs sampling to find the optimal consensus motif.
        # self.parser.add_argument('-g', '--gibbsampling', type=float, default=1, action='store', help='implement Gibbs sampling')

        if inOpts is None:
            self.args = self.parser.parse_args()
        else:
            self.args = self.parser.parse_args(inOpts)

# FastAreader Class
Provided by Dr. B for reading Fasta Files.

In [3]:

import sys

class FastAreader():
    """
    Read FASTA records from a named file, or from standard input when no
    file name is given.
    """
    def __init__(self, fname=''):
        """ Constructor: saves attribute fname ('' means read from sys.stdin). """
        self.fname = fname

    def doOpen(self):
        """ Return the input handle: sys.stdin when fname is '', else the opened file."""
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readFasta(self):
        """
        Read in a fasta file and yield header and sequences separately.

        Yields:
            (header, sequence) tuples, one per record. The header is the text
            after '>' with trailing whitespace removed; the sequence is the
            record's lines joined together, with all whitespace removed and
            letters upper-cased.

        Lines before the first '>' header are skipped. If the input contains
        no header line at all (including empty input), nothing is yielded.
        """
        # Note: the with-block closes the handle on exit, which includes
        # sys.stdin when no file name was given.
        with self.doOpen() as fileH:

            # skip to first fasta header
            line = fileH.readline()
            # readline() returns '' at end of input; without the `line` check
            # an input with no '>' line would spin here forever.
            while line and not line.startswith('>'):
                line = fileH.readline()
            if not line:
                return
            header = line[1:].rstrip()
            sequence = ''

            # Separate headers and sequences
            for line in fileH:
                if line.startswith('>'):
                    # A new header closes the record collected so far.
                    yield header, sequence
                    header = line[1:].rstrip()
                    sequence = ''
                else:
                    sequence += ''.join(line.rstrip().split()).upper()

        # Emit the final record, which has no following header to trigger it.
        yield header, sequence

# Assignment 2 Notes

### NULL Model
- For a Markov(1) model, the probability of each base is conditioned on the one base that precedes it
    - Ex:  
    
        $P(A|T_{prev}) = \frac{P(TA)}{P(T)} = \frac{\frac{c(TA)}{N}}{\frac{c(T)}{N}} = \frac{c(TA)}{c(T)}$  
        
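        - Strictly, N is not the same in the cases of TA and T (a sequence of length L contains L single bases but only L - 1 overlapping dinucleotides), but the difference is negligible for now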
- Increasing the order of a Markov Model makes our model more informed
- Check Week 2 Overview Video 3 for more details on the Markov model for a 4-mer described by $k_{1}k_{2}k_{3}k_{4}$

In [None]:
import sys
import os 
import numpy as np
from fastaReader import FastAreader

# holds files
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# make the choice of dataList[0] reproducible across runs and machines.
dataList = sorted(os.listdir('mdata'))
# Show the working directory contents and the same (sorted) list that is indexed below.
print(f"{os.listdir()} \n{dataList}")
print('mdata/'+dataList[0])

### SEQUENCE LIST ###
# readFasta() yields one record per FASTA entry; item 1 of each record is the
# sequence string, which is all that is kept here.
readData = FastAreader('mdata/'+dataList[0]).readFasta()
seqList = [record[1] for record in readData]

### REVERSE COMPLEMENT FUNCTION ###
def revComp(seqStr):
    """
    Return the reverse complement of a DNA string.

    Every base is replaced by its partner (A<->T, C<->G, N stays N) and the
    complemented bases are then returned in reverse order. A character other
    than A, T, C, G or N raises KeyError.
    """
    pairing = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
    # Complement in the original reading direction first, then flip the order.
    complemented = [pairing[base] for base in seqStr]
    return ''.join(reversed(complemented))