# Finding a Motif in DNA
**Given:** Two DNA strings *s* and *t* (each of length at most 1 kbp).

**Return:** All locations of *t* as a substring of *s*.

# Sample Dataset

In [1]:
%%file Sample_Dataset.txt
GATATATGCATATACTT
ATAT



Overwriting Sample_Dataset.txt


# Sample Output

In [2]:
%%file Sample_Output.txt
2 4 10



Overwriting Sample_Output.txt


# Solution

In [3]:
import re

def findMotifInDNA(s, t):
    """Return the 1-based start positions of every occurrence of t in s.

    Occurrences may overlap. The motif is wrapped in a zero-width lookahead,
    so a match consumes no characters and the scan can find another
    occurrence starting inside the previous one. The motif is escaped, so it
    is always matched literally rather than as a regex pattern.
    """
    overlapping_motif = re.compile("(?=%s)" % re.escape(t))

    # Match.start() is 0-based; the expected answer counts positions from 1.
    return [hit.start() + 1 for hit in overlapping_motif.finditer(s)]

def findMotifInDNAFromFileToFile(input_file_path, output_file_path):
    """Wrap findMotifInDNA to read from input_file_path and write to output_file_path.

    The input file holds the DNA string s on its first line and the motif t
    on its second line; surrounding whitespace on each line is stripped.
    The output file receives the positions returned by findMotifInDNA,
    separated by single spaces and terminated by a newline.

    Raises:
        IndexError: if the input file contains fewer than two lines.
    """
    # Context managers close each handle even when an error is raised.
    with open(input_file_path, 'r') as input_file:
        input_strings = input_file.readlines()

    s = input_strings[0].strip()
    t = input_strings[1].strip()

    one_indexed_motif_locations = findMotifInDNA(s, t)

    output_string = " ".join(str(motif_location) for motif_location in one_indexed_motif_locations)

    # Open the output only after the input has been parsed and solved, so a
    # malformed input file cannot truncate an existing output file.
    with open(output_file_path, 'w') as output_file:
        output_file.write("%s\n" % output_string)


# Test Solution

In [4]:
# Solve the sample dataset; the result is checked against Sample_Output.txt in the cells that follow.
findMotifInDNAFromFileToFile(
    input_file_path="Sample_Dataset.txt",
    output_file_path="Test_Output.txt",
)

In [5]:
%%bash
# Show the sample output file: its name, its MD5 checksum, then its contents.
output_file=Sample_Output.txt
echo "$output_file"
md5sum "$output_file"
cat "$output_file"

Sample_Output.txt
bf37e5ca54e2dde9fa36d57a40fa8901  Sample_Output.txt
2 4 10


In [6]:
%%bash
# Show the generated test output file: its name, its MD5 checksum, then its contents.
output_file=Test_Output.txt
echo "$output_file"
md5sum "$output_file"
cat "$output_file"

Test_Output.txt
bf37e5ca54e2dde9fa36d57a40fa8901  Test_Output.txt
2 4 10


In [7]:
%%bash
# Compare the two files byte for byte. cmp -s reports the result only through
# its exit status, which is non-zero when the files differ or cannot be read.
# Comparing unquoted md5sum output inside [ ] is fragile: with both files
# missing the test collapses to `[ == ]`, which is true and reports a match.
if cmp -s Sample_Output.txt Test_Output.txt
then
    echo Sample output matches test output.
else
    echo Sample output does not Match test output.
fi

Sample output matches test output.


# Downloaded Dataset

In [8]:
%%bash
# Copy the downloaded dataset into the working directory, then print it.
# NOTE(review): assumes the file was saved to ~/Downloads — adjust the path if it lives elsewhere.
cp ~/Downloads/rosalind_subs.txt ./
cat rosalind_subs.txt

GGCAGTATGCAGTATGGCAGTATGCAGTATGCAGTATGACAGTATGAGGTTTCAGTATGCCAGTATGTACAGTATGCGTCAGTATGTCAGTATGTGGCAGTATGCAGTATGCAGTATGCAGTATGGAACAGTATGCAGTATGCAGTATGTCAGTATGAGTTCAGTATGATCAGTATGCCTTGGACCCAGTATGCAGTATGATACAGTATGCAGTATGTCAGTATGATTCAGTATGTCAGTATGCCAGTATGCAGTATGATAACACAGTATGCAACCAGTATGCAGTATGCACAGTATGCCAGTATGCAGTATGGAGACATCGTCAGTATGCAGTATGCAGTATGCGCAGTATGCAGTATGAGCACAGTATGAGCTCACGCAGTATGCAGTATGTAACAGTATGCAGTATGTTCCAGTATGATTTTGTCAGTATGGTGCAGTATGCAGTATGGAAAGCAGTATGGATCAGTATGGTCAGTATGCAGTATGTACAGTATGCAGTATGATGCGCAGTATGCCGTCCAGTATGCGCAGTATGCAGTATGGAGATCAGTATGTCCAACCAGTATGTGGCAGTATGCAGTATGCAGTATGTACCCCAGTATGCAGTATGCAGTATGCAGTATGCAACAGTATGCCAGTATGACCAGTATGCAGTATGGTCGCCAGTATGAGCAGTATGTCGCAGTATGCAGTATGGGAAGCAGTATGCCAGTATGCTCAGTATGCCAGTATGGATCAGTATGTACCAGTATGATGTACAGTATGGGTGAACAGTATGCAGTATGTCAGTATGGCAGTATGTACCAGTATGTCATAACAGTATGGCAGTATGAGTGGGGCAGTATGCAGTATGCGGCAGTATGCCAGTATGCAGTATGCAGTATGCTGCAGTATGATCAGTATGACGGAACACACAGTATGACGGCAGTATGCAGTATGGCTGCAGTATGTCTTCAATTTAACAGTATGCAGTATGATCAGTATG
CAGTATGCA


# Solution to Downloaded Dataset

In [9]:
# Solve the downloaded dataset and store the answer in Solution_Output.txt.
findMotifInDNAFromFileToFile(
    input_file_path="rosalind_subs.txt",
    output_file_path="Solution_Output.txt",
)

In [10]:
%%bash
# Print the motif positions written for the downloaded dataset.
cat Solution_Output.txt

3 18 25 98 105 112 129 136 187 204 245 265 276 283 300 324 331 347 380 397 438 476 492 532 574 581 600 607 614 621 648 686 775 843 868 875 929 966
