Chapter 7 follow along with book: Regular Expressions

In [None]:
# need to import the regular expression module to use them 

In [1]:
import re

In [None]:
# need to use the module name in front of the function from the module 

In [2]:
re.search(pattern, string)

NameError: name 'pattern' is not defined

In [None]:
# The r stands for raw, which is Python's description for a string where special characters are ignored.
# for use when the regular expression character has some other meaning

In [3]:
print(r"\t\n")

\t\n


In [4]:
dna = "ATCGCGAATTCAC"
if re.search(r"GAATTC", dna):
    print("restriction site found!")

restriction site found!


In [None]:
# re.search is a true/false function that determines whether or not a pattern appears somewhere in a string. 
# It takes two arguments, both strings. The first argument is the pattern that you want to search for, 
#and the second argument is the string that you want to search in

In [None]:
# search for a specific restriction site

In [7]:
dna = "ATCGCGAATTCAC"
if re.search(r"GGACC", dna) or re.search(r"GGTCC", dna):
    print("restriction site found!")

In [None]:
# do this with a regular expression 
# so (A|T) means either A or T
# this is alternation

In [8]:
dna = "ATCGCGAATTCAC"
if re.search(r"GG(A|T)CC", dna):
    print("restriction site found!")

In [None]:
# search for GCNGC motif 

In [9]:
dna = "ATCGCGAATTCAC"
if re.search(r"GC(A|T|G|C)GC", dna):
    print("restriction site found!")

In [None]:
# or A pair of square brackets with a list of characters inside them can 
# represent any one of these characters

In [10]:
dna = "ATCGCGAATTCAC"
if re.search(r"GC[ATGC]GC", dna):
    print("restriction site found!")

In [None]:
# a period . can be ANY character

In [None]:
# Putting a caret ^ at the start of a character group like this [^XYZ] will negate it, 
# and match any character that isn't in the group

In [None]:
# quantifiers let us describe variation in the number of times a section of a pattern is repeated

In [None]:
# A question mark immediately following a character means that that character is optional 
# – it can match either zero or one times
# if we want to apply a question mark to more than one character, we can group the characters in parentheses

In [None]:
# A plus sign immediately following a character or group means that the character or group must be present 
# but can be repeated any number of times 

In [None]:
# An asterisk immediately following a character or group means that the character or group is optional, 
# but can also be repeated

In [None]:
# Following a character or group with a single number inside curly brackets 
# will match exactly that number of repeats
# Following a character or group with a pair of numbers inside curly brackets separated with a comma 
# allows us to specify an acceptable range of number of repeats

In [None]:
# the caret symbol ^ matches the start of a string, and the dollar symbol $ matches the end of a string

In [None]:
^ATG[ATGC]{30,1000}A{5,10}$
^ATG # an ATG start codon at the beginning of the sequence
[ATGC]{30,1000} # followed by between 30 and 1000 bases which can be A, T, G or C
A{5,10}$ # followed by a poly-A tail of between 5 and 10 bases at the end of the sequence

In [None]:
# re.search will identify a pattern occurring anywhere in the string, 
# whereas re.match will only identify a pattern if it matches the entire string

In [None]:
# the results of a re.search function return a match object 

In [11]:
dna = "ATGACGTACGTACGACTG"
# store the match object in the variable m
m = re.search(r"GA[ATGC]{3}AC", dna)
print(m.group())

GACGTAC


In [None]:
# By calling the group method on the resulting match object
# we can see the part of the DNA sequence that matched, and figure out what the middle three bases were

In [None]:
# we can specify the match to extract by capturing it in ()
# We can now refer to the captured bits of the pattern by 
# supplying an argument to the group method. group(1) will return the 
# bit of the string matched by the section of the pattern in the first set of parentheses, 
# group(2) will return the bit matched by the second

In [12]:
dna = "ATGACGTACGTACGACTG"
# store the match object in the variable m
# two sets or () means 2 groups 
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
print("entire match: " + m.group())
print("first bit: " + m.group(1))
print("second bit: " + m.group(2))

entire match: GACGTACGTAC
first bit: CGT
second bit: GT


In [None]:
# The start and end methods get the positions of the start and end of the pattern on the sequence

In [13]:
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
print("start: " + str(m.start()))
print("end: " + str(m.end()))

start: 2
end: 13


In [None]:
# We can get the start and end positions of individual groups by 
# supplying a number as the argument to start and end

In [14]:
dna = "ATGACGTACGTACGACTG"
m = re.search(r"GA([ATGC]{3})AC([ATGC]{2})AC", dna)
print("start: " + str(m.start()))
print("end: " + str(m.end()))
print("group one start: " + str(m.start(1)))
print("group one end: " + str(m.end(1)))
print("group two start: " + str(m.start(2)))
print("group two end: " + str(m.end(2)))

start: 2
end: 13
group one start: 4
group one end: 7
group two start: 9
group two end: 11


In [None]:
# split the DNA string wherever we see a base that isn't A, T, G or C

In [15]:
dna = "ACTNGCATRGCTACGTYACGATSCGAWTCG"
runs = re.split(r"[^ATGC]", dna)
print(runs)

['ACT', 'GCAT', 'GCTACGT', 'ACGAT', 'CGA', 'TCG']


In [None]:
# re.findall returns a list of all matches of a pattern in a string. 
# The first argument is the pattern, and the second argument is the string

In [16]:
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
runs = re.findall(r"[AT]{4,100}", dna)
print(runs)

['ATTATAT', 'AAATTATA']


In [None]:
# the return object is a list of strings
# if we want positions, we need a match object 
# finditer returns a sequence of match objects, 
# so to do anything useful with it, we need to use the return value in a loop

In [17]:
dna = "ACTGCATTATATCGTACGAAATTATACGCGCG"
runs = re.finditer(r"[AT]{3,100}", dna)
for match in runs:
    run_start = match.start()
    run_end = match.end()
    print("AT rich region from " + str(run_start) + " to " + str(run_end))

AT rich region from 5 to 12
AT rich region from 18 to 26


Exercises

In [None]:
# Accession names 
# xkn59438, yhdck2, eihd39d9, chdsye847, hedle3455, xjhd53e, 45da, de37dp

In [None]:
# Write a program that will print only the accession names that satisfy the following criteria separately
# contain the number 5
# contain the letter d or e
# contain the letters d and e in that order
# contain the letters d and e in that order with a single letter between them
# contain both the letters d and e in any order
# start with x or y
# start with x or y and end with e
# contain three or more numbers in a row
# end with d followed by either a, r or p

In [None]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# think all of these have to be a loop to go through each accession

In [27]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which contain the number 5 
for acc in accession:
    if re.search("5", acc): # conditional so it'll only print if the condition is true
        print(acc)

xkn59438
hedle3455
xjhd53e
45da


In [32]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which contain the letter d or e
for acc in accession:
    if re.search(r"d|e", acc): # use pipe to specify d or e 
        print(acc)

yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp


In [33]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which contain the letters d and e in that order
for acc in accession:
    if re.search(r"de", acc): # specific string just for de 
        print(acc)

de37dp


In [34]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which contain the letters d and e in that order with a single letter between them
for acc in accession:
    if re.search(r"d.e", acc): # this will to any character between d and e, not sure how to specify just letter 
        print(acc)

hedle3455


In [36]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which contain both the letters d and e in any order
for acc in accession:
    if re.search(r"d", acc) and re.search(r"e", acc): # need to do and to specify both
        print(acc)

eihd39d9
chdsye847
hedle3455
xjhd53e
de37dp


In [46]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which start with x or y
for acc in accession:
    if acc.startswith("x") or acc.startswith("y"):
        print(acc)

xkn59438
yhdck2
xjhd53e


In [48]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which start with x or y and ends with e 
for acc in accession:
    if (acc.startswith("x") or acc.startswith("y")) and acc.endswith("e"): # need parenthesises around x and y 
        print(acc)

xjhd53e


In [53]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which contain three or more numbers in a row
for acc in accession:
    if re.search("[0-9]{3}", acc): # search for 0-9 three times in a row 
        print(acc)

xkn59438
chdsye847
hedle3455


In [54]:
accession = ["xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"] 
# make list of accession names 
# which end with d followed by either a, r or p 
# this doesn't make any sense because how can it end with d and also a, r or p????
for acc in accession:
    if acc.endswith("da") or acc.endswith("dr") or acc.endswith("dp"): # there might be a better way but idk
        print(acc)

45da
de37dp


In [None]:
# double digest
# Predict the fragment lengths that we will get if we digest the sequence with two made-up 
# restriction enzymes – AbcI, whose recognition site is ANT*AAT, and AbcII, whose recognition 
# site is GCRW*TG (asterisks indicate the position of the cut site)

In [57]:
digestfile = open("/Users/maggieschedl/Desktop/Github/Python_4_Biologists_Learning_Group/exercises_and_examples/regular_expressions/exercises/dna.txt", "r")
# open the file in read mode
# check what it looks like 
file = digestfile.read()
print(file)

ATGGCAATAACCCCCCGTTTCTACTTCTAGAGGAGAAAAGTATTGACATGAGCGCTCCCGGCACAAGGGCCAAAGAAGTCTCCAATTTCTTATTTCCGAATGACATGCGTCTCCTTGCGGGTAAATCACCGACCGCAATTCATAGAAGCCTGGGGGAACAGATAGGTCTAATTAGCTTAAGAGAGTAAATCCTGGGATCATTCAGTAGTAACCATAAACTTACGCTGGGGCTTCTTCGGCGGATTTTTACAGTTACCAACCAGGAGATTTGAAGTAAATCAGTTGAGGATTTAGCCGCGCTATCCGGTAATCTCCAAATTAAAACATACCGTTCCATGAAGGCTAGAATTACTTACCGGCCTTTTCCATGCCTGCGCTATACCCCCCCACTCTCCCGCTTATCCGTCCGAGCGGAGGCAGTGCGATCCTCCGTTAAGATATTCTTACGTGTGACGTAGCTATGTATTTTGCAGAGCTGGCGAACGCGTTGAACACTTCACAGATGGTAGGGATTCGGGTAAAGGGCGTATAATTGGGGACTAACATAGGCGTAGACTACGATGGCGCCAACTCAATCGCAGCTCGAGCGCCCTGAATAACGTACTCATCTCAACTCATTCTCGGCAATCTACCGAGCGACTCGATTATCAACGGCTGTCTAGCAGTTCTAATCTTTTGCCAGCATCGTAATAGCCTCCAAGAGATTGATGATAGCTATCGGCACAGAACTGAGACGGCGCCGATGGATAGCGGACTTTCGGTCAACCACAATTCCCCACGGGACAGGTCCTGCGGTGCGCATCACTCTGAATGTACAAGCAACCCAAGTGGGCCGAGCCTGGACTCAGCTGGTTCCTGCGTGAGCTCGAGACTCGGGATGACAGCTCTTTAAACATAGAGCGGGGGCGTCGAACGGTCGAGAAAGTCATAGTACCTCGGGTACCAACTTACTCAGGTTATTGCTTGAAGCTGTACTATTTTAGGGGGGGAGCGCTGAAGG

In [92]:
digestfile = open("/Users/maggieschedl/Desktop/Github/Python_4_Biologists_Learning_Group/exercises_and_examples/regular_expressions/exercises/dna.txt")
# open the file in read mode
# start by finding all sites that are ANTAAT where N is any base ATCG 
starts = [] #create empty list for the cutsite start positions

for line in digestfile:
    AbcI = re.finditer(r"A[ATCG]{1}TAAT", line) # find the cutsite
    for match in AbcI: # tell me each match
        AbcI_start = match.start() # tell me where it starts
        starts.append(str(AbcI_start)) # add these numbers to a list 
        print("cutsite starts at " + str(AbcI_start)) # what are the cutsites
        print(starts) # check the list
        
# separate list into first and seconds 
# add 3 because the fragment includes 3 bases of the cutsite 
first = int(starts[0]) + 3
second = int(starts[1]) + 3 
print(first) # check number

# this stuff doesn't work because chunk too long?
# file = digestfile.read()
# print(file)
# make the first fragment
# first_frag = file[0:10]
# print(first_frag)

cutsite starts at 1140
['1140']
cutsite starts at 1625
['1140', '1625']
1143




In [102]:
# keep going because chunk was too long 
# read in file again
digestfile = open("/Users/maggieschedl/Desktop/Github/Python_4_Biologists_Learning_Group/exercises_and_examples/regular_expressions/exercises/dna.txt")
file = digestfile.read()
print(file)
# first fragment 
first_frag = file[0:(first + 1)] # # not inclusive to have to add 1
print(first_frag)
second_frag = file[first:(second + 1)] # not inclusive to have to add 1 
print(second_frag)
# need the length for the last fragment 
length = len(file)
third_frag = file[second:length]
print(third_frag)
# now get the lengths 

print("The AbcI enzyme makes 3 fragments with lengths of " + str(len(first_frag)) + ", " + str(len(second_frag)) + ", and " + str(len(third_frag)) + " bases.")

ATGGCAATAACCCCCCGTTTCTACTTCTAGAGGAGAAAAGTATTGACATGAGCGCTCCCGGCACAAGGGCCAAAGAAGTCTCCAATTTCTTATTTCCGAATGACATGCGTCTCCTTGCGGGTAAATCACCGACCGCAATTCATAGAAGCCTGGGGGAACAGATAGGTCTAATTAGCTTAAGAGAGTAAATCCTGGGATCATTCAGTAGTAACCATAAACTTACGCTGGGGCTTCTTCGGCGGATTTTTACAGTTACCAACCAGGAGATTTGAAGTAAATCAGTTGAGGATTTAGCCGCGCTATCCGGTAATCTCCAAATTAAAACATACCGTTCCATGAAGGCTAGAATTACTTACCGGCCTTTTCCATGCCTGCGCTATACCCCCCCACTCTCCCGCTTATCCGTCCGAGCGGAGGCAGTGCGATCCTCCGTTAAGATATTCTTACGTGTGACGTAGCTATGTATTTTGCAGAGCTGGCGAACGCGTTGAACACTTCACAGATGGTAGGGATTCGGGTAAAGGGCGTATAATTGGGGACTAACATAGGCGTAGACTACGATGGCGCCAACTCAATCGCAGCTCGAGCGCCCTGAATAACGTACTCATCTCAACTCATTCTCGGCAATCTACCGAGCGACTCGATTATCAACGGCTGTCTAGCAGTTCTAATCTTTTGCCAGCATCGTAATAGCCTCCAAGAGATTGATGATAGCTATCGGCACAGAACTGAGACGGCGCCGATGGATAGCGGACTTTCGGTCAACCACAATTCCCCACGGGACAGGTCCTGCGGTGCGCATCACTCTGAATGTACAAGCAACCCAAGTGGGCCGAGCCTGGACTCAGCTGGTTCCTGCGTGAGCTCGAGACTCGGGATGACAGCTCTTTAAACATAGAGCGGGGGCGTCGAACGGTCGAGAAAGTCATAGTACCTCGGGTACCAACTTACTCAGGTTATTGCTTGAAGCTGTACTATTTTAGGGGGGGAGCGCTGAAGG

In [None]:
# Next is AbcII enzyme with GCRW*TG cutsite 
# R is G or C 
# W is A or T 

In [106]:
# going to do this a lot like the last one 
digestfile = open("/Users/maggieschedl/Desktop/Github/Python_4_Biologists_Learning_Group/exercises_and_examples/regular_expressions/exercises/dna.txt")
# open the file in read mode
beginnings = [] #create empty list for the cutsite start positions

for line in digestfile:
    AbcII = re.finditer(r"GC[GC]{1}[AT]{1}TG", line) # find the cutsite
    for match in AbcII: # tell me each match
        AbcII_start = match.start() # tell me where it starts
        beginnings.append(str(AbcII_start)) # add these numbers to a list 
        print("AbcII cutsite starts at " + str(AbcII_start)) # what are the cutsites
        print(beginnings) # check the list

# ok there is only one cutsite for this enzyme
# so this didn't have to be a list but thats ok didn't know 
# add 3 because the fragment includes 3 bases of the cutsite 
AbcII_st = int(beginnings[0]) + 3
print(AbcII_st) # check number


AbcII cutsite starts at 484
['484']
487


In [112]:
digestfile = open("/Users/maggieschedl/Desktop/Github/Python_4_Biologists_Learning_Group/exercises_and_examples/regular_expressions/exercises/dna.txt")
file = digestfile.read()
# need to read it in a separate chunk for some reason 
# get the first chunk for this one
new_first = file[0:(AbcII_st + 1)] # not inclusive to have to add 1
new_second = file[AbcII_st:length]

print("The AbcII enzyme makes 2 fragments legnths of " + str(len(new_first)) + " and " +  str(len(new_second)) + " bases.")

The AbcII enzyme makes 2 fragments legnths of 488 and 1526 bases.
