In [1]:
# imports module dealing with regex
import re

In [8]:
# A raw string (r"...") keeps backslashes literal, so \t and \n are printed
# as typed; in a normal string they turn into a tab and a newline.
print(r"\t\n", "\tspam\n", r"\tspam\n", sep="\n")

# this won't work, though:
# print(r""")


\t\n
	spam

\tspam\n


In [None]:
## Searching for a pattern in a string

In [9]:
dna = "ATCGCGAATTCAC"

# re.search(pattern, string) scans the string for the pattern and returns
# a match object if it occurs anywhere in the string, or None if it does not.
site_match = re.search(r"GAATTC", dna)
if site_match is not None:
    print("restriction site found!")

restriction site found!


In [17]:
# (A|T) is an alternation: "A or T at this position"
dna = "ATCGCGAATTCAC"
print(
    "restriction site found!"
    if re.search(r"GG(A|T)CC", dna)
    else "restriction site not found!"
)

print("\n" + "another sequence ..." + "\n")

# same pattern, but this sequence contains GGACC
dna = "ATCGCGGACCTCAC"
print(
    "restriction site found!"
    if re.search(r"GG(A|T)CC", dna)
    else "restriction site not found!"
)

restriction site not found!

another sequence ...

restriction site found!


In [18]:
 ## Character groups

In [20]:
# a character group lists every character allowed at one position:
# GC(A|T|G|C) = GC[ATGC]
site_pattern = re.compile(r"GC[ATGC]GC")

dna = "ATCGCGAATTCAC"
if site_pattern.search(dna) is None:
    print("restriction site not found!")
else:
    print("restriction site found!")

print("\n" + "another sequence ..." + "\n")

dna = "ATCGCGGCAGCATTCAC"
if site_pattern.search(dna) is None:
    print("restriction site not found!")
else:
    print("restriction site found!")
   

restriction site not found!

another sequence ...

restriction site found!


In [22]:
# putting a caret at the front of a character group negates it:
# [^XYZ] matches any single character that is NOT X, Y or Z

In [35]:
# a question mark following a character (or a group) means:
# match it 0 or 1 times
cases = [
    (r"GAT?C", "GATC"),      # the optional T is present
    (r"GAT?C", "GAC"),       # the optional T is absent
    (r"GAt?C", "GATC"),      # matching is case-sensitive: "t" is not "T"
    (r"GA(T|t)?", "GAtC"),   # optional group, and no C is required after it
    (r"GA(T|t)?C", "GAtC"),  # the optional group accepts the lowercase t
    (r"GA[^ACG]C", "GATC"),  # negated group: T is not A, C or G
]

# one "yes"/"no" line per case, in the order listed above
for pattern, text in cases:
    print("yes" if re.search(pattern, text) else "no")

yes
yes
no
yes
yes
yes


In [44]:
# an asterisk means the preceding character may appear zero or more times
a_run = re.compile(r"GGGA*TTT")

# the lowercase "aaa" interrupts the run of A's, so this one does not match
if a_run.search("GGGAAAAAAAaaaAAAAAAAAAAAAAAATTT") is not None:
    print("yes")
print("\n"+"+")
# an uninterrupted run of A's between GGG and TTT matches
if a_run.search("GGGAAAAAAAAAAAAAAAAAAAAAATTT") is not None:
    print("yes")


+
yes


In [45]:
# to match a range of repeats, use curly brackets

In [48]:
# you can combine these!!!
# ^AUG[AUGC]{30,1000}A{5,10}$
#   ^AUG             the string has to begin with AUG
#   [AUGC]{30,1000}  then 30 to 1000 characters, each one A, U, G or C
#   A{5,10}$         then a tail of 5 to 10 A's at the end
# in action:
rna = "AUGAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAUGCAAAAA"
transcript = re.compile(r"^AUG[AUGC]{30,1000}A{5,10}$")
if transcript.search(rna) is not None:
    print("combining stuff works!!!")

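# re.match : only matches at the START of the string (it does not search further along);
# to require the ENTIRE STRING to match, use re.fullmatch, or anchor the pattern with ^ and $ as above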

combining stuff works!!!


In [54]:
dna = "ATCGCGYAATTCAC"

# [^ATGC] matches any single character other than A, T, G or C
has_ambiguous = re.search(r"[^ATGC]", dna) is not None
if has_ambiguous:
    print("ambiguous base found!")

ambiguous base found!


In [56]:
dna = "GCATNCGGAACGATC"
m = re.search(r"[^ATGC]", dna)

# re.search returns a match object on success and None on failure;
# group() gives back the text that matched
if m is not None:
    ambig = m.group()
    print("ambiguous base found!")
    print(f"the base is {ambig}")

print("\n \n")

# re.search stops at the first match, so only the first of the three
# ambiguous bases in this sequence is reported
dna = "GCATNYNGGATTC"
m = re.search(r"[^ATGC]", dna)
if m is not None:
    ambig = m.group()
    print("ambiguous base(s) found!")
    print(f"the base(s) is/are {ambig}")

ambiguous base found!
the base is N

 

ambiguous base(s) found!
the base(s) is/are N


In [71]:
# adding + makes the pattern match a RUN of one or more consecutive
# ambiguous bases instead of a single character
dna = "GCATNYNGGATTC"
m = re.search(r"([^ATGC]+)", dna)
if m is not None:
    ambig = m.group()
    print("ambiguous base(s) found!")
    print("the base(s) is/are:")
    # one base per line
    print(*ambig, sep="\n")

print("\n")

# caveat: re.search still returns only the first run, so the N that comes
# after the G in this sequence is not reported
dna = "GCATNYGNGATTC"
m = re.search(r"([^ATGC]+)", dna)
if m is not None:
    ambig = m.group()
    print("ambiguous base(s) found!")
    print("the base(s) is/are:")
    print(*ambig, sep="\n")

ambiguous base(s) found!
the base(s) is/are:
N
Y
N


ambiguous base(s) found!
the base(s) is/are:
N
Y


In [65]:
scientific_name = "Homo sapiens"

# each pair of parentheses captures part of the match; here, the text on
# either side of the space
m = re.search("(.+) (.+)", scientific_name)
if m is not None:
    # groups() returns the captured pieces in order: (group 1, group 2)
    genus, species = m.groups()
    print(f"genus is {genus}, species is {species}")

genus is Homo, species is sapiens


In [69]:
# group(1) is the text captured by the first pair of parentheses
first_group = m.group(1)
print(first_group)

# group(0), the same as group(), is the whole match; [1] picks its second character
whole_match = m.group(0)
print(whole_match[1])

# group(2) is the second capture; [0] picks its first character
second_group = m.group(2)
print(second_group[0])

Homo
o
s


In [70]:
dna = "CGATNCGGAACGATC"
m = re.search(r"[^ATGC]", dna)
if m is not None:
    print("ambiguous base found!")
    # start() is the 0-based index at which the match begins
    print(f"at position {m.start()}")

ambiguous base found!
at position 4


In [72]:
# another way to find multiple matches:
# re.finditer() yields one match object per non-overlapping match, so every
# ambiguous base is reported, not just the first
dna = "CGCTCNTAGATGCGRATGACTGCAYTGC"
matches = re.finditer(r"[^ATGC]", dna)
for m in matches:
    base, pos = m.group(), m.start()
    print(f"{base} found at position {pos}")

N found at position 5
R found at position 14
Y found at position 24


In [73]:
## Getting multiple matches as strings

In [74]:
dna = "CTGCATTATATCGTACGAAATTATACGCGCG!"

# [AT]{6,} matches runs of at least six characters that are each A or T;
# collect the matched text of every run into a list
matches = re.finditer(r"[AT]{6,}", dna)
result = [hit.group() for hit in matches]
print(result)

['ATTATAT', 'AAATTATA']


In [76]:
# re.findall() returns the matched strings directly as a list, so no
# loop (or empty starting list) is needed.
# dna here is the sequence assigned in the previous cell.
result = re.findall(r"[AT]{6,}", dna)
print(result)

['ATTATAT', 'AAATTATA']


In [77]:
## Splitting a string using regex

In [78]:
dna = "ACTNGCATRGCTACGYACGATSCGAWTCG"

# split() cuts the string at every match of the pattern and returns the
# pieces in between, i.e. the runs of unambiguous bases
ambiguous_base = re.compile(r"[^ATGC]")
runs = ambiguous_base.split(dna)
print(runs)

['ACT', 'GCAT', 'GCTACG', 'ACGAT', 'CGA', 'TCG']
