In [1]:
# module documentation:
# https://docs.python.org/3/library/re.html
import re

In [2]:
# Sample DNA sequence used as the search target by the regular-expression examples
dna = "AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA"

In [4]:
# "." stands for any single character, so "AA.T" means "AA", one arbitrary
# character, then "T". re.search scans the string and returns the first
# match object, or None when nothing matches.
matches = re.search("AA.T", dna)

In [8]:
# Show where the match sits and what it covered, one value per line:
# start index, end index, the (start, end) pair, and group 0, which is
# the full text that was matched.
print(
    matches.start(),
    matches.end(),
    matches.span(),
    matches.group(0),
    sep="\n",
)

0
4
(0, 4)
AAGT


In [9]:
# ".*" is greedy: it consumes as many characters as it can while still
# letting the trailing "TGT" match.
matches = re.search("AA.*TGT", dna)
print(matches.group())  # group() with no argument is the whole match

AAGTGGTGT


In [10]:
dna = "AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA"
# "G*" means zero or more G's, so "GG*A" only needs a single G before the A
matches = re.search("GG*A", dna)
print(matches.group())  # group() with no argument is the whole match

GA


In [13]:
dna = "AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA"
# "G+" means one or more G's, so "GG+A" needs at least "GGA" in the sequence
matches = re.search("GG+A", dna)
# re.search returns None when the pattern is absent, so guard before using it
print("No match found" if matches is None else matches.group(0))

No match found


In [15]:
dna = "AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA"
# "C+" means one or more C's: a C, then at least one more C, then a G
matches = re.search("CC+G", dna)
# re.search returns None when the pattern is absent, so guard before using it
print("No match found" if matches is None else matches.group(0))

CCCG


In [22]:
number_str = "one eight one one three two one seven six"
# The parentheses make "+" apply to the whole group "one ", so the pattern is
# "one ", then one or more repeats of "one ", then any four characters.
matches = re.search("one (one )+....", number_str)
if matches is None:
    print("No match found")
else:
    print(matches.group(0))

one one thre


In [23]:
number_str = "one eight one one three two one seven six"
# Without parentheses "+" applies only to the space before it, so the pattern
# is "one one", one or more spaces, then any four characters.
matches = re.search("one one +....", number_str)
if matches is None:
    print("No match found")
else:
    print(matches.group(0))

one one thre


In [26]:
# test your regular expressions at
# https://www.debuggex.com/
# remember to set to Python mode
def repeat_string(repeat, num_repeats):
    """Return `repeat` concatenated with itself `num_repeats` times.

    Parameters:
        repeat: the string to repeat.
        num_repeats: how many copies to join; zero or a negative count
            yields the empty string.

    Returns:
        The repeated string, e.g. repeat_string('AT', 3) == 'ATATAT'.
    """
    # Sequence repetition builds the result in one step instead of growing a
    # string one concatenation at a time inside a loop.
    return repeat * num_repeats
# Assemble a test sequence: fixed prefix, five copies of 'AT', fixed suffix
dna = ''.join([
    'AAGTGGTGTGAATT',
    repeat_string('AT', 5),
    'CCATCGAGTCTTTTGAACGCAA',
])
print(dna)

AAGTGGTGTGAATTATATATATATCCATCGAGTCTTTTGAACGCAA


In [31]:
# "AT" followed by one or more further "AT" groups, i.e. at least two in a row
matches = re.search("AT(AT)+", dna)
if matches is None:
    print("No match")
else:
    # show the matched text together with its (start, end) position
    print(matches.group(0), matches.span())

ATATATATAT (14, 24)


In [33]:
# "{3}" repeats the preceding group exactly three times, so the match stops
# after three "AT"s even when more follow in the sequence
matches = re.search("(AT){3}", dna)
if matches is None:
    print("No match")
else:
    # show the matched text together with its (start, end) position
    print(matches.group(0), matches.span())

ATATAT (14, 20)


In [37]:
def is_vowel(char):
    """Return True when `char` is a single vowel (a, e, i, o, u), ignoring case.

    Parameters:
        char: the string to test; expected to be one character long.

    Returns:
        True if the lower-cased `char` is exactly one of "aeiou", else False.
        The empty string and strings longer than one character give False.
    """
    # A character class such as [aeiou] has width 1: it matches exactly one
    # character, and that character may be any of those between the brackets.
    # fullmatch requires the whole string to be that one character, so a
    # longer string that merely contains a vowel is not reported as a vowel.
    return re.fullmatch("[aeiou]", char.lower()) is not None
# Try one vowel and one consonant
for letter in ("a", "t"):
    print(letter, is_vowel(letter))

a True
t False


In [39]:
def is_consonant(char):
    """Return True when `char` is a single ASCII letter that is not a vowel.

    Parameters:
        char: the string to test; expected to be one character long.

    Returns:
        True if the lower-cased `char` is exactly one ASCII letter other than
        a, e, i, o, u; otherwise False. Digits, whitespace, punctuation, the
        empty string and multi-character strings all give False.
    """
    # A ^ at the start of a character class reverses its meaning: the class
    # (still width 1) matches any one character that is NOT listed between
    # the brackets. Excluding only the vowels would let digits, spaces and
    # punctuation count as consonants, so the class also excludes non-word
    # characters (\W), digits (\d) and the underscore. re.ASCII restricts
    # \W and \d to their ASCII meaning, and fullmatch requires the whole
    # string to be that single character.
    return re.fullmatch(r"[^aeiou\W\d_]", char.lower(), flags=re.ASCII) is not None
# Try one vowel and one consonant
for letter in ("a", "t"):
    print(letter, is_consonant(letter))

a False
t True


In [41]:
header = ">gi|2765656|emb|Z78531.1|CFZ78531 C.fasciculatum 5.8S rRNA gene and ITS1 and ITS2 DNA"
# Three capture groups: two space-free fields ("[^ ]+" is one or more
# characters that are not a space), then the remainder of the line.
pattern = ">([^ ]+) ([^ ]+) (.*)"
# re.match only tries the pattern at the start of the string
matches = re.match(pattern, header)
if matches is not None:
    # groups() returns the captured groups 1..3 in order; print one per line
    print(*matches.groups(), sep="\n")

gi|2765656|emb|Z78531.1|CFZ78531
C.fasciculatum
5.8S rRNA gene and ITS1 and ITS2 DNA


In [42]:
mystring = "one three one four one five"
# "^" anchors a pattern to the start of the string
pattern1 = "^one three"
pattern2 = "^one four"
# Try each anchored pattern in turn and report the ones that match
for regex, message in ((pattern1, "pattern1 matches"), (pattern2, "pattern2 matches")):
    matches = re.search(regex, mystring)
    if matches is not None:
        print(message)

pattern1 matches


In [43]:
mystring = "one three one four one five"
# "$" anchors a pattern to the end of the string
pattern1 = "one three$"
pattern2 = "one five$"
# Try each anchored pattern in turn and report the ones that match
for regex, message in ((pattern1, "pattern1 matches"), (pattern2, "pattern2 matches")):
    matches = re.search(regex, mystring)
    if matches is not None:
        print(message)

pattern2 matches


In [45]:
mystring = "The moon jumped over the cow"
# A character class covering all ASCII letters, upper and lower case;
# "+" then takes one or more of them, i.e. the first run of letters.
pattern1 = "[A-Za-z]+"
matches = re.search(pattern1, mystring)
print(matches.group())  # group() with no argument is the whole match

The


In [46]:
mystring = "The moon jumped over the cow"
# \w is the predefined "word character" class. The pattern is written as a
# raw string (r"...") so the backslash reaches the regex engine unchanged;
# in a normal string "\w" is an invalid escape sequence and triggers a warning.
pattern1 = r"\w+"
matches = re.search(pattern1, mystring)
print(matches.group(0))

The


In [47]:
mystring = "The cow had 99 problems to solve"
# \d is the predefined digit class. The pattern is written as a raw string
# (r"...") so the backslash reaches the regex engine unchanged; in a normal
# string "\d" is an invalid escape sequence and triggers a warning.
pattern1 = r"\d+"
matches = re.search(pattern1, mystring)
print(matches.group(0))

99
