# Regular Expressions

Regular expressions are a technique for finding patterns in text, widely used in text mining.

In [2]:
# import code packages 
import re

In [4]:
# Load the file: collect every line (trailing newline included) into a list
file_name = "manyStrings.txt"

# The context manager closes the file even if reading raises an error,
# so no explicit close() call is needed
with open(file_name) as file_handler:
    manyStrings = list(file_handler)  # iterating a file object yields its lines

print(manyStrings)

['qwer\n', 'tyui\n', 'poas\n', 'fghj\n', 'kl;\n', 'zxcv\n', 'bnm,\n', '1234\n', '4567\n', '7891\n', '555-79-8765\n', '555-75-7124\n', 'ATCGCGAATTCAC\n', 'CGATCGATGCTAGCTAGCTACGATCGATGCAGGTCCTCGATCATCGATCGATCG\n', 'CGATCGATCGAGGACCTCGATCGATCGATCGATCGATCGATGGTCCCGATCGATCGATCGA\n', 'GCATACGATCGATGCGGCCGATCATCGATCGATCGACTAGCTAGCTAGCTAGCTACGATGC\n', 'TGCATAGCTACGATCGATCGCCGCGATCGATCGATCGTAGCATCGATCGATCGATCGATCGATCG\n', 'ekrjg349\n', 'rer9348\n', 'ritj4598\n', 'wer5498\n']


In [11]:
# Searching

# Keep only the strings that contain the motif GATC anywhere in them
regex = re.compile(r"GATC")
results = list(filter(regex.search, manyStrings))
# print(results)  # only the strings in which the pattern was found

# AvaII restriction enzyme cuts at two different motifs: GGACC or GGTCC
# Raw string (r"...") instead of an f-string: nothing is interpolated here, and
# inside an f-string regex braces such as {3} would be read as replacement fields
regex = re.compile(r"GG(A|T)CC")  # the vertical bar means "A" or "T"
results = list(filter(regex.search, manyStrings))
# print(results)

# BisI restriction enzyme cuts at 4 different motifs:
# GCAGC, GCTGC, GCGGC, GCCGC
regex = re.compile(r"GC[ATGC]GC")  # [ATGC] matches any single one of the four bases
results = list(filter(regex.search, manyStrings))
print(results)  # the strings from the file that contain one of the BisI motifs

['GCATACGATCGATGCGGCCGATCATCGATCGATCGACTAGCTAGCTAGCTAGCTACGATGC\n', 'TGCATAGCTACGATCGATCGCCGCGATCGATCGATCGTAGCATCGATCGATCGATCGATCGATCG\n']


In [16]:
# Anchors

# "^" pins the pattern to the start: strings whose first character is a lowercase letter
regex = re.compile(r"^[a-z]")
results = [text for text in manyStrings if regex.search(text)]
# print(results)  # strings that begin with a letter a-z

# "$" pins the pattern to the end: strings whose last character is a digit
# ("$" also matches just before a trailing newline, so "1234\n" still qualifies)
regex = re.compile(r"[0-9]$")
results = [text for text in manyStrings if regex.search(text)]
# print(results)

# Gene headers: one or more (+) letters from the start (^),
# followed by one or more (+) digits up to the end ($)
regex = re.compile(r"^[a-z]+[0-9]+$")
results = [text for text in manyStrings if regex.search(text)]
print(results)

['ekrjg349\n', 'rer9348\n', 'ritj4598\n', 'wer5498\n']


In [17]:
# Quantifiers

# Headers made of exactly three letters followed by digits:
# {3} after a character class demands exactly three repetitions of it
regex = re.compile(r"^[a-z]{3}[0-9]+$")
results = [text for text in manyStrings if regex.search(text)]
print(results)  # headers with exactly 3 leading letters and one or more trailing digits

['rer9348\n', 'wer5498\n']


In [19]:
def extractDNA(s):
    """Extract the strings that qualify as DNA sequences.

    A string qualifies when it consists solely of the nucleotide letters
    A, T, G and C (upper or lower case), optionally followed by a single
    trailing newline.

    Input: s (list of strings)
    Output: dna_strings (list of strings, trailing whitespace removed)
    """
    # pattern: starts with, ends with, and has only nucleotides in between
    regex = re.compile(r"^[atgcATGC]+$")

    # Filter the argument itself rather than a notebook global, and strip
    # the newline characters "\n" from the strings that are kept
    dna_strings = [line.rstrip() for line in s if regex.search(line)]

    return dna_strings

# Run the extraction on the lines loaded from the file; as the last expression
# in the cell, the returned list is displayed as the cell's output
extractDNA(manyStrings)

['ATCGCGAATTCAC',
 'CGATCGATGCTAGCTAGCTACGATCGATGCAGGTCCTCGATCATCGATCGATCG',
 'CGATCGATCGAGGACCTCGATCGATCGATCGATCGATCGATGGTCCCGATCGATCGATCGA',
 'GCATACGATCGATGCGGCCGATCATCGATCGATCGACTAGCTAGCTAGCTAGCTACGATGC',
 'TGCATAGCTACGATCGATCGCCGCGATCGATCGATCGTAGCATCGATCGATCGATCGATCGATCG']

In [4]:
# Positions
dna = "CGATCGATCGAGGACCTCGATCGATCGATCGATCGATCGATGGTCCCGATCGATCGATCGAGGACC"

# AvaII motif: GGACC or GGTCC.
# Inside a character class "|" is a literal character, not "or", so [A|T]
# would also accept the text "GG|CC"; [AT] matches exactly A or T.
avaii_site = re.compile(r"GG[AT]CC")

# search() returns a match object for the first (leftmost) occurrence
cut_site = avaii_site.search(dna)
print(cut_site.start())  # index of the first character of the match
print(cut_site.end())    # index one past the last character of the match
print(cut_site)

11
16
<_sre.SRE_Match object; span=(11, 16), match='GGACC'>


In [28]:
# Extractions

# Bounds of the motif located by the search in the "Positions" step
site_start, site_end = cut_site.span()

# The matched motif itself
print(dna[site_start:site_end])

# AvaII: G*GACC or G*GTCC  (NOTE: "*" presumably marks where the enzyme cuts)
# before cut: everything up to and including the first base of the motif
print(dna[:site_start + 1])
# after cut: from the last four bases of the motif to the end of the sequence
print(dna[site_end - 4:])

GGACC
CGATCGATCGAG
GACCTCGATCGATCGATCGATCGATCGATGGTCCCGATCGATCGATCGA


In [5]:
# Multiple matches

# Isoleucine codon: ATC
# findall() returns every non-overlapping occurrence of the pattern as a list
isoleucine_codon = re.compile(r"ATC")
results = isoleucine_codon.findall(dna)

# One occurrence per output line
for result in results:
    print(result)

ATC
ATC
ATC
ATC
ATC
ATC
ATC
ATC
ATC
ATC
