 # <center> Regular Expressions </center>

In [2]:
import re

## Major Functions
- search(pattern, string, flags) : Returns the first match of the found object
- findall(pattern, string, flags) : Returns a list of ALL found objects
- finditer(pattern, string, flags) : Returns an iterator of found objects
- sub(pattern, replacement, string, count, flags) : Substitutes the pattern with the replacement in the string

<b> flags are optional!</b>

## Common Flags
- re.I : Ignore the case while matching
- re.M : Multiline. Include the \n denoting new line at the end of each line while matching for the end of the line

## Examples

In [11]:
print(re.search(r"ape", "The ape was at the apex")) # Only finds the first occurence
print(re.search(r"ape", "The ape was at the apex").span(0))
print(re.search(r"ape", "The ape was at the apex").group(0))

<_sre.SRE_Match object; span=(4, 7), match='ape'>
(4, 7)
ape


In [12]:
# findall() returns a list of matches
allApes = re.findall(r"ape", "The ape was at the apex")
allApes

['ape', 'ape']

In [14]:
# finditer returns an iterator of matching objects
# You can use span to get the location
 
s = "The ape was at the apex"
 
for i in re.finditer(r"ape", s):
 
    # Span returns a tuple
    locTuple = i.span()
 
    print(locTuple)
 
    # Slice the match out using the tuple values
    print(s[locTuple[0]:locTuple[1]])

(4, 7)
ape
(19, 22)
ape


## The . (dot/period) Operator

- <b> Matches one single character </b>

In [19]:
print(re.findall(r"ape.", "The ape was at the apex of all the apesss"))

['ape ', 'apex', 'apes']


## Character Classes
- <b> Match 1 of Several Letters</b>

In [20]:
# Denoted by []
print(re.findall(r"[crmfp]at", "Cat rat mat fat pat"))

['rat', 'mat', 'fat', 'pat']


In [21]:
# Range of characters : denoted by hyphen '-'
print(re.findall("[c-mC-M]at", "Cat rat mat fat pat"))

['Cat', 'mat', 'fat']


## The ^ operator

- Has 2 meanings based on usage

### Inside the character class

In [4]:
# Matches everything EXCEPT the characters
print(re.findall(r"[^cmp]at", "Cat rat mat fat pat"))

['Cat', 'rat', 'fat']


### Outside the character class

In [8]:
# Matches the start of a string
s1 = 'String of characters'
s2 = 'This is a String of characters'
print(re.findall(r"^String", s1))
print(re.findall(r"^String", s2))

['String']
[]


## The $ operator
- Similar to the ^ operator, matches the END of a string

In [15]:
s1 = 'The end is at the end'
s2 = 'The end is in the middle'
print(re.findall(r"end$", s1))
print(re.findall(r"end$", s2))

['end']
[]


## The compile() function
- Create and store a pattern as an object

In [20]:
# Useful for long patterns
s = "Cat rat mat fat pat"
pat = re.compile(r'.at')
print(re.findall(pat,s))
print(pat.findall(s))

['Cat', 'rat', 'mat', 'fat', 'pat']
['Cat', 'rat', 'mat', 'fat', 'pat']


## To match a special character
- Escape it using the \

In [4]:
s = "Here is \\stuff"
 
# This won't find it
print("Find \\stuff : ", re.search("\\stuff", s))
 
# This will
print("Find \\stuff : ", re.search("\\\\stuff", s))

Find \stuff :  None
Find \stuff :  <_sre.SRE_Match object; span=(8, 14), match='\\stuff'>


### Alternatively

In [5]:
# This will
print(r"Find \\stuff : ", re.search(r"\\stuff", s))

Find \\stuff :  <_sre.SRE_Match object; span=(8, 14), match='\\stuff'>


### Other special characters
- '.'
- '+'
- '?'
- '\n' '\t', etc
- '\d' or '\D'
- '\s' or '\S'
- And more

## Matching whitespace
- <b> '\s' : Match a space
- <b> '\S' : Match anything except a space

In [12]:
s = """This is a long
string that goes
on for many lines"""
 
print(s)
 
# Remove newlines
regex = re.compile("\s")
 
s = regex.sub("(space)", s)
 
print(s)

This is a long
string that goes
on for many lines
This(space)is(space)a(space)long(space)string(space)that(space)goes(space)on(space)for(space)many(space)lines


## Matching Any Single Number
- <b> '\d' can be used instead of [0-9]
- <b> '\D' is the same as [^0-9]

In [15]:
s = "12345"
print("Matches :", re.findall("\d", s))

Matches : ['1', '2', '3', '4', '5']


In [16]:
s = "a1b3f4"
print("Matches :", re.findall("\D", s))

Matches : ['a', 'b', 'f']


## Matching Any Single Letter, Number or Underscore
- <b> '\w' can be used instead of [0-9a-zA-Z_]
- <b> '\W' is the same as [^0-9a-zA-Z_]

In [20]:
phNum = "412-555-12125"
 
# Check if it is a phone number
if re.search("^\w\w\w-\w\w\w-\w\w\w\w$", phNum):
    print("It is a phone number")
else:
    print("Not a valid number")

Not a valid number


### Alternatively
- Use the { }
- Put the EXPECTED length of the word/number in the brackets 
- Quantifier

In [24]:
phNum = "412-555-1214"
 
# Check if it is a phone number
if re.search("^\w{3}-\w{3}-\w{4}$", phNum):
    print("It is a phone number")
else:
    print("Not a valid number")

It is a phone number


In [26]:
# Ranges of word length - Use {lower bound, upper bound}
# Check for valid first and last name with a space between 2 and 20 characters
if re.search("\w{2,20}\s\w{2,20}", "Natarajan Mahalingam"):
    print("It is a valid full name")

It is a valid full name


## Other Quantifiers
- '+' : Match ONE or more of the preceding expression
- '*' : Match ZERO or more of the preceding expression

In [92]:
print("Matches :", re.findall("ca+t", " ct cat caaat caaaaaaaat"))

Matches : ['cat', 'caaat', 'caaaaaaaat']


In [93]:
print("Matches :", re.findall("ca*t", "ct cat caaat caaaaaaaat"))

Matches : ['ct', 'cat', 'caaat', 'caaaaaaaat']


## The Optional modifier
- '?' : makes the previous expression optional

In [115]:
print("Matches :", re.findall("ca*ts?", "ct cat caaat caaaaaaaat cats cato"))

Matches : ['ct', 'cat', 'caaat', 'caaaaaaaat', 'cats', 'cat']


## Greedy & Lazy Matching
- Use the ? operator again

In [166]:
# Greedy matching - get the largest possible match
randStr = """<name>Nate</name><name>Sam</name><name>Morgan</name><name>Joyce</name><name>Monica</name>"""
regex = re.compile(r"<name>.*</name>")
matches = re.findall(regex, randStr)
print("Matches :", matches)

Matches : ['<name>Nate</name><name>Sam</name><name>Morgan</name><name>Joyce</name><name>Monica</name>']


In [163]:
# Lazy matching - get the smallest possible match
randStr = """<name>Nate</name><name>Sam</name><name>Morgan</name><name>Joyce</name><name>Monica</name>"""
regex = re.compile(r"<name>.*?</name>")
matches = re.findall(regex, randStr)
print("Matches :", matches)

Matches : ['<name>Nate</name>', '<name>Sam</name>', '<name>Morgan</name>', '<name>Joyce</name>', '<name>Monica</name>']


## Word Boundaries
- Use to match whole words
- \b matches beginning and end of a word
- \B matches everything EXCEPT beginning and end of word

In [149]:
s = """foo foo. (foo) foobar foo123 fooxyz"""
pat = re.compile(r'\bfoo\b')
matches = re.findall(pat, s)
print("Matches with word boundaries:", matches)

Matches with word boundaries: ['foo', 'foo', 'foo']


In [153]:
s1 = """foo foo. (foo) foobar foo123 fooxyz abcfooxyz"""
pat = re.compile(r'foo\B')
matches = re.findall(pat, s1)
print("Matches without word boundaries:", matches)

Matches without word boundaries: ['foo', 'foo', 'foo', 'foo']


## Subexpressions
- If you want to match for a large block, but only want to return part of it
- Splits the match based on ( )

In [160]:
randStr = "My number is 412-555-1212" 
regex = re.compile(r"412-(.*)") 
matches = re.findall(regex, randStr) 
print("Matches :", matches)

Matches : ['555-1212']


In [169]:
randStr = "My number is 412-555-1212"
regex = re.compile(r"412-(.*)-(.*)")
matches = re.findall(regex, randStr) # Creates a list of subexpression tuples
print(matches)  
print('First 3 digits:',matches[0][0])
print('Last 4 digits:',matches[0][1])

[('555', '1212')]
First 3 digits: 555
Last 4 digits: 1212


## Backreferences
- A back reference allows you to to reuse the expression that proceeds it

In [13]:
s = "<a href='#'><b>Link 1</b></a>"
 
# Regex matches bold tags and grabs the text between them to be used by the back reference
pat = re.compile(r"<b>(.*?)</b>")
 
# Replace the tags with just the text between them
s = re.sub(pat, r"\1", s)
 
print(s)
 
# Another Back Reference example
 
# Receive this string
phnum = "412-555-1212"
 
# Match the phone number using multiple subexpressions
pat = re.compile(r"([\d]{3})-([\d]{3}-[\d]{4})")
 
# Output (412)555-1212
s = re.sub(pat, r"(\1)\2", phnum)
 
print(s)

<a href='#'>Link 1</a>
(412)555-1212


# Named Groups
- Refer to a specific group by name
- Syntax : (?P< Name >expression)

In [10]:
randStr = "Februray 8, 2018"
 
regex = r"^(?P<month>\w+)\s(?P<day>\d+),\s(?P<year>\d+)"
 
matches = re.search(regex, randStr)
 
print("Month :", matches.group('month'))
print("Day :", matches.group('day'))
print("Year :", matches.group('year'))

Month : Februray
Day : 8
Year : 2018


# OR conditional
- Use the | (pipe) operator
- Checks for multiple conditions

In [7]:
randStr = "1. Dog 2. Cat 3. Turtle"
regex = re.compile(r"\d\.\s(Dog|Cat)") 
matches = re.findall(regex, randStr)
print(matches)

['Dog', 'Cat']


# Problems

### 1. Create a regex that will match for 5 digit zip codes or zip codes with 5 digits a dash and then 4 digits

In [14]:
# Answer
randStr = "12345 12345-1234 1234 12346-333"
regex = re.compile(r"(\d{5}-\d{4}|\d{5}\s)")
matches = re.findall(regex, randStr)
print(matches)

['12345 ', '12345-1234']


### 2. Find all of the following email addresses
 
- d+b@aol.com 
- a_1@yahoo.co.uk 
- A-100@m-b.INTERNATIONAL"

In [15]:
# Answer
randStr = "d+b@aol.com a_1@yahoo.co.uk A-100@m-b.INTERNATIONAL"
regex = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
matches = re.findall(regex, randStr)
print(matches) 

['d+b@aol.com', 'a_1@yahoo.co.uk', 'A-100@m-b.INTERNATIONAL']


### 3. Create HTML tags using backreference substitutions
- Receive a string like this "https://www.youtube.com"
- Convert to < a href='https://www.youtube.com'>www.youtube.com< /a>

In [19]:
# Answer
randStr = "https://www.youtube.com"
regex = re.compile(r"(https?://([\w.]+))")
randStr = re.sub(regex, r"<a href='\1'>\2</a>\n", randStr)
print(randStr)

<a href='https://www.youtube.com'>www.youtube.com</a>



### 4. Create a regex to match phone numbers

In [16]:
# Answer
randStr = """14125551212 4125551212 (412)5551212 412 555 1212 412-555-1212 1-412-555-1212 1 412 555 1212
             +919500632152"""
 
regex = re.compile(r"((\+\d{1,2})?((1?)(-|\s?)(\()?(\d{3})(\)|-|\s|\)-|\)\s)?(\d{3})(-|\s)?(\d{4}|\d{4})))")
 
matches = re.findall(regex, randStr)
matches 
for i in matches:
    print(i[0].lstrip())

14125551212
4125551212
(412)5551212
412 555 1212
412-555-1212
1-412-555-1212
1 412 555 1212
+919500632152


# Final tips
- 1) Do not try to do everything in one long regex
- 2) Get a regular expression tool
- 3) Don't reiinvent the wheel

# Resources

- regexr.com : my go to website for testing regex
- debuggex.com : useful for visualising the structure of regex
- Notepad++ : has a good regex engine
- rexegg.com : very comprehensive look at regex
- regular-expressions.info : More information on regex