# Regular Expressions

In [1]:
#(Regular expressions are used for specifying text patterns)
import re  #import regular expressions

In [26]:
# Sample text used to check whether phone numbers exist in a block of text.

# block of text
message = 'call me 415-533-5353 tomorrow, or at 412-555-4332'

In [27]:
# \d is the regex shorthand for a single numeric digit, so the pattern below
# spells out a phone number of the form ddd-ddd-dddd.
phoneNumber = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# search() stops at the first phone number and returns a Match object;
# findall() collects every occurrence of the pattern as a list of strings.
num = phoneNumber.search(message)
numAll = phoneNumber.findall(message)

# Show the Match object first, then the text it matched (group 0 is the whole
# match) followed by the list returned by findall().
print(num)
print(num.group(0), numAll, sep='\n')

<re.Match object; span=(8, 20), match='415-533-5353'>
415-533-5353
['415-533-5353', '412-555-4332']


In [30]:
# Parentheses in a pattern mark groups that can be pulled out of a match.
# Here group 1 is the area code (the first 3 digits) and group 2 is the rest
# of the phone number.

phoneGroup = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')

num2 = phoneGroup.search(message)
print(num2.group())   # the whole match; same as num2.group(0)
print(num2.group(1))  # the area code only
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but its value is never displayed.
print(num2.group(2))  # the remainder of the number
print('='*50)

# findall() on a pattern with groups returns a list of tuples, one tuple per
# match, each holding the text of the groups as strings.
num3 = phoneGroup.findall(message)
print(num3)


415-533-5353
415
[('415', '533-5353'), ('412', '555-4332')]


In [24]:
# The pipe (|) inside a group lists alternatives: "Bat" followed by any one of
# the suffixes man, mobile, copter or bat.
BatRegex = re.compile(r'Bat(man|mobile|copter|bat)')

# search() returns the first match; findall() returns, for every match, only
# the text captured by the group (the suffix), not the whole word.
mo = BatRegex.search('Batmobile lost a wheel by Batman')
mo1 = BatRegex.findall('Batmobile lost a wheel by Batman')

# First matched word, a divider line, then the list of captured suffixes.
print(mo.group(0), '='*50, mo1, sep='\n')

Batmobile
['mobile', 'man']


In [23]:
# '?' makes the preceding group optional: it may appear zero times or once

BatRegex = re.compile(r'Bat(wo)?man')

mo = BatRegex.search('The Adventures of Batman')  # "wo" absent: still matches
print(mo.group())
print('='*50)

mo = BatRegex.search('The Adventures of Batwoman')  # "wo" present once: matches
print(mo.group())
print('='*50)

mo = BatRegex.search('The Adventures of Batwowowoman')
# search() returns None here because "wo" appears more than once.
# None is a singleton, so it is tested with "is" rather than "==".
mo is None

Batman
Batwoman


True

In [11]:
# Using ? to accept phone numbers that may or may not have an area code in
# front of them: the leading (ddd-) group is optional.

phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')

# Search each sample in turn and print the resulting Match object; after the
# loop `phone` holds the result for the last sample.
for phone in map(phoneRegex.search, (
        'My phone number is 415-333-1355. Call me tomorrow',  # with the area code
        'My phone number is 333-1355. Call me tomorrow',      # without the area code
)):
    print(phone)

<re.Match object; span=(19, 31), match='415-333-1355'>
<re.Match object; span=(19, 27), match='333-1355'>

In [2]:
# To match characters that have a special meaning in a regex (such as + * ?)
# literally, we put a backslash \ in front of each one

regex = re.compile(r'\+\*\?')
regex.search('I learned about +*? in class today')

<re.Match object; span=(16, 19), match='+*?'>

In [4]:
# An unescaped + after an item means "appear one or more times".
# NOTE: the pattern below does not use that quantifier - \+ is an escaped,
# literal plus sign - so findall() simply returns each separate occurrence of
# the literal text +*?

regex = re.compile(r'\+\*\?')
regex.findall('I learned about +*?+*?+*?+*? in class today')

['+*?', '+*?', '+*?', '+*?']

In [22]:
# {n} after a group matches that group repeated n times in a row

haRegex = re.compile(r'(Ha){4}')  # "Ha" four times in a row

ha = haRegex.search('He said HaHaHa')  # Ha occurs only 3 times, so no match
# search() returns None when nothing matches; None is a singleton, so it is
# tested with "is" rather than "==".
print(ha is None)
print('='*50)

ha1 = haRegex.search('He said HaHaHaHa')  # Ha occurs 4 times: matches
print(ha1)

True
<re.Match object; span=(8, 16), match='HaHaHaHa'>


In [13]:
# Another example - three phone numbers in a row. The outer group (an optional
# area code, the number itself and an optional trailing comma) must repeat
# 3 times.
phoneRegex = re.compile(r'((\d\d\d-)?\d\d\d-\d\d\d\d(,)?){3}') # the comma might or might not be there

phoneRegex.search('My phone numbers are 222-787-8999,555-788-3324,444-785-5529')

<re.Match object; span=(21, 59), match='222-787-8999,555-788-3324,444-785-5529'>

In [21]:
# {2,5} matches the preceding group repeated between 2 and 5 times.
haRegex = re.compile(r'(Ha){2,5}')

# Run every sample through the pattern first ...
ha = haRegex.search('He said HaHaHa')             # Ha occurs 3 times
ha1 = haRegex.search('He said HaHaHaHa')          # Ha occurs 4 times
ha2 = haRegex.search('He said Ha')                # only once: it won't match
ha3 = haRegex.search('He said HaHaHaHaHaHaHaHa')  # it matches only the first 5

# ... then print the four results with a divider line between each of them.
print(ha, ha1, ha2, ha3, sep='\n' + '='*50 + '\n')

<re.Match object; span=(8, 14), match='HaHaHa'>
<re.Match object; span=(8, 16), match='HaHaHaHa'>
None
<re.Match object; span=(8, 18), match='HaHaHaHaHa'>


## Shorthand Codes for common character classes

\d      any numeric digit from 0 to 9
\D      any character that is NOT a numeric digit from 0 to 9

\w      any letter, numeric digit, or the underscore character (think of this as matching "word" characters)

\W      any character that is NOT a letter, numeric digit, or the underscore character

\s      any space, tab, or newline character (think of this as matching "space" characters)

\S      any character that is NOT a space, tab, or newline character

In [31]:
# 12 Days of Christmas

# The lyrics are one long string, written as adjacent literals so that each
# source line stays short.
lyrics = (
    '12 drummers drumming, 11 players piping, 10 lords a leaping, '
    '9 ladies dancing, 8 maids a milking, 7 swans a swimming, '
    '6 geese a laying, 5 golden rings, 4 calling birds, 3 french hens, '
    '2 turtle doves, and 1 patridge in a pear tree'
)

# One or more digits, a single whitespace character, then one or more word
# characters - i.e. each count together with the word that follows it.
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall(lyrics)

['12 drummers',
 '11 players',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 golden',
 '4 calling',
 '3 french',
 '2 turtle',
 '1 patridge']

In [34]:
# Creating your own character class with square brackets, e.g. VOWELS:
# [aeiouAEIOU] matches any single lowercase or uppercase vowel

vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('Taiwo is a billionaire')

['a', 'i', 'o', 'i', 'a', 'i', 'i', 'o', 'a', 'i', 'e']

In [35]:
vowelRegex = re.compile(r'[aeiouAEIOU]{2}') # looking for two vowels in a row
vowelRegex.findall('Taiwo is a billionaire')

['ai', 'io', 'ai']

In [38]:
# Negative character classes: a ^ right after the opening bracket inverts the class
oppVowelRegex = re.compile(r'[^aeiouAEIOU]') # find all non-vowel characters
oppVowelRegex.findall('Taiwo is a billionaire') # note that this also matches spaces, digits and punctuation, not only consonants

['T', 'w', ' ', 's', ' ', ' ', 'b', 'l', 'l', 'n', 'r']

In [39]:
# A ^ at the start of a pattern anchors it to the beginning of the text, so
# this only matches text that starts with Hello.
beginsWithRegex = re.compile(r'^Hello')

mo = beginsWithRegex.search('Hello there!')  # begins with Hello: matches
mo1 = beginsWithRegex.search('Heyy, Hello')  # Hello is not at the start: None

# Both results, separated by a divider line.
print(mo, '='*50, mo1, sep='\n')

<re.Match object; span=(0, 5), match='Hello'>
None


In [41]:
# A $ at the end of a pattern anchors it to the end of the text, so this only
# matches text that ends with Hello.
endsWithRegex = re.compile(r'Hello$')

mo = endsWithRegex.search('Hello there!')  # Hello is not at the end: None
mo1 = endsWithRegex.search('Heyy, Hello')  # ends with Hello: matches

# Both results, separated by a divider line.
print(mo, '='*50, mo1, sep='\n')

None
<re.Match object; span=(6, 11), match='Hello'>


In [49]:
# Using both ^ and $ means the pattern has to account for the entire text

allDigits = re.compile(r'^\d+$')  # the + shows one or more digits
mo = allDigits.search('234568763445732135')
print(mo)
print('='*50)

# Although this text starts and ends with one or more digits, the x in the
# middle prevents a match, so search() returns None (not False).
mo1 = allDigits.search('243523x95402')
# None is a singleton, so it is tested with "is" rather than "==".
mo1 is None
# This is because, with both ^ and $, everything between the start and the end
# has to match the pattern.

<re.Match object; span=(0, 18), match='234568763445732135'>


True

In [60]:
# The dot (.) matches any single character except a newline
dotRegex = re.compile(r'.at') # any one character followed by "at"

at = dotRegex.findall('The cat in the hat sat on the flat mat') # "flat" comes back as "lat" because the dot takes only one character before "at"
print(at)


# We can pass re.DOTALL as the second argument to re.compile() to make the dot match newlines as well

['cat', 'hat', 'sat', 'lat', 'mat']


In [53]:
dotRegex = re.compile(r'.{1,2}at') # one or two characters followed by "at"

at = dotRegex.findall('The cat in the hat sat on the flat mat') # "flat" is now matched in full, while the three-letter words pick up the space in front of them
print(at)

[' cat', ' hat', ' sat', 'flat', ' mat']


In [54]:
# .* matches zero or more of any character except a newline, i.e. "anything"

name = 'First Name: Taiwo Last Name: Odetola'
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)') # .* grabs as much text as possible - this is called greedy matching
nameRegex.findall(name)

[('Taiwo', 'Odetola')]

In [58]:
# Adding a ? after .* (giving .*?) switches it to non-greedy matching: it
# stops at the first place where the rest of the pattern can match.

message = '<To serve humans> is divine>'

nongreedy = re.compile(r'<.*?>')  # shortest stretch between < and >
greedy = re.compile(r'<.*>')      # longest stretch between < and >

mo = nongreedy.findall(message)
mo1 = greedy.findall(message)

# Non-greedy result, a divider line, then the greedy result.
print(mo, '='*50, mo1, sep='\n')

['<To serve humans>']
['<To serve humans> is divine>']


In [59]:
# Ignoring case: with the re.I flag (short for re.IGNORECASE) the pattern
# matches letters regardless of case. Only lowercase vowels are listed in the
# class, so the uppercase matches below come from the flag itself.

vowelRegex = re.compile(r'[aeiou]', re.I)  # or re.IGNORECASE
vowelRegex.findall('Taiwo Odetola is a BILLIONAIRE')

['a', 'i', 'o', 'O', 'e', 'o', 'a', 'i', 'a', 'I', 'I', 'O', 'A', 'I', 'E']