# Regular Expressions

In [1]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [2]:
'phone' in text

True

In [3]:
import re

In [4]:
pattern = 'phone'

In [5]:
re.search(pattern, text)

<re.Match object; span=(12, 17), match='phone'>

In [6]:
pattern = 'NOT IN TEXT'

In [7]:
# No match: re.search returns None, which the notebook does not display.
re.search(pattern,text)

In [8]:
pattern = 'phone'

In [9]:
match = re.search(pattern,text)

In [10]:
match

<re.Match object; span=(12, 17), match='phone'>

In [11]:
# span() gives the (start, end) indices of the match; the end index is exclusive.
match.span()

(12, 17)

In [12]:
match.start()

12

In [13]:
match.end()

17

In [16]:
text = 'my phone once, my phone twice'

In [17]:
# re.search reports only the first occurrence, even though 'phone' appears twice in text.
match = re.search('phone',text)

In [18]:
matches = re.findall('phone',text)

In [19]:
matches

['phone', 'phone']

In [20]:
len(matches)

2

In [21]:
# Walk every occurrence of 'phone' and print the full Match object for each.
for match in re.compile('phone').finditer(text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(18, 23), match='phone'>


In [22]:
# Print the (start, end) index pair of each occurrence.
for match in re.finditer('phone', text):
    print((match.start(), match.end()))

(3, 8)
(18, 23)


In [23]:
# Print just the matched text of each occurrence (group 0 is the whole match).
for match in re.finditer('phone', text):
    print(match.group(0))

phone
phone


<b>Syntax for Regex</b>

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric (letters, digits, or underscore)</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [32]:
text = 'My phone number is 408-555-1234'

In [33]:
phone = re.search('408-555-1234',text)

In [34]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [35]:
# Raw string (r'...') keeps the backslashes literal so the regex engine sees \d; each \d matches one digit.
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)

In [36]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [37]:
phone.group()

'408-555-1234'

<b>Quantifiers</b>

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Example Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [39]:
# Same phone-number pattern as the \d\d\d-\d\d\d-\d\d\d\d form, written with {n} repetition counts.
phone = re.search(r'\d{3}-\d{3}-\d{4}',text)

In [40]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [41]:
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})') # compile once into a reusable Pattern object; each (...) is a capture group

In [43]:
# Call search on the compiled Pattern directly; equivalent to re.search(phone_pattern, text).
results = phone_pattern.search(text)

In [44]:
results.group()

'408-555-1234'

In [46]:
results.group(1) # capture groups are numbered from 1; group() / group(0) is the whole match

'408'

In [47]:
results.group(2)

'555'

In [48]:
results.group(3)

'1234'

<b>Additional Regex Syntax</b>

In [49]:
re.search(r'cat','The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [51]:
re.search(r'cat|dog','The cat is here') # | stands for or

<re.Match object; span=(4, 7), match='cat'>

In [52]:
re.findall(r'.at','THe cat in the hat sat there.') # . is a wildcard: any single character except a newline

['cat', 'hat', 'sat']

In [53]:
# Each . must consume exactly one character (spaces included), hence results like 'e cat'.
re.findall(r'...at','THe cat in the hat went splat.')

['e cat', 'e hat', 'splat']

In [54]:
re.findall(r'^\d','1 is a number') # ^ anchors at the start of the string: matches only if the first character is a digit

['1']

In [56]:
re.findall(r'\d$','The number is 2') # $ anchors at the end of the string (or just before a trailing newline): matches only if it ends with a digit

['2']

In [57]:
phrase = 'there are 3 numbers 34 inside 5 this sentence'

In [58]:
pattern = r'[^\d]' # a leading ^ inside [...] negates the set: any one character that is not a digit

In [59]:
re.findall(pattern,phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [60]:
# Adding + groups consecutive non-digit characters into one match instead of one match per character.
pattern = r'[^\d]+'

In [61]:
re.findall(pattern,phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [62]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [63]:
# Split on sentence punctuation: match runs of characters that are not !, . or ?.
# Raw string with no backslash: '\!' in a normal string literal is an invalid escape
# sequence (a warning today, slated to become an error), and ! needs no escaping
# inside a character class anyway.
re.findall(r'[^!.?]+',test_phrase)

['This is a string', ' But it has punctuation', ' How can we remove it']

In [64]:
# Same idea, but spaces are excluded too, so each match is a single word.
# Raw string with no backslash: '\!' in a normal string literal is an invalid escape
# sequence, and ! needs no escaping inside a character class.
clean = re.findall(r'[^!.? ]+',test_phrase)

In [65]:
' '.join(clean)

'This is a string But it has punctuation How can we remove it'

In [66]:
text = 'Only find the hyphen-words in this sentence. But you do not know how long-ish they are'

In [67]:
# One or more word characters (letters, digits, underscore). The hyphen is not a
# word character, so hyphenated words are split into their parts.
# \w is already a character class, so wrapping it in [...] is unnecessary.
pattern = r'\w+'

In [68]:
re.findall(pattern,text)

['Only',
 'find',
 'the',
 'hyphen',
 'words',
 'in',
 'this',
 'sentence',
 'But',
 'you',
 'do',
 'not',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

In [69]:
# Word characters, a literal hyphen, then more word characters: matches only
# hyphenated words, whatever the length of each side.
# \w is already a character class, so wrapping it in [...] is unnecessary.
pattern = r'\w+-\w+'

In [70]:
re.findall(pattern,text)

['hyphen-words', 'long-ish']

In [71]:
# Three sample sentences that differ only in the word starting with "cat".
text = "Hello, would you like some catfish?"
texttwo = 'Hello, would you like to take a catnap?'
textthree = 'Hello, have you seen this caterpillar?'

In [72]:
# The parentheses limit the | alternation to the suffix: 'cat' followed by fish, nap, or claw.
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [73]:
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [74]:
# 'caterpillar' matches none of the listed suffixes, so the result is None and nothing is displayed.
re.search(r'cat(fish|nap|claw)',textthree)

In [75]:
re.search(r'cat(fish|nap|claw|erpillar)',textthree)

<re.Match object; span=(26, 37), match='caterpillar'>