# Regular Expressions

## Part One

In [1]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [2]:
'phone' in text

True

In [3]:
import re # Import the regular expressions module

In [4]:
pattern = 'phone'

In [5]:
re.search(pattern, text)

<re.Match object; span=(12, 17), match='phone'>

A match was found, starting at index 12 and ending just before index 17 (the end index of the span is exclusive).

In [6]:
pattern = 'NOT IN TEXT'

In [7]:
re.search(pattern, text)

Nothing found: `re.search` returns `None` when there is no match, so the cell displays no output.

In [8]:
pattern = 'phone'

In [10]:
match = re.search(pattern, text)

In [11]:
match

<re.Match object; span=(12, 17), match='phone'>

In [13]:
match.span() # Index location of the span

(12, 17)

In [14]:
match.start()

12

In [15]:
match.end()

17

If there are multiple matches in the string, `re.search` only returns the first match.

In [16]:
text = 'my phone once, my phone twice'

In [17]:
match = re.search(pattern, text)

In [18]:
match

<re.Match object; span=(3, 8), match='phone'>

To get all matches, use `findall`:

In [19]:
matches = re.findall(pattern, text)

In [20]:
matches

['phone', 'phone']

In [21]:
len(matches)

2

In [22]:
# finditer yields one Match object per occurrence, scanning left to right.
# Note: the loop variable rebinds the name `match` assigned in an earlier cell.
for match in re.finditer(pattern, text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(18, 23), match='phone'>


In [23]:
for match in re.finditer(pattern, text):
    print(match.span())

(3, 8)
(18, 23)


In [24]:
for match in re.finditer(pattern, text):
    print(match.group())

phone
phone


## Part Two

In [40]:
text = 'My phone number is 408-555-1234'

In [41]:
phone = re.search('408-555-1234', text)

In [42]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [43]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d', text)

In [44]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [48]:
phone = re.search(r'\d{3}-\d{3}-\d{4}', text) # Quantifiers

In [49]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [50]:
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})') # Groups

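Wrapping parts of the pattern in parentheses creates groups that can be retrieved individually from the match; `re.compile` simply precompiles the pattern so it can be reused.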

In [51]:
# Run the search through the precompiled pattern object itself
results = phone_pattern.search(text)

In [61]:
results.group() # With no argument, group() returns the entire match (equivalent to group(0))

'408-555-1234'

In [55]:
results.group(1)

'408'

In [56]:
results.group(2)

'555'

In [57]:
results.group(3)

'1234'

In [58]:
# Ask for a group that does not exist. The pattern defines only three groups,
# so group(4) raises IndexError. Catch and display it so the demonstration
# does not abort a Restart & Run All of the notebook.
try:
    results.group(4)
except IndexError as exc:
    print(f'{type(exc).__name__}: {exc}')

IndexError: no such group

## Part Three

### Or Operator

In [63]:
re.search(r'cat', 'The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [64]:
re.search(r'cat', 'The dog is here')

In [65]:
re.search(r'cat|dog', 'The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [66]:
re.search(r'cat|dog', 'The dog is here')

<re.Match object; span=(4, 7), match='dog'>

### Wildcard Operator

In [67]:
re.findall('at', 'The cat in the hat sat there.')

['at', 'at', 'at']

In [68]:
re.findall('.at', 'The cat in the hat sat there.')

['cat', 'hat', 'sat']

In [71]:
re.findall('.at', 'The cat in the hat went splat.')

['cat', 'hat', 'lat']

In [70]:
re.findall('...at', 'The cat in the hat went splat.')

['e cat', 'e hat', 'splat']

### Starts With

In [72]:
re.findall(r'^\d', '1 is a number')

['1']

In [73]:
re.findall(r'^\d', 'The 2 is a number')

[]

### Ends With

In [74]:
re.findall(r'\d$', 'The number is 2')

['2']

In [75]:
re.findall(r'\d$', 'The 2 is a number')

[]

### Exclusion

In [76]:
phrase = 'there are 3 numbers 34 inside 5 this sentance'

In [77]:
pattern = r'[^\d]' # Exclude digits

In [78]:
re.findall(pattern, phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'a',
 'n',
 'c',
 'e']

In [79]:
pattern = r'[^\d]+'

In [80]:
re.findall(pattern, phrase)

['there are ', ' numbers ', ' inside ', ' this sentance']

In [81]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [86]:
re.findall(r'[^!.?]+', test_phrase) # Match runs of characters that are NOT in the [^...] set, i.e. everything between !, . and ?

['This is a string', ' But it has punctuation', ' How can we remove it']

In [87]:
re.findall(r'[^!.? ]+', test_phrase) # Add a space to the excluded set so each word is matched separately

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [88]:
clean = re.findall(r'[^!.? ]+', test_phrase)

In [89]:
' '.join(clean)

'This is a string But it has punctuation How can we remove it'

### Inclusion

In [91]:
text = 'Only find the hyphen-words in this sentance. But you do not know how long-ish they are.'

In [92]:
pattern = r'[\w]+' # One or more word characters (\w matches letters, digits and underscore)

In [93]:
re.findall(pattern, text)

['Only',
 'find',
 'the',
 'hyphen',
 'words',
 'in',
 'this',
 'sentance',
 'But',
 'you',
 'do',
 'not',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

In [94]:
pattern = r'[\w]+-[\w]+'

In [95]:
re.findall(pattern, text)

['hyphen-words', 'long-ish']

In [96]:
pattern = r'\w+-\w+'

In [97]:
re.findall(pattern, text)

['hyphen-words', 'long-ish']

This can be done without the square brackets (`\w+` is equivalent to `[\w]+`), but it can be harder to understand what's going on.

In [98]:
# Sample sentences for the grouping examples; each contains a different "cat" word.
text, texttwo, textthree = (
    'Hello, would you like some catfish?',
    'Hello, would you like to take a catnap?',
    'Hello, have you seen this caterpillar?',
)

### Grouping

In [99]:
re.search(r'cat(fish|nap|claw)', text)

<re.Match object; span=(27, 34), match='catfish'>

In [100]:
re.search(r'cat(fish|nap|claw)', texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [101]:
re.search(r'cat(fish|nap|claw)', textthree)

In [102]:
re.search(r'cat(fish|nap|erpillar)', textthree)

<re.Match object; span=(26, 37), match='caterpillar'>