# REGular EXpression - INTRO TO REGEX

In [1]:
# your code here!
import re

1. `re.search`: Returns a match object for the first occurrence of a pattern in a string, or `None` if there is no match.
2. `re.findall`: Finds all instances of an expression in a string and returns them as a list.
3. `re.split`: Splits a string wherever the pattern matches and returns the pieces as a list.
4. `re.sub`: Substitutes a string/substring with another.

Websites for visualizing and testing your regular expressions:
* https://regexper.com/
* https://regexr.com/

In [4]:
# Simplest possible pattern: a literal word that must appear verbatim.
# findall returns every occurrence as a list.
pattern = "neighbor"
text = "My neighbor, Mr. Rogers, has 5 dogs."

re.findall(pattern=pattern, string=text)

['neighbor']

In [5]:
# A literal pattern can match more than once; findall collects every hit.
pattern = "dog"
text = "My neighbor, Mr. Rogers, has 5 dogs. I love those dogs."

re.findall(pattern=pattern, string=text)

['dog', 'dog']

In [11]:
# When the pattern never occurs, findall returns an empty list (not None).
pattern = "cat"
text = "My neighbor, Mr. Rogers, has 5 dogs. I love those dogs."

re.findall(pattern=pattern, string=text)

[]

## Introducing Sets

In [12]:
# A set `[...]` matches exactly ONE character out of those listed,
# so this finds each individual n, e, i, g or h -- not the word "neigh".
pattern = "[neigh]"
text = "My neighbor, Mr. Rogers, has 5 dogs."

re.findall(pattern=pattern, string=text)

['n', 'e', 'i', 'g', 'h', 'g', 'e', 'h', 'g']

In [19]:
# A set followed by literal characters: accept either capitalisation of the
# first letter, then require "ogers" verbatim.
pattern = "[Rr]ogers"
text = "My neighbor, Mr. Rogers, has 5 rogers."

re.findall(pattern=pattern, string=text)

['Rogers', 'rogers']

In [22]:
# search stops at the first match and returns a match object
# (with its span and matched text) instead of a list.
# Uses the `pattern` and `text` variables from the previous cell.
re.search(pattern=pattern, string=text)

<re.Match object; span=(17, 23), match='Rogers'>

In [29]:
# `|` means "or": the pattern matches either spelling of "rogers" or the
# word "My". sub replaces every match with the replacement string.
pattern = "[Rr]ogers|My"
text = "My neighbor, Mr. Rogers, has 5 rogers."

re.sub(pattern=pattern, repl="Joseph", string=text)

'Joseph neighbor, Mr. Joseph, has 5 Joseph.'

In [30]:
# One set per position that is allowed to vary: case of the initials,
# accented vowels, o/n, space or hyphen as separator, and b/u/o in the
# second word. "Saon Paulo" is not matched because only ONE of o/n is allowed.
pattern = '[Ss][aáã][on][ -][Pp][aáã][buo]lo'
text = 'Sáo Paulo São Paulo Sao Paulo Sao Paolo San Pablo sao paulo sao Paulo são Paulo sao-paulo são paulo São Paulo Saon Paulo'

re.findall(pattern=pattern, string=text)

['Sáo Paulo',
 'São Paulo',
 'Sao Paulo',
 'Sao Paolo',
 'San Pablo',
 'sao paulo',
 'sao Paulo',
 'são Paulo',
 'sao-paulo',
 'são paulo',
 'São Paulo']

# Pattern sets:

Range

1. [a-z]: Any lowercase letter between a and z.
2. [A-Z]: Any uppercase letter between A and Z.
3. [0-9]: Any numeric character between 0 and 9.

In [51]:
# Two ranges side by side: one uppercase letter immediately followed by one
# lowercase letter.
pattern = "[A-Z][a-z]"
text = "My neighbor, Mr. Rogers, has 5 rogers."

re.findall(pattern=pattern, string=text)

['My', 'Mr', 'Ro']

In [56]:
text = 'My neighbor, Mr. Rogers, has 5.7 rogers.'
# digit, literal decimal point, digit.
# The dot is escaped (`\.`) because a bare `.` is a metacharacter that matches
# ANY character, so '[0-9].[0-9]' would also accept e.g. '5x7' or '5 7'.
# A raw string (r'...') keeps the backslash intact for the regex engine.
pattern = r'[0-9]\.[0-9]'

re.findall(pattern, text)

['5.7']

In [62]:
# Several ranges can live inside the same set: this one accepts any single
# ASCII letter (either case) or digit.
pattern = "[A-Za-z0-9]"
text = "My neighbor, Mr. Rogers, has 5 rogers."

print(re.findall(pattern=pattern, string=text))

['M', 'y', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', 'M', 'r', 'R', 'o', 'g', 'e', 'r', 's', 'h', 'a', 's', '5', 'r', 'o', 'g', 'e', 'r', 's']


The opposite (negated set):
- `^` as the first character inside a set (e.g. `[^A-Za-z0-9]`) matches any single character that is NOT in the set.

In [65]:
# A leading `^` inside the set negates it: match every single character that
# is NOT an ASCII letter or digit (spaces, punctuation, and here also 'π').
pattern = '[^A-Za-z0-9]'
text = 'My neighbor, Mr. Rogers, has 5 rogers. π'

re.findall(pattern=pattern, string=text)

[' ', ',', ' ', '.', ' ', ',', ' ', ' ', ' ', '.', ' ', 'π']

# Meta Characters:

Characters that have a special meaning in a pattern instead of matching themselves literally.

1. `\w`: Any word character: letters, digits, and the underscore (`_`).
2. `\d`: Any numeric character (digit).
3. `.` : Any character except newline (`\n`).

In [84]:
# Sample text ending in a literal backslash followed by 'd'.
# Written as a raw string: in a normal string literal `\d` is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning on newer Pythons). The resulting
# string value is unchanged.
text = r'My neighbor, Mr. Rogers, ] has 5 - dogs 10. α π \d'

In [77]:
# `\w` matches any single word character (letters, digits, underscore --
# including non-ASCII letters such as α and π for str patterns).
# Always write regex patterns as raw strings (r'...'): in a normal string
# literal `\w` is an invalid escape sequence and triggers a warning.
# Uses the `text` variable defined in the previous cell.
pattern = r'\w'
print(re.findall(pattern, text))

['M', 'y', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', 'M', 'r', 'R', 'o', 'g', 'e', 'r', 's', 'h', 'a', 's', '5', 'd', 'o', 'g', 's', '1', '0', 'α', 'π', 'd']


In [88]:
# `\d` matches any single digit.
# Raw string (r'...') so the backslash reaches the regex engine untouched;
# a plain '\d' is an invalid escape sequence in a normal string literal.
# To match a literal backslash followed by 'd' instead, the pattern would
# be r'\\d'.
# Uses the `text` variable defined in an earlier cell.
pattern = r'\d'
print(re.findall(pattern, text))

['5', '1', '0']


In [80]:
# A bare `.` matches any single character except a newline, so every
# character of `text` (defined in an earlier cell) comes back as its own match.
pattern = "."
print(re.findall(pattern=pattern, string=text))

['M', 'y', ' ', 'n', 'e', 'i', 'g', 'h', 'b', 'o', 'r', ',', ' ', 'M', 'r', '.', ' ', 'R', 'o', 'g', 'e', 'r', 's', ',', ' ', ']', ' ', 'h', 'a', 's', ' ', '5', ' ', '-', ' ', 'd', 'o', 'g', 's', ' ', '1', '0', '.', ' ', 'α', ' ', 'π', ' ', '\\', 'd']


## Quantifiers

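1. `*`: 0 or more repetitions of the preceding element
2. `?`: 0 or 1 repetition of the preceding element
3. `+`: 1 or more repetitions of the preceding element
4. `{n}`: exactly n repetitions of the preceding element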

In [89]:
# Sample sentence for the quantifier examples: contains a one-digit and a
# three-digit number plus a hyphen.
text = "My neighbor, Mr. Rogers, has 5 - dogs and 100 cats and β sheeps."

In [92]:
# `\d+` = one or more consecutive digits, so whole numbers come back as single
# matches instead of one match per digit.
# Raw string (r'...') because '\d' is an invalid escape sequence in a normal
# string literal.
# Uses the `text` variable defined in the previous cell.
pattern = r'\d+'

re.findall(pattern, text)

['5', '100']

In [95]:
# A hyphen followed by `.*` (zero or more of any character): matches from the
# hyphen to the end of the line. Uses `text` from an earlier cell.
re.findall(pattern='-.*', string=text)

['- dogs and 100 cats and β sheeps.']

In [96]:
# `.+` needs AT LEAST one character after the hyphen; here the hyphen is the
# last character of the text, so nothing matches.
text = "My neighbor, Mr. Rogers, has 5 -"
re.findall(pattern='-.+', string=text)

[]

In [126]:
text = '''My neighbor, Mr. Rogers, has 5 - dogs and 100 cats and β sheeps. doooooooooooogs dooogs'''

# "do", then any one character, then word characters:
#   \w+   -> one or more word characters (greedy: runs to the end of the word)
#   \w{4} -> exactly four word characters
# Raw strings (r'...') are used because '\w' is an invalid escape sequence in
# a normal string literal.
print(re.findall(r'do.\w+', text))
print(re.findall(r'do.\w{4}', text))

['dogs', 'doooooooooooogs', 'dooogs']
['doooooo']


In [128]:
# Same city-name exercise, now with quantifiers: the "n" after "o" and the
# space between the two words are each optional (`?`), which admits
# "Saon Paulo" and "SãoPaulo" but no longer "San Pablo" or "sao-paulo".
pattern = '[Ss][aãáàâä][o]n? ?[Pp][aãáàâä][uob]lo'
text = 'Sáo Paulo São Paulo Sao Paulo Sao Paolo San Pablo sao paulo sao Paulo são Paulo sao-paulo são paulo São SãoPaulo Saon Paulo'

re.findall(pattern=pattern, string=text)

['Sáo Paulo',
 'São Paulo',
 'Sao Paulo',
 'Sao Paolo',
 'sao paulo',
 'sao Paulo',
 'são Paulo',
 'são paulo',
 'SãoPaulo',
 'Saon Paulo']

In [129]:
# `u?` makes the "u" optional, so both the British and the American spelling
# match; "colonel" does not, because "col" is not followed by "o(u)r".
pattern = "colou?r"
text = "This colonel has the colour or color blue"

re.findall(pattern=pattern, string=text)

['colour', 'color']

In [130]:
# Optional trailing "s": matches the singular and the plural form.
pattern = "apples?"
text = "These apples are beautiful and the apple is blue."

re.findall(pattern=pattern, string=text)

['apples', 'apple']

# Other methods for regular expressions

In [143]:
# Sample sentence shared by the re.split examples in the following cells.
text = "My neighbor, Mr. Rogers, ] has 5 - rogers 1000,"

In [132]:
# split cuts `text` (defined in the previous cell) at every match of the
# pattern; the matched text itself is dropped from the result.
re.split(pattern='[Rr]ogers', string=text)

['My neighbor, Mr. ', ', ] has 5 - ', ' 1000,']

In [133]:
# Two matches in `text` (defined in an earlier cell) produce three pieces.
len(re.split(pattern='[Rr]ogers', string=text))

3

In [134]:
# First piece: everything before the first match in `text`.
re.split(pattern='[Rr]ogers', string=text)[0]

'My neighbor, Mr. '

In [135]:
# Second piece: the part of `text` between the two matches.
re.split(pattern='[Rr]ogers', string=text)[1]

', ] has 5 - '

In [149]:
# Third piece: everything in `text` after the last match.
re.split(pattern='[Rr]ogers', string=text)[2]

' 1000,'