# MetaCharacters:

.  -any character except newline <hr/> 
\d -digit (0-9) <hr/>
\w -word character (a-z, A-Z, 0-9, _) <hr/>
\s -whitespace (space, tab, newline) <hr/>
\b -word boundary <hr/>
^ -beginning of a string <hr/>
$ -end of a string <hr/>

[] -matches any one character inside the brackets, e.g. [a-z] <hr/>
[^] -matches any one character not inside the brackets <hr/>
| -either/or (alternation) <hr/>
() -group

# Quantifiers

\* -0 or more <hr/>
\+ -1 or more <hr/>
? -0 or 1 (optional) <hr/>
{3} -exactly 3 repetitions <hr/>
{3,4} -between 3 and 4 repetitions (min, max)

In [2]:
import re

In [3]:
# Sample text that the compiled pattern below is searched against.
search_me = 'This is what abc searches for.'

### Raw Strings

In [4]:
# The r prefix makes this a raw string, so backslashes are passed to the
# regex engine untouched instead of being processed as Python escapes.
pattern = re.compile(r'abc')

# finditer yields a match object for each non-overlapping occurrence.
for hit in pattern.finditer(search_me):
    print(hit)

<re.Match object; span=(13, 16), match='abc'>


In [5]:
# A match's span is a (start, end) pair of indices into the searched string,
# so slicing the string with those indices gives back the matched text.
span_start, span_end = 13, 16
search_me[span_start:span_end]

'abc'

### Escaping Metacharacters

In [6]:
# A bare dot is a metacharacter (any character except newline);
# prefixing it with a backslash makes it match only a literal period.
pattern1 = re.compile(r'.')
pattern2 = re.compile(r'\.')  # escaped

# Number of matches each pattern finds in the same text.
len(pattern1.findall(search_me)), len(pattern2.findall(search_me))

(30, 1)

### Using Word Boundaries

In [7]:
search_me = 'Ha HaHa'

# \b requires a word boundary before 'Ha', so only a 'Ha' that
# starts a word is matched.
pattern = re.compile(r'\bHa')

for hit in pattern.finditer(search_me):
    print(hit)

<re.Match object; span=(0, 2), match='Ha'>
<re.Match object; span=(3, 5), match='Ha'>


In [8]:
# \B is the opposite of \b: it matches only where there is NO word
# boundary, i.e. a 'Ha' that begins in the middle of a word.
pattern = re.compile(r'\BHa')

for hit in pattern.finditer(search_me):
    print(hit)

<re.Match object; span=(5, 7), match='Ha'>


### Beginning and End of String

In [9]:
# Beginning-of-string anchor.
search_me = 'Start me here. Start me there. The end. The end.'

# ^ pins the match to the start of the string, so the second
# 'Start' (in the middle of the text) is not matched.
pattern = re.compile(r'^Start')

for hit in pattern.finditer(search_me):
    print(hit)

<re.Match object; span=(0, 5), match='Start'>


In [10]:
# End-of-string anchor.
search_me = 'Start me here. Start me there. The end. The end.'

# The period is escaped so it matches a literal '.' rather than any
# character; $ then limits the match to the 'end.' that closes the
# string, so the first 'end.' is not matched.
pattern = re.compile(r'end\.$')

matches = pattern.finditer(search_me)
for match in matches:
    print(match)

<re.Match object; span=(44, 48), match='end.'>


### Using Quantifiers

In [11]:
search_me = "123-456-7890 456-.789-0123 890.123.4567 999*999*9999"

# {n} repeats the preceding token exactly n times.
# Inside a character set [] the dot needs no escaping, and the set
# consumes exactly one character, so '-.' between digit groups
# does not match.
pattern = re.compile(r'\d{3}[-.]\d{3}[-.]\d{4}')

for hit in pattern.finditer(search_me):
    print(hit)

<re.Match object; span=(0, 12), match='123-456-7890'>
<re.Match object; span=(27, 39), match='890.123.4567'>


In [12]:
search_me = "cat bat hat spat lat rat"

# [^b] is any single character other than 'b', so 'bat' is skipped.
# .? optionally consumes one extra character in front of it; that
# character may be a space, which is why some matches start with one.
pattern = re.compile(r'.?[^b]at\b')

for hit in pattern.finditer(search_me):
    print(hit)

<re.Match object; span=(0, 3), match='cat'>
<re.Match object; span=(7, 11), match=' hat'>
<re.Match object; span=(12, 16), match='spat'>
<re.Match object; span=(16, 20), match=' lat'>
<re.Match object; span=(20, 24), match=' rat'>


### Using Character Sets

In [14]:
search_me = "Mr. T Mrs. Robinson Mr Smith Mr. Freeman, Mrs F, Ms Gilbe"

# (r|s|rs) is a group of alternatives following 'M' (Mr, Ms, Mrs).
# \.? is an optional literal period after the title, then one space
# and \w* takes the word characters of the name.
pattern = re.compile(r'M(r|s|rs)\.? \w*')

for hit in pattern.finditer(search_me):
    print(hit)

<re.Match object; span=(0, 5), match='Mr. T'>
<re.Match object; span=(6, 19), match='Mrs. Robinson'>
<re.Match object; span=(20, 28), match='Mr Smith'>
<re.Match object; span=(29, 40), match='Mr. Freeman'>
<re.Match object; span=(42, 47), match='Mrs F'>
<re.Match object; span=(49, 57), match='Ms Gilbe'>


### Using Groups

In [27]:
search_me = '''
            https://www.google.com 
            http://coreys.com 
            https://youtube.com 
            https://www.nasa.gov
            '''
# Three capture groups: (1) an optional 'www.', (2) the domain name,
# (3) the dotted suffix such as '.com'.
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# The pattern defines exactly three groups, so groups() returns the
# same tuple as group(1, 2, 3); an unmatched optional group is None.
for url in pattern.finditer(search_me):
    print(url.groups())

# Substitution: the back-references \2 and \3 rebuild every URL from
# its domain name and suffix only, dropping the scheme and 'www.'.
subbed = pattern.sub(r'\2\3', search_me)
print(subbed)

('www.', 'google', '.com')
(None, 'coreys', '.com')
(None, 'youtube', '.com')
('www.', 'nasa', '.gov')

            google.com 
            coreys.com 
            youtube.com 
            nasa.gov
            
