In [1]:
#Regular expressions are text matching patterns described 
#with a formal syntax. You'll often hear regular expressions 
#referred to as 'regex' or 'regexp' in conversation. 
#Regular expressions can include a variety of rules, from 
#finding repetition, to text-matching, and much more. As you 
#advance in Python you'll see that a lot of your parsing 
#problems can be solved with regular expressions (they're also 
#a common interview question!).

#If you're familiar with Perl, you'll notice that the syntax 
#for regular expressions is very similar in Python. We will 
#be using the "re" module with Python for this lecture.



In [2]:
import re

In [3]:
#Literal terms to look for in the search examples that follow.
patterns = ['term1', 'term2']

In [4]:
#Sample sentence to search; it contains term1 but not term2.
text = 'This is a string with term1, but not the other one'

In [5]:
#re.search looks for the pattern (first argument) anywhere in the
#string (second argument).
re.search('hello','hello world')
#The result is a match object (shown below), not a plain True/False.

<_sre.SRE_Match at 0x1063132a0>

In [6]:
#^Notice that we get a match object back, meaning the pattern
#'hello' was found somewhere in the string 'hello world'

In [7]:
#Same kind of search, using the first term in patterns against text.
re.search(patterns[0],text)

<_sre.SRE_Match at 0x1063133d8>

In [9]:
re.search(patterns[1],text)
#No output is shown: 'term2' does not occur in text, so re.search
#returns None, and the notebook displays nothing for None.

In [11]:
#Report, for each term in patterns, whether it occurs anywhere in text.
#Every print call takes one parenthesised argument, so the output is the
#same under Python 2's print statement and Python 3's print function.
for pattern in patterns:
    print('Searching for "%s" in: \n"%s"' % (pattern, text))

    #This print emits an extra blank line ahead of the verdict
    print('\n')

    #re.search gives a match object (truthy) on success, None otherwise
    if re.search(pattern, text):
        print('Match was found. \n')
    else:
        print('No match was found. \n')

    #The \n inside the strings just starts a new line

Searching for "term1" in: 
"This is a string with term1, but not the other one"


Match was found. 

Searching for "term2" in: 
"This is a string with term1, but not the other one"


No match was found. 



In [12]:
#Keep the match object so we can inspect it in the following cells.
match1 = re.search(patterns[0],text)

In [14]:
type(match1)
#Not just a boolean: a successful search hands back a match object.

_sre.SRE_Match

In [17]:
#start() gives the index in text where the match begins.
match1.start()

22

In [18]:
#^So the match begins at index 22: the "t" that starts "term1" sits
#at index 22 of the sentence string (indexes count from 0)

In [21]:
#end() gives the index just past the last matched character, so the
#five characters of "term1" span indexes 22 up to (not including) 27.
match1.end()

27

In [22]:
#Split text wherever the pattern occurs; the matched term itself is
#dropped and the pieces on either side are returned in a list.
re.split(patterns[0],text)

['This is a string with ', ', but not the other one']

In [23]:
#Common interview question: how would you split
#emails at their @ symbol? This is the answer: use re.split
#with '@' as the pattern and the email address as the string,
#e.g. re.split('@', 'user@example.com') gives ['user', 'example.com'].

In [29]:
#findall returns every non-overlapping occurrence of the pattern as a
#list of strings (search only reports the first one).
re.findall('match or what', 'Heres one match or what, heres no match, heres another match or what')

['match or what', 'match or what']

In [30]:
#We can use metacharacters along with re to find specific
#types of patterns.

#Since we will be testing multiple re syntax forms, 
#let's create a function that will print out results given a 
#list of various regular expressions and a phrase to parse:

In [31]:
def multi_re_find(patterns,phrase):
    '''
    Print every match of each regex pattern found in a phrase.

    For each pattern, in order, prints a header line showing the
    pattern's repr, then the list returned by re.findall(pattern, phrase),
    then a blank separator.

    patterns -- iterable of regex pattern strings
    phrase   -- the string to search

    Returns None; the results are only printed.
    '''
    for pattern in patterns:
        #Each print takes one parenthesised argument, so the output is the
        #same under Python 2's print statement and Python 3's print function.
        #Wrapping pattern in a 1-tuple keeps %r formatting well-defined.
        print('Searching the phrase using the re check: %r' % (pattern,))
        print(re.findall(pattern, phrase))
        print('\n')


In [32]:
#Repetition Syntax

#There are five ways to express repetition in a pattern:

#1.) A pattern followed by the metacharacter * is repeated 
    #zero or more times. 
#2.) Replace the * with + and the pattern must appear at least once. 
#3.) Using ? means the pattern appears zero or one time. 
#4.) For a specific number of occurrences, use {m} after 
    #the pattern, where m is replaced with the number of times         
    #the pattern should repeat. 
#5.) Use {m,n} where m is the minimum number of repetitions and n 
    #is the maximum. Leaving out n ({m,}) means the 
    #value appears at least m times, with no maximum.



In [34]:
#Try each repetition quantifier against one sample phrase, using the
#multi_re_find helper defined earlier.
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

test_patterns = [
    'sd*',      # an s, then any number of d's (including none)
    'sd+',      # an s, then at least one d
    'sd?',      # an s, then at most one d
    'sd{3}',    # an s, then exactly three d's
    'sd{2,3}',  # an s, then two or three d's
]

#Argument order: the list of patterns comes first, then the phrase.
multi_re_find(test_patterns, test_phrase)

Searching the phrase using the re check: 'sd*'
['sd', 'sd', 's', 's', 'sddd', 'sddd', 'sddd', 'sd', 's', 's', 's', 's', 's', 's', 'sdddd']


Searching the phrase using the re check: 'sd+'
['sd', 'sd', 'sddd', 'sddd', 'sddd', 'sd', 'sdddd']


Searching the phrase using the re check: 'sd?'
['sd', 'sd', 's', 's', 'sd', 'sd', 'sd', 'sd', 's', 's', 's', 's', 's', 's', 'sd']


Searching the phrase using the re check: 'sd{3}'
['sddd', 'sddd', 'sddd', 'sddd']


Searching the phrase using the re check: 'sd{2,3}'
['sddd', 'sddd', 'sddd', 'sddd']




In [35]:
#Character Sets

#Character sets are used when you wish to match any one 
#of a group of characters at a point in the input. Brackets 
#are used to construct character set inputs. For example: the 
#input [ab] searches for occurrences of either a or b. Let's see 
#some examples:

In [36]:


#Character-set examples, run against the same sample phrase as before.
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

test_patterns = [
    '[sd]',    # a single character that is either s or d
    's[sd]+',  # an s, then one or more characters that are s or d
]

multi_re_find(test_patterns, test_phrase)



Searching the phrase using the re check: '[sd]'
['s', 'd', 's', 'd', 's', 's', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 'd', 's', 'd', 's', 'd', 's', 's', 's', 's', 's', 's', 'd', 'd', 'd', 'd']


Searching the phrase using the re check: 's[sd]+'
['sdsd', 'sssddd', 'sdddsddd', 'sds', 'sssss', 'sdddd']




In [37]:
#Exclusion

#We can use ^ to exclude terms by incorporating it into the 
#bracket syntax notation. For example: [^...] will match any 
#single character not in the brackets. Characters that are listed in
#the brackets (for example, a period) are never matched, so they are
#left out of the results. Let's see some examples:

In [39]:
#Sample text containing punctuation; the two adjacent literals are
#joined into one string.
test_phrase = ('This is a string! But it has punctutation. How can we '
               'remove it?')

In [44]:
re.findall('[^!.? ]+',test_phrase)
#Notice that a space is listed inside the brackets too, so spaces are
#excluded along with the punctuation. [^!.? ] matches any single
#character that is NOT one of those; the + groups one or more such
#characters in a row into a single match, which is why we get whole
#words back. Without the +, every letter would come back as its own
#separate match.

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctutation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [45]:
#Character Ranges

In [46]:
#As character sets grow larger, typing every character 
#that should (or should not) match could become very tedious. 
#A more compact format using character ranges lets you define 
#a character set to include all of the contiguous characters 
#between a start and stop point. The format used is [start-end].

#Common use cases are to search for a specific range of 
#letters in the alphabet: for example, [a-f] would return matches 
#with any instance of letters between a and f.

#Let's walk through some examples:

In [47]:
#Character-range examples; the two adjacent literals are joined into
#one sentence string.
test_phrase = ('This is an example sentence. Lets see if we can '
               'find some letters.')

test_patterns = [
    '[a-z]+',       # runs of lower case letters
    '[A-Z]+',       # runs of upper case letters
    '[a-zA-Z]+',    # runs of letters of either case
    '[A-Z][a-z]+',  # one upper case letter, then lower case letters
]

multi_re_find(test_patterns, test_phrase)

Searching the phrase using the re check: '[a-z]+'
['his', 'is', 'an', 'example', 'sentence', 'ets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using the re check: '[A-Z]+'
['T', 'L']


Searching the phrase using the re check: '[a-zA-Z]+'
['This', 'is', 'an', 'example', 'sentence', 'Lets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using the re check: '[A-Z][a-z]+'
['This', 'Lets']




In [48]:
#Escape Codes

In [49]:
#You can use special escape codes to find specific types
#of patterns in your data, such as digits, non-digits, whitespace, 
#and more. For example:

#Code...meaning:
#\d...a digit
#\D...a non-digit
#\s...whitespace(tab, space, newline, etc.)
#\S...non-whitespace
#\w...alphanumeric (letters, digits, and the underscore)
#\W...non-alphanumeric

In [50]:
#Ordinary string literals already interpret escape codes: the \n
#below becomes a real newline when the string is printed.
#(Parenthesised so this runs on both Python 2 and Python 3.)
print('hello \n new line')

hello 
 new line


In [51]:
#Because normal string literals process backslash escapes themselves,
#we need to write regex patterns as raw strings (r'...') so that the
#backslashes reach the re module untouched.

test_phrase = 'This is a string with some numbers 1233 and a symbol #hashtag'

test_patterns = [
    r'\d+',  # one or more digits
    r'\D+',  # one or more non-digits
    r'\s+',  # one or more whitespace characters
    r'\S+',  # one or more non-whitespace characters
    r'\w+',  # one or more alphanumeric characters
    r'\W+',  # one or more non-alphanumeric characters
]

multi_re_find(test_patterns, test_phrase)

Searching the phrase using the re check: '\\d+'
['1233']


Searching the phrase using the re check: '\\D+'
['This is a string with some numbers ', ' and a symbol #hashtag']


Searching the phrase using the re check: '\\s+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


Searching the phrase using the re check: '\\S+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', '#hashtag']


Searching the phrase using the re check: '\\w+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', 'hashtag']


Searching the phrase using the re check: '\\W+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' #']




In [52]:
#Additional sources:

#https://docs.python.org/2/library/re.html#regular-expression-syntax

#http://www.tutorialspoint.com/python/python_reg_expressions.htm