In [9]:
# Quick-reference table of regex metacharacters and character-class shorthands,
# displayed as this cell's output.
# It is a raw string because the backslashes below (\d, \s, \w, ...) are meant
# literally; in a normal string literal they are unrecognized escape sequences.
r'''
\   Used to drop the special meaning of character
    following it (discussed below)
[]  Represent a character class
^   Matches the beginning
$   Matches the end
.   Matches any character except newline
?   Matches zero or one occurrence.
|   Means OR (Matches with any of the characters
    separated by it.
*   Any number of occurrences (including 0 occurrences)
+   One ore more occurrences
{}  Indicate number of occurrences of a preceding RE 
    to match.
()  Enclose a group of REs


\d   Matches any decimal digit, this is equivalent
     to the set class [0-9].
\D   Matches any non-digit character.
\s   Matches any whitespace character.
\S   Matches any non-whitespace character
\w   Matches any alphanumeric character, this is equivalent to the class [a-zA-Z0-9_].
\W   Matches any non-alphanumeric character
'''

'\n\\   Used to drop the special meaning of character\n    following it (discussed below)\n[]  Represent a character class\n^   Matches the beginning\n$   Matches the end\n.   Matches any character except newline\n?   Matches zero or one occurrence.\n|   Means OR (Matches with any of the characters\n    separated by it.\n*   Any number of occurrences (including 0 occurrences)\n+   One ore more occurrences\n{}  Indicate number of occurrences of a preceding RE \n    to match.\n()  Enclose a group of REs\n\n\n\\d   Matches any decimal digit, this is equivalent\n     to the set class [0-9].\n\\D   Matches any non-digit character.\n\\s   Matches any whitespace character.\n\\S   Matches any non-whitespace character\n\\w   Matches any alphanumeric character, this is equivalent to the class [a-zA-Z0-9_].\n\\W   Matches any non-alphanumeric character\n'

In [10]:
import re 

#### Find lowercase letters in a string

In [37]:
# Sample sentence to search.
test_str = "Aye, said Mr. Gibenson Stark"

# [a-z] is a character class written as a range: it matches any single
# lowercase ASCII letter, just as if all of them were listed ([abc...xyz]).
# re.compile() turns the pattern into a reusable pattern object.
p = re.compile('[a-z]')

# findall() scans the string left to right and returns every non-overlapping
# match in a list; with this pattern each match is one character.
lowercase_letters = p.findall(test_str)
print(lowercase_letters)

['y', 'e', 's', 'a', 'i', 'd', 'r', 'i', 'b', 'e', 'n', 's', 'o', 'n', 't', 'a', 'r', 'k']


#### Find capital letters in a string

In [38]:
# [A-Z] is a range character class: it matches any single uppercase ASCII letter.
p = re.compile('[A-Z]')

# Collect every capital letter found in the sample sentence, then show the list.
capital_letters = p.findall(test_str)
print(capital_letters)

['A', 'M', 'G', 'S']


#### Find numbers from alphanumeric strings 

In [36]:
alpha_num = 'a1b2c3'

# [0-9] covers every ASCII digit. The narrower range [1-9] would silently
# drop any '0' that appears in the input.
p = re.compile('[0-9]')

# One list entry per digit found, in order of appearance.
print(p.findall(alpha_num))

['1', '2', '3']


In [39]:
# \d matches a single decimal digit. With the re.ASCII flag that is exactly
# [0-9]; str patterns without the flag also accept other Unicode digits.
# The pattern is a raw string so the backslash reaches the regex engine as-is
# instead of being parsed as an (unrecognized) string escape.

p = re.compile(r'\d')
print(p.findall("I went to him at 11 A.M. on 4th July 1886"))

['1', '1', '4', '1', '8', '8', '6']


In [40]:
# \d+ matches a run of one or more consecutive digits, so each number in the
# text comes back as a single match rather than digit by digit.
# Raw string: keeps '\d' from being parsed as an (unrecognized) string escape.

p = re.compile(r'\d+')
print(p.findall("I went to him at 11 A.M. on 4th July 1886"))

['11', '4', '1886']


In [41]:
# \w matches a single "word" character: a letter, a digit or the underscore.
# With re.ASCII that is exactly [a-zA-Z0-9_]; str patterns are Unicode-aware by default.
# All patterns below are raw strings so the backslash is not parsed as a string escape.
p = re.compile(r'\w')
print(p.findall("He said * in some_lang."))

# \w+ matches a run of one or more word characters, i.e. whole words.
p = re.compile(r'\w+')
print(p.findall("I went to him at 11 A.M., he said *** in some_language."))

# \W is the complement of \w: any single character that is not a word character.
p = re.compile(r'\W')
print(p.findall("he said *** in some_language."))

['H', 'e', 's', 'a', 'i', 'd', 'i', 'n', 's', 'o', 'm', 'e', '_', 'l', 'a', 'n', 'g']
['I', 'went', 'to', 'him', 'at', '11', 'A', 'M', 'he', 'said', 'in', 'some_language']
[' ', ' ', '*', '*', '*', ' ', ' ', '.']


In [42]:
# '*' is a quantifier: the item right before it may repeat any number of
# times, including zero. 'ab*' therefore means one 'a' followed by zero or
# more 'b's.
p = re.compile('ab*')

sample = "ababbaabbb"
print(p.findall(sample))

['ab', 'abb', 'a', 'abbb']


#### split() Function 

In [43]:
from re import split

# \W+ matches one or more consecutive non-word characters, so every run of
# punctuation and/or whitespace (', ', ' , ', "'", ...) acts as one separator.
# Patterns are raw strings so '\W' and '\d' are not parsed as string escapes.
print(split(r'\W+', 'Words, words , Words'))
print(split(r'\W+', "Word's words Words"))

# ':', ' ' and ',' are all non-word characters, so the text is cut at each of them.
print(split(r'\W+', 'On 12th Jan 2016, at 11:02 AM'))

# \d+ matches one or more consecutive digits: the separators here are
# '12', '2016', '11' and '02', and they are dropped from the result.
print(split(r'\d+', 'On 12th Jan 2016, at 11:02 AM'))

['Words', 'words', 'Words']
['Word', 's', 'words', 'Words']
['On', '12th', 'Jan', '2016', 'at', '11', '02', 'AM']
['On ', 'th Jan ', ', at ', ':', ' AM']


#### Substitution  -  sub() and subn()

In [45]:
# Signature: re.sub(pattern, repl, string, count=0, flags=0)
sentence = 'Subject has Uber booked already'

# Case-insensitive search: 'ub' matches both the "ub" in "Subject" and the
# "Ub" in "Uber", so two replacements are made.
print(re.sub('ub', '~*', sentence, flags=re.IGNORECASE))

# Case-sensitive search (the default): "Ub" in "Uber" does not match, so only
# "Subject" is changed.
print(re.sub('ub', '~*', sentence))

# count=1 caps the number of replacements at one, even though the
# case-insensitive pattern could match twice.
print(re.sub('ub', '~*', sentence, count=1, flags=re.IGNORECASE))

# The r prefix makes the pattern a raw string literal, so '\s' reaches the
# regex engine untouched. \s matches one whitespace character, so "and"
# (in any letter case, because of IGNORECASE) is only replaced when it has
# whitespace on both sides.
print(re.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=re.IGNORECASE))

S~*ject has ~*er booked already
S~*ject has Uber booked already
S~*ject has Uber booked already
Baked Beans & Spam


In [48]:
# subn() performs the same substitution as sub(), but instead of just the new
# string it returns a 2-tuple: (new_string, number_of_replacements_made).
sentence = 'Subject has Uber booked already'

# Case-sensitive: only the "ub" in "Subject" is replaced.
print(re.subn('ub', '~*', sentence))

# Case-insensitive: "Ub" in "Uber" is replaced as well.
t = re.subn('ub', '~*', sentence, flags=re.IGNORECASE)
print(t, len(t), sep='\n')

# The first tuple element is the string that sub() would have returned.
print(t[0])

('S~*ject has Uber booked already', 1)
('S~*ject has ~*er booked already', 2)
2
S~*ject has ~*er booked already


#### escape() 

In [49]:
# re.escape() puts a backslash in front of characters that could have a
# special meaning in a regular expression, so the result matches the input
# text literally. (Python versions before 3.7 escaped every character other
# than ASCII letters, digits and '_'.)
# 1st string: only the spaces are escaped.
# 2nd string: the spaces, '[', '-', ']', '^' and the tab are escaped. Note that
# "\t" in a normal (non-raw) string literal is a single tab character, not a
# backslash followed by 't'.
for text in (
    "This is Awseome even 1 AM",
    "I Asked what is this [a-9], he said \t ^WoW",
):
    print(re.escape(text))

This\ is\ Awseome\ even\ 1\ AM
I\ Asked\ what\ is\ this\ \[a\-9\],\ he\ said\ \	\ \^WoW
