## Intro to Regex for string manipulation


In [36]:
# imports 
import re #regex

In [37]:
# re.match tries the pattern ('abc') against the beginning of the string ('abcdef')
re.match(pattern='abc', string='abcdef')

<_sre.SRE_Match object; span=(0, 3), match='abc'>

In [38]:
# re.match is anchored at the start of the string, so a pattern that only
# occurs later ('bcd') gives None and the cell shows no output
re.match(pattern='bcd', string='abcdef')

In [39]:
# re.search scans through the string and returns the first place the pattern fits
re.search(pattern='bcd', string='abcdef')

<_sre.SRE_Match object; span=(1, 4), match='bcd'>

In [40]:
# \w+ matches a run of word characters; re.match is anchored at the start of
# the string, so only the first word comes back.
# The pattern is a raw string (r'...') so the backslash reaches the regex
# engine as written instead of relying on Python tolerating the unknown
# escape '\w' (which raises a warning on current Python versions).
word_regex = r'\w+'
re.match(word_regex, 'hi there!')

<_sre.SRE_Match object; span=(0, 2), match='hi'>

In [41]:
# other regex matching patterns
# (kept in a raw string so the backslashes are stored exactly as typed and
#  Python does not warn about invalid escape sequences such as '\w' or '\d')
r'''

\            (escape character - starts a special sequence, or makes the next character literal)
\w+          (one or more word characters)
\d           (a digit)
\D           (a non-digit)
\s           (a whitespace character)
\S           (a non-whitespace character)
.*           (wildcard - any characters except newline, zero or more)
+            (one or more of the preceding item, greedy)
*            (zero or more of the preceding item, greedy)
[]           (create an explicit character set)
[a-z]        (range from a to z lower case)
[A-Z]        (range from A to Z upper case)
()           (define a group)
|            (or)
(\w+|\d+)    (words or digits)
(\s+|,)      (spaces or a comma)
[A-Za-z\-\.] (A to Z, a to z, -, .)


'''

'\n\\w+    (for words)\n\\d     (for digits)\n\\D     (no digits)\n\\s     (for spaces)\n\\S     (no spaces)\n.*     (wildcard)\n+      (greedy match)\n*      (greedy match)\n[a-z]  (groups lower case)\n'

In [42]:
# split on runs of whitespace; the returned pieces contain no spaces
# (raw string so '\s' is passed to the regex engine untouched)
re.split(r'\s+', 'Split on spaces')

['Split', 'on', 'spaces']

In [43]:
# let's find all the words
# note: the apostrophe is not a word character, so "Let's" comes back as
# 'Let' and 's'
my_string = "Let's write RegEx!"
re.findall(r'\w+', my_string)

['Let', 's', 'write', 'RegEx']

In [44]:
# a longer sample: five sentences, each followed by two spaces
my_string = (
    "Let's write RegEx!  "
    "Won't that be fun?  "
    "I sure think so.  "
    "Can you find 4 sentences?  "
    "Or perhaps, all 19 words?"
)
print(my_string)

Let's write RegEx!  Won't that be fun?  I sure think so.  Can you find 4 sentences?  Or perhaps, all 19 words?


In [45]:
# A raw string (r"...") stops Python from interpreting backslashes as its own
# escape sequences, so the pattern reaches the regex engine exactly as typed.

# character set holding the punctuation marks that end a sentence
sentence_endings = r"[.?!]"

# break my_string apart at every sentence-ending mark
# (the surrounding spaces stay attached, and the final mark leaves an empty
#  string at the end of the list)
print(re.split(pattern=sentence_endings, string=my_string))

["Let's write RegEx", "  Won't that be fun", '  I sure think so', '  Can you find 4 sentences', '  Or perhaps, all 19 words', '']


In [46]:
# Find the capitalized words in my_string and print the result.
# The pattern needs an upper-case letter followed by at least one more word
# character, so the single-letter "I" is not matched, and words such as
# "Let's" / "Won't" are cut at the apostrophe.
capitalized_words = r"[A-Z]\w+"
print(re.findall(pattern=capitalized_words, string=my_string))

['Let', 'RegEx', 'Won', 'Can', 'Or']


In [47]:
# Break my_string apart on runs of whitespace and print the pieces
# (punctuation stays attached to the neighbouring word)
spaces = r"\s+"
print(re.split(pattern=spaces, string=my_string))

["Let's", 'write', 'RegEx!', "Won't", 'that', 'be', 'fun?', 'I', 'sure', 'think', 'so.', 'Can', 'you', 'find', '4', 'sentences?', 'Or', 'perhaps,', 'all', '19', 'words?']


In [48]:
# Collect every run of digits in my_string and print the result
# (\d+ keeps multi-digit numbers such as 19 together)
digits = r"\d+"
print(re.findall(pattern=digits, string=my_string))

['4', '19']


## Tokenization
### Turning string or doc into tokens (smaller parts)

In [49]:
# nltk: natural language toolkit
#
# word_tokenize needs the "punkt" tokenizer models. If they are missing,
# download them once, either from a terminal:
#
#     python -m nltk.downloader punkt
#
# or from a Python session:
#
#     import nltk
#     nltk.download('punkt')
#
# Calling nltk.download() with no arguments opens the interactive downloader;
# there, 'punkt' is listed on the "Models" tab.
# NOTE: newer NLTK releases may ask for 'punkt_tab' instead - if a LookupError
# names a different resource, download the one it names.
from nltk.tokenize import word_tokenize

In [50]:
# word_tokenize splits the sentence into its word tokens;
# punctuation such as '!' becomes a token of its own
word_tokenize("Hi there!")

['Hi', 'there', '!']

In [51]:
'''
sent_tokenize   (tokenize document into sentences)
regexp_tokenize (tokenize string or doc based on regex)
TweetTokenizer  (just for tweets, hashtags etc)
'''

'\nsent_tokenize   (tokenize document into sentences)\nregexp_tokenize (tokenize string or doc based on regex)\nTweetTokenizer  (just for tweets, hashtags etc)\n'

In [52]:
# scene one of Monty Python to play around with :)
scene_one = "SCENE 1: [wind] [clop clop clop] \nKING ARTHUR: Whoa there!  [clop clop clop] \nSOLDIER #1: Halt!  Who goes there?\nARTHUR: It is I, Arthur, son of Uther Pendragon, from the castle of Camelot.  King of the Britons, defeator of the Saxons, sovereign of all England!\nSOLDIER #1: Pull the other one!\nARTHUR: I am, ...  and this is my trusty servant Patsy.  We have ridden the length and breadth of the land in search of knights who will join me in my court at Camelot.  I must speak with your lord and master.\nSOLDIER #1: What?  Ridden on a horse?\nARTHUR: Yes!\nSOLDIER #1: You're using coconuts!\nARTHUR: What?\nSOLDIER #1: You've got two empty halves of coconut and you're bangin' 'em together.\nARTHUR: So?  We have ridden since the snows of winter covered this land, through the kingdom of Mercea, through--\nSOLDIER #1: Where'd you get the coconuts?\nARTHUR: We found them.\nSOLDIER #1: Found them?  In Mercea?  The coconut's tropical!\nARTHUR: What do you mean?\nSOLDIER #1: Well, this is a temperate zone.\nARTHUR: The swallow may fly south with the sun or the house martin or the plover may seek warmer climes in winter, yet these are not strangers to our land?\nSOLDIER #1: Are you suggesting coconuts migrate?\nARTHUR: Not at all.  They could be carried.\nSOLDIER #1: What?  A swallow carrying a coconut?\nARTHUR: It could grip it by the husk!\nSOLDIER #1: It's not a question of where he grips it!  It's a simple question of weight ratios!  A five ounce bird could not carry a one pound coconut.\nARTHUR: Well, it doesn't matter.  Will you go and tell your master that Arthur from the Court of Camelot is here.\nSOLDIER #1: Listen.  In order to maintain air-speed velocity, a swallow needs to beat its wings forty-three times every second, right?\nARTHUR: Please!\nSOLDIER #1: Am I right?\nARTHUR: I'm not interested!\nSOLDIER #2: It could be carried by an African swallow!\nSOLDIER #1: Oh, yeah, an African swallow maybe, but not a European swallow.  
That's my point.\nSOLDIER #2: Oh, yeah, I agree with that.\nARTHUR: Will you ask your master if he wants to join my court at Camelot?!\nSOLDIER #1: But then of course a-- African swallows are non-migratory.\nSOLDIER #2: Oh, yeah...\nSOLDIER #1: So they couldn't bring a coconut back anyway...  [clop clop clop] \nSOLDIER #2: Wait a minute!  Supposing two swallows carried it together?\nSOLDIER #1: No, they'd have to have it on a line.\nSOLDIER #2: Well, simple!  They'd just use a strand of creeper!\nSOLDIER #1: What, held under the dorsal guiding feathers?\nSOLDIER #2: Well, why not?\n"
print(scene_one)

SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Whoa there!  [clop clop clop] 
SOLDIER #1: Halt!  Who goes there?
ARTHUR: It is I, Arthur, son of Uther Pendragon, from the castle of Camelot.  King of the Britons, defeator of the Saxons, sovereign of all England!
SOLDIER #1: Pull the other one!
ARTHUR: I am, ...  and this is my trusty servant Patsy.  We have ridden the length and breadth of the land in search of knights who will join me in my court at Camelot.  I must speak with your lord and master.
SOLDIER #1: What?  Ridden on a horse?
ARTHUR: Yes!
SOLDIER #1: You're using coconuts!
ARTHUR: What?
SOLDIER #1: You've got two empty halves of coconut and you're bangin' 'em together.
ARTHUR: So?  We have ridden since the snows of winter covered this land, through the kingdom of Mercea, through--
SOLDIER #1: Where'd you get the coconuts?
ARTHUR: We found them.
SOLDIER #1: Found them?  In Mercea?  The coconut's tropical!
ARTHUR: What do you mean?
SOLDIER #1: Well, this is a temperate zone.
AR

In [53]:
# import the sentence tokenizer
from nltk.tokenize import sent_tokenize

In [54]:
# split the scene into a list of sentences
# (line breaks alone do not end a sentence, so some items span two lines)
sent_tokenize(scene_one)

['SCENE 1: [wind] [clop clop clop] \nKING ARTHUR: Whoa there!',
 '[clop clop clop] \nSOLDIER #1: Halt!',
 'Who goes there?',
 'ARTHUR: It is I, Arthur, son of Uther Pendragon, from the castle of Camelot.',
 'King of the Britons, defeator of the Saxons, sovereign of all England!',
 'SOLDIER #1: Pull the other one!',
 'ARTHUR: I am, ...  and this is my trusty servant Patsy.',
 'We have ridden the length and breadth of the land in search of knights who will join me in my court at Camelot.',
 'I must speak with your lord and master.',
 'SOLDIER #1: What?',
 'Ridden on a horse?',
 'ARTHUR: Yes!',
 "SOLDIER #1: You're using coconuts!",
 'ARTHUR: What?',
 "SOLDIER #1: You've got two empty halves of coconut and you're bangin' 'em together.",
 'ARTHUR: So?',
 "We have ridden since the snows of winter covered this land, through the kingdom of Mercea, through--\nSOLDIER #1: Where'd you get the coconuts?",
 'ARTHUR: We found them.',
 'SOLDIER #1: Found them?',
 'In Mercea?',
 "The coconut's tropic

In [55]:
# get the fourth sentence (index 3, because list indices start at 0)
sent_tokenize(scene_one)[3]

'ARTHUR: It is I, Arthur, son of Uther Pendragon, from the castle of Camelot.'

In [56]:
# word tokenize the fourth sentence; punctuation marks become their own tokens
word_tokenize(sent_tokenize(scene_one)[3])

['ARTHUR',
 ':',
 'It',
 'is',
 'I',
 ',',
 'Arthur',
 ',',
 'son',
 'of',
 'Uther',
 'Pendragon',
 ',',
 'from',
 'the',
 'castle',
 'of',
 'Camelot',
 '.']

In [57]:
# word tokenize the entire scene (the result is one flat list; line breaks
# in the text do not show up as tokens)
word_tokenize(scene_one)

['SCENE',
 '1',
 ':',
 '[',
 'wind',
 ']',
 '[',
 'clop',
 'clop',
 'clop',
 ']',
 'KING',
 'ARTHUR',
 ':',
 'Whoa',
 'there',
 '!',
 '[',
 'clop',
 'clop',
 'clop',
 ']',
 'SOLDIER',
 '#',
 '1',
 ':',
 'Halt',
 '!',
 'Who',
 'goes',
 'there',
 '?',
 'ARTHUR',
 ':',
 'It',
 'is',
 'I',
 ',',
 'Arthur',
 ',',
 'son',
 'of',
 'Uther',
 'Pendragon',
 ',',
 'from',
 'the',
 'castle',
 'of',
 'Camelot',
 '.',
 'King',
 'of',
 'the',
 'Britons',
 ',',
 'defeator',
 'of',
 'the',
 'Saxons',
 ',',
 'sovereign',
 'of',
 'all',
 'England',
 '!',
 'SOLDIER',
 '#',
 '1',
 ':',
 'Pull',
 'the',
 'other',
 'one',
 '!',
 'ARTHUR',
 ':',
 'I',
 'am',
 ',',
 '...',
 'and',
 'this',
 'is',
 'my',
 'trusty',
 'servant',
 'Patsy',
 '.',
 'We',
 'have',
 'ridden',
 'the',
 'length',
 'and',
 'breadth',
 'of',
 'the',
 'land',
 'in',
 'search',
 'of',
 'knights',
 'who',
 'will',
 'join',
 'me',
 'in',
 'my',
 'court',
 'at',
 'Camelot',
 '.',
 'I',
 'must',
 'speak',
 'with',
 'your',
 'lord',
 'and',
 'ma

In [58]:
# reduce it to the unique tokens by using set()
set(word_tokenize(scene_one))

{'!',
 '#',
 "'",
 "'d",
 "'em",
 "'m",
 "'re",
 "'s",
 "'ve",
 ',',
 '--',
 '.',
 '...',
 '1',
 '2',
 ':',
 '?',
 'A',
 'ARTHUR',
 'African',
 'Am',
 'Are',
 'Arthur',
 'Britons',
 'But',
 'Camelot',
 'Court',
 'England',
 'European',
 'Found',
 'Halt',
 'I',
 'In',
 'It',
 'KING',
 'King',
 'Listen',
 'Mercea',
 'No',
 'Not',
 'Oh',
 'Patsy',
 'Pendragon',
 'Please',
 'Pull',
 'Ridden',
 'SCENE',
 'SOLDIER',
 'Saxons',
 'So',
 'Supposing',
 'That',
 'The',
 'They',
 'Uther',
 'Wait',
 'We',
 'Well',
 'What',
 'Where',
 'Who',
 'Whoa',
 'Will',
 'Yes',
 'You',
 '[',
 ']',
 'a',
 'agree',
 'air-speed',
 'all',
 'am',
 'an',
 'and',
 'anyway',
 'are',
 'ask',
 'at',
 'back',
 'bangin',
 'be',
 'beat',
 'bird',
 'breadth',
 'bring',
 'but',
 'by',
 'carried',
 'carry',
 'carrying',
 'castle',
 'climes',
 'clop',
 'coconut',
 'coconuts',
 'could',
 'course',
 'court',
 'covered',
 'creeper',
 'defeator',
 'do',
 'does',
 'dorsal',
 'empty',
 'every',
 'feathers',
 'five',
 'fly',
 'forty-

In [59]:
# compare the total number of tokens with the number of distinct tokens;
# the scene is tokenized once and the result reused for both counts
scene_tokens = word_tokenize(scene_one)
print(len(scene_tokens))
print(len(set(scene_tokens)))

609
226


In [69]:
# search for the first occurrence of the word "coconuts",
# then print the positions where the match starts and ends
cocoloco = re.search(pattern="coconuts", string=scene_one)
print(*cocoloco.span())

580 588


## Some Advanced Tokenization with Regex

In [70]:
# search for the first stretch of text enclosed in square brackets
#   \[   literal opening bracket
#   .*   wildcard: any characters (greedy, so it runs on to the last "]" on the line)
#   \]   literal closing bracket
# because .* is greedy the match covers both bracketed groups on the first line;
# a non-greedy .*? would stop at the first "]" instead

re.search(pattern=r"\[.*\]", string=scene_one)

<_sre.SRE_Match object; span=(9, 32), match='[wind] [clop clop clop]'>

In [71]:
# match every run of digits or word characters, which leaves the punctuation out
#   \d+   digits
#   |     or
#   \w+   word characters

re.findall(pattern=r"\d+|\w+", string='He!, has 11 cats.')

['He', 'has', '11', 'cats']

In [72]:
# without a quantifier the character set matches exactly one character,
# so the match ends after the first letter
re.match(pattern=r"[a-z0-9 ]", string='holy cow 11, purple nuts')

<_sre.SRE_Match object; span=(0, 1), match='h'>

In [74]:
# adding + lets the match keep going until it reaches the comma,
# because the comma is not part of the character set
#   (notice the space after the 9 - spaces are included in the set)
re.match(pattern=r"[a-z0-9 ]+", string='holy cow 11, purple nuts')

<_sre.SRE_Match object; span=(0, 11), match='holy cow 11'>

In [75]:
# with the comma added to the character set, the whole string is matched
re.match(pattern=r"[a-z0-9, ]+", string='holy cow 11, purple nuts')

<_sre.SRE_Match object; span=(0, 24), match='holy cow 11, purple nuts'>