In [11]:
import re

"""
HTTP Addresses -->   ^http:\/\/\S+(\/\S+)*(\/)?$
Full Numeric Strings -->   ^[0-9]+$ or ^\d+$
Image Filenames -->   ^\w+\.(gif|png|jpg|jpeg)$
Email Addresses -->   ^\w+([.-]?\w+)*@\w+([.-]?\w+)*(\.\w{2,3})+$
Swapping Words using Parenthesized Back-References -->  ^(\S+)\s+(\S+)$ and $2 $1

\d      any number (a digit)
\D      anything but a number (a non-digit)
\s      space (tab,space,newline etc.)
\S      anything but a space
\w      letters ( Match alphanumeric character, including "_")
\W      matches any non-word(non-alphanumeric) character excluding "_"
\b      any character except for new line

\t      Matches tab.
\n      Matches newline.
\r      Matches return.
\d      Matches decimal digit 0-9.


^     Caret: Starts with::"^hello"
.     Any character (except newline character)
$     Ends with::"world$"
*     Zero or more occurrences::"aix*"
+     One or more occurrences::"aix+"
|     Either or::"falls|stays"
()    parentheses: Capture and group
{}    Exactly the specified number of occurrences::"al{2}"
[]    A set of characters::"[a-m]"

+ -- 1 or more occurrences of the pattern to its left, e.g. 'i+' = one or more i
* -- 0 or more occurrences of the pattern to its left
? -- match 0 or 1 occurrences of the pattern to its left

\d      any number (a digit)
\D      anything but a number (a non-digit)
\s      space (tab,space,newline etc.)
\S      anything but a space
\w      letters ( Match alphanumeric character, including "_")
\W      matches any non-word(non-alphanumeric) character excluding "_"
\b      any character except for new line

\t      Matches tab.
\n      Matches newline.
\r      Matches return.
\d      Matches decimal digit 0-9.


^     Caret: Starts with::"^hello"
.     Any character (except newline character)
$     Ends with::"world$"
*     Zero or more occurrences::"aix*"
+     One or more occurrences::"aix+"
|     Either or::"falls|stays"
()    parentheses: Capture and group
{}    Exactly the specified number of occurrences::"al{2}"
[]    A set of characters::"[a-m]"

+ -- 1 or more occurrences of the pattern to its left, e.g. 'i+' = one or more i
* -- 0 or more occurrences of the pattern to its left
? -- match 0 or 1 occurrences of the pattern to its left
"""

'\nHTTP Addresses -->   ^http:\\/\\/\\S+(\\/\\S+)*(\\/)?$\nFull Numeric Strings -->   ^[0-9]+$ or ^\\d+$\nImage Filenames -->   ^\\w+\\.(gif|png|jpg|jpeg)$\nEmail Addresses -->   ^\\w+([.-]?\\w+)*@\\w+([.-]?\\w+)*(\\.\\w{2,3})+$\nSwapping Words using Parenthesized Back-References -->  ^(\\S+)\\s+(\\S+)$ and $2 $1\n\n\\d      any number (a digit)\n\\D      anything but a number (a non-digit)\n\\s      space (tab,space,newline etc.)\n\\S      anything but a space\n\\w      letters ( Match alphanumeric character, including "_")\n\\W      matches any non-word(non-alphanumeric) character excluding "_"\n\x08      any character except for new line\n\n\t      Matches tab.\n\n      Matches newline.\n\r      Matches return.\n\\d      Matches decimal digit 0-9.\n\n\n^     Caret: Starts with::"^hello"\n.     Any character (except newline character)\n$     Ends with::"world$"\n*     Zero or more occurrences::"aix*"\n+     One or more occurrences::"aix+"\n|     Either or::"falls|stays"\n()    parent

In [2]:
wood = 'How much wood would a woodchuck chuck if a woodchuck could chuck wood?'

print(re.findall(r'wo\w+', wood))
print(re.findall(r'o+', wood))
print(re.findall(r'e+', wood))

['wood', 'would', 'woodchuck', 'woodchuck', 'wood']
['o', 'oo', 'o', 'oo', 'oo', 'o', 'oo']
[]


re.findall(): It returns all matched string portions as a list. If there are no matches, it will simply return an empty list.

In [3]:
foo = 'This and that and those'

print(re.findall(r'th\w+', foo))
print(re.findall(r'th\w+', foo, re.IGNORECASE))    # case is ignored while matching

['that', 'those']
['This', 'that', 'those']


What if you want to replace all matching portions with something else? It can be done using the *re.sub()* method.

In [4]:
wood = 'How much wood would a woodchuck chuck if a woodchuck could chuck wood?' 

print(re.sub(r'[aeiou]+', '-', wood))    # 3 args: regex, replacer string, target string
# Removing the matching portions: just make the "replacer" string an empty string ''.
print(re.sub(r'[aeiou]+', '', wood))    # substitute with an empty string

H-w m-ch w-d w-ld - w-dch-ck ch-ck -f - w-dch-ck c-ld ch-ck w-d?
Hw mch wd wld  wdchck chck f  wdchck cld chck wd?


In [5]:
myre = re.compile(r'\w+ou\w+')     # compiling myre as a reg ex
print(myre.findall(wood))          # calling .findall() directly on myre

print(myre.findall('Colorless green ideas sleep furiously'))
print(myre.findall('The thirty-three thieves thought that they thrilled the throne throughout Thursday.'))

['would', 'could']
['furiously']
['thought', 'throughout']


```.span() returns a tuple containing the start-, and end positions of the match.
.string returns the string passed into the function
.group() returns the part of the string where there was a match```

In [6]:
str = "The rain in Spain"
print(re.search("ai", str))
print(re.search(r"\bS\w+", str).span())
print(re.search(r"\bS\w+", str).string)
print(re.search(r"\bS\w+", str).group())

<re.Match object; span=(5, 7), match='ai'>
(12, 17)
The rain in Spain
Spain


In this context, *re.search()* is a good alternative. This method only finds the first match and then quits. If a match is found, it returns a "match object". But if not, it returns... nothing.

In [7]:
print(re.search(r'e+', 'Colorless green ideas sleep furiously').group())

"""
>>> re.search(r'e+', wood).group()
Traceback (most recent call last):
  File "<pyshell#25>", line 1, in <module>
    re.search(r'e+', wood).group()
AttributeError: 'NoneType' object has no attribute 'group'
"""

str = 'an example word:cat!!'
match = re.search(r'word:www', str)
if match:
    print('found', match.group()) ## 'found word:cat'
else:
    print('did not find')

e
did not find


In [None]:
>>> f = open('D:\\Lab\\ling1330\\bible-kjv.txt')
>>> blines = f.readlines()
>>> f.close()
>>> smite = re.compile(r'sm(i|o)te\w*')
>>> for b in blines:
        matchobj = smite.search(b)
        if matchobj:         # True if matchobj is not "nothing"
            print(matchobj.group(), '-', b, end='')

smite - again smite any more every thing living, as I have done.
smote - were with him, and smote the Rephaims in Ashteroth Karnaim, and the
smite - hand of Esau: for I fear him, lest he will come and smite me, and the
smote - 36:35 And Husham died, and Hadad the son of Bedad, who smote Midian in
smitest - Wherefore smitest thou thy fellow?  2:14 And he said, Who made thee a
smite - 3:20 And I will stretch out my hand, and smite Egypt with all my
smite - behold, I will smite with the rod that is in mine hand upon the waters
smote - up the rod, and smote the waters that were in the river, in the sight
smite - 8:2 And if thou refuse to let them go, behold, I will smite all thy
smite - rod, and smite the dust of the land, that it may become lice
smotest - with thee of the elders of Israel; and thy rod, wherewith thou smotest
smite - and thou shalt smite the rock, and there shall come water out of it,
smiteth - 21:12 He that smiteth a man, so that he die, shall be surely put to
smiteth - 21:15 And he that smiteth his father, or his mother, shall be surely
smite - 21:18 And if men strive together, and one smite another with a stone,
...

In [8]:
str = "The rain in Spain"
print(re.split("\s", str))
print(re.split("\s", str, 1))

['The', 'rain', 'in', 'Spain']
['The', 'rain in Spain']


In [9]:
str = 'purple alice-b@google.com monkey dishwasher anand9@google.com'

print(re.findall(r'\w+@\w+', str))
print(re.findall(r'[\w.-]+@[\w.-]+', str))

str = 'purple alice-b@google.com monkey dishwasher'
match = re.search(r'([\w.-]+)@([\w.-]+)', str)
if match:
    print(match.group())   ## 'alice-b@google.com' (the whole match)
    print(match.group(1))  ## 'alice-b' (the username, group 1)
    print(match.group(2))  ## 'google.com' (the host, group 2)


str = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'
print(re.findall(r'([\w\.-]+)@([\w\.-]+)', str))

['b@google', 'anand9@google']
['alice-b@google.com', 'anand9@google.com']
alice-b@google.com
alice-b
google.com
[('alice', 'google.com'), ('bob', 'abc.com')]
