### Basics

In [1]:
import re

In [3]:
text = 'hello, world'  # sample string searched in the next cell

In [8]:
re.search('hello', text)  # returns a Match object for the first occurrence (None if there is none); span is the (start, end) index range of the match

<re.Match object; span=(0, 5), match='hello'>

In [16]:
# Sample text containing 'Hello' twice, used below to find all matches and return them as a list
text = r'Hello, world and Hello, all of you'

In [17]:
re.findall('Hello', text)  # every non-overlapping match, returned as a list of strings

['Hello', 'Hello']

In [41]:
pattern = re.compile(r'you')  # compiled pattern that matches exactly the literal character sequence "you"

In [42]:
matches = pattern.finditer(text)  # iterator of Match objects, one per occurrence
for m in matches:
    print(m)  # the span shows where "you" was found in the string: it starts at index 31 and ends before index 34

<re.Match object; span=(31, 34), match='you'>


In [43]:
# Slicing the text with the span reported by the match gives back exactly the matched word
print(text[31:34])

you


In [44]:
# \W matches any single non-word character, i.e. anything that is not a letter, a digit or an underscore
pattern = re.compile(r'\W')

In [45]:
matches = pattern.finditer(text)
for m in matches:
    print(m)  # for this text the non-word characters are the commas and the spaces

<re.Match object; span=(5, 6), match=','>
<re.Match object; span=(6, 7), match=' '>
<re.Match object; span=(12, 13), match=' '>
<re.Match object; span=(16, 17), match=' '>
<re.Match object; span=(22, 23), match=','>
<re.Match object; span=(23, 24), match=' '>
<re.Match object; span=(27, 28), match=' '>
<re.Match object; span=(30, 31), match=' '>


In [49]:
# Find "H", any three characters, and then "o". The three characters sit inside a capture
# group (...), so findall returns only the captured part of each match:
# the result is a list of those three-character strings.

re.findall('H(...)o', text)

# re.finditer scans the same way but yields Match objects lazily instead of building a list of strings

['ell', 'ell']

In [51]:
"""
Substituting text is done with re.sub, which takes a regexp string, a replacement string, and the text in which to search. 
It returns the transformed string, leaving the original string untouched. 
For example, the following replaces all vowels in a string with @:
"""
re.sub('[aeiou]','@', text)  # arguments: pattern to replace, replacement, text to search; the class lists only lowercase vowels

'H@ll@, w@rld @nd H@ll@, @ll @f y@@'

#### Phone numbers

In [56]:
# Let's work with phone numbers
phones = '212-333-1234 is the landline while 347.444.9876 is the cell phone number.'

# The unescaped dot is not a literal '.': it matches any single character (except a newline),
# so numbers separated by '-' and by '.' are both accepted.
phone_pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
phone_matches = phone_pattern.finditer(phones)  # one-shot iterator, consumed by the loop in the next cell

In [57]:
for p in phone_matches:
    print(p)  # both numbers are printed, whichever separator they use

<re.Match object; span=(0, 12), match='212-333-1234'>
<re.Match object; span=(35, 47), match='347.444.9876'>


In [88]:
# Now imagine a new phone number, 201*765*1122, is added. With a dot in the pattern
# this number would be matched as well. To accept only dashes and dots as separators,
# replace the dot with a character class listing the allowed symbols.
# The text is double-quoted so the apostrophe in "shouldn't" needs no escaping:
# writing '' inside single quotes ends one literal and starts another, and the two
# adjacent literals are concatenated with the apostrophe silently dropped.
phones1 = "212-333-1234 is the landline while 347.444.9876 is the cell phone number. And this 201*765*1122 shouldn't be chosen"

phones1_pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')  # [-.] matches exactly one '-' or one '.'

In [89]:
phone_matches1 = phones1_pattern.finditer(phones1)
for ph in phone_matches1:
    print(ph)  # 201*765*1122 is not printed because '*' is not in the [-.] character class

<re.Match object; span=(0, 12), match='212-333-1234'>
<re.Match object; span=(35, 47), match='347.444.9876'>


In [91]:
# Now let's match only numbers whose first three digits are 312 or 313
phones2 = '212-333-1234 is the landline while 347.444.9876 is the cell phone number. I want this 312-999-4321 and 313.888.6578'

phones2_pattern=re.compile(r'31[23][-.]\d\d\d[-.]\d\d\d\d')  # '31' followed by 2 or 3, then '-' or '.' between the digit groups

In [92]:
phone_matches2 = phones2_pattern.finditer(phones2)
for ph in phone_matches2:
    print(ph)  # only the numbers beginning with 312 or 313 are printed

<re.Match object; span=(86, 98), match='312-999-4321'>
<re.Match object; span=(103, 115), match='313.888.6578'>


In [107]:
# Rewrite phones1_pattern with quantifiers: \d{3} means exactly three digits, \d{4} exactly four
phones3_pattern = re.compile(r'\d{3}[-.]\d{3}[-.]\d{4}')

In [108]:
phone_matches3 = phones3_pattern.finditer(phones1)
for ph in phone_matches3:
    print(ph)  # the quantifier form finds the same two numbers in phones1

<re.Match object; span=(0, 12), match='212-333-1234'>
<re.Match object; span=(35, 47), match='347.444.9876'>


#### Negations

In [103]:
neg = 'mat pat sat bat tat and some other non-relevant stuff'
# Match "<char>at" where <char> is any single character except 'b':
# a ^ at the start of a character class negates it.
neg_pattern = re.compile(r'[^b]at')

In [104]:
neg_matches = neg_pattern.finditer(neg)
for n in neg_matches:
    print(n)  # 'bat' is the only "-at" word that is skipped

<re.Match object; span=(0, 3), match='mat'>
<re.Match object; span=(4, 7), match='pat'>
<re.Match object; span=(8, 11), match='sat'>
<re.Match object; span=(16, 19), match='tat'>


### Working with the dictionary from the .txt file

In [58]:
with open('words.txt','r') as f:
    contents = f.read()  # reads the whole file into memory as one string; the relative path is resolved against the current working directory

In [100]:
# The user types a word to look up in the dictionary. The input is compiled as a regex
# in the following cells, so patterns are accepted as well. Try ta.{4}
phrase = input('Type a word you are looking for')


Type a word you are looking for ambulation


In [86]:
ro = re.compile(phrase)  # the user's input compiled as a regex; used by the line-by-line search in the last cell

In [101]:
word_pattern = re.compile(phrase)  # compiled from the same phrase as ro; used to search the whole contents string at once

In [102]:
word_matches = word_pattern.finditer(contents)
for w in word_matches:
    print(w)  # each span is a pair of character offsets into the whole contents string

<re.Match object; span=(65658, 65668), match='ambulation'>
<re.Match object; span=(95325, 95335), match='ambulation'>
<re.Match object; span=(374178, 374188), match='ambulation'>
<re.Match object; span=(508153, 508163), match='ambulation'>
<re.Match object; span=(771559, 771569), match='ambulation'>
<re.Match object; span=(1298515, 1298525), match='ambulation'>
<re.Match object; span=(1347637, 1347647), match='ambulation'>
<re.Match object; span=(1490569, 1490579), match='ambulation'>
<re.Match object; span=(1609494, 1609504), match='ambulation'>
<re.Match object; span=(1949657, 1949667), match='ambulation'>


In [None]:
# Same thing, but without loading everything into memory: iterate over the file
# one line at a time. The with-block closes the file handle once the loop finishes
# (a bare open() in the for statement leaves it open).
with open('words.txt') as words_file:
    for line in words_file:
        m = ro.search(line)
        if m:
            # Each line still carries its trailing newline; strip it so print
            # does not emit an empty line after every hit.
            print(line.rstrip('\n'))