# Chapter 2: Corpus Processing Tools
Author: Pierre Nugues

We use the `regex` module for its better Unicode support.

In [59]:
import regex as re


## Matching one occurrence

#### A first match with `re.search()`

In [60]:
line = 'The aerial acceleration alerted the ace pilot'
match = re.search('ab*c', line)
match      # <regex.Match object; span=(11, 13), match='ac'>


<regex.Match object; span=(11, 13), match='ac'>

#### Getting the match value

In [61]:
match.group()  # ac


'ac'

## Getting all the matches

#### The list of all the strings

In [62]:
match_list = re.findall('ab*c', line)   # ['ac', 'ac']
match_list


['ac', 'ac']

#### The match objects

In [63]:
match_iter = re.finditer('ab*c', line)
list(match_iter)


[<regex.Match object; span=(11, 13), match='ac'>,
 <regex.Match object; span=(36, 38), match='ac'>]

## Interactive match

#### Using the shell (does not work with the notebooks)

In [64]:
import sys

# Echo every line of standard input that contains a match, prefixed
# with an arrow; lines without a match are skipped.
for line in sys.stdin:
    if not re.search('ab*c', line):    # m/ab*c/
        continue
    # Lines read from stdin normally end with their own newline,
    # hence end='' to avoid printing a second one.
    print('-> ' + line, end='')


#### Using IPython ipywidgets

In [65]:
# https://github.com/ipython/ipywidgets
import ipywidgets as widgets
from IPython.display import display

# The input box
text = widgets.Text()
display(text)


def handle_submit(sender):
    """Echo the submitted text if it contains a match of ``ab*c``, then clear it.

    Args:
        sender: the object handed to the callback (ipywidgets passes the
            widget that fired the event). Only its ``value`` attribute
            is read and reset.
    """
    # Use the callback argument rather than the global `text`: later
    # cells rebind `text` to a list and to a string, after which
    # `text.value` would raise AttributeError on every submit.
    if re.search('ab*c', sender.value):
        print('->', sender.value)
    # Empty the input box so that the next entry starts from scratch
    sender.value = ''


# Hitting return fires handle_submit
text.on_submit(handle_submit)


Text(value='')

  text.on_submit(handle_submit)


## Nonprintable characters and modifiers
#### Start of a line

We create a list of multiple strings with `split()`

In [66]:
# text = sys.stdin.read()
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".split('\n')
text


['Sing, O goddess, the anger of Achilles ',
 'son of Peleus, that brought countless ills upon the Achaeans.',
 '']

`split('\n')` leaves a trailing empty string because the text ends with a newline. We strip the string before we split it.

In [67]:
# text = sys.stdin.read()
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip().split('\n')
text


['Sing, O goddess, the anger of Achilles ',
 'son of Peleus, that brought countless ills upon the Achaeans.']

In [68]:
for line in text:
    match = re.search('^s', line)  # m/^s/
    if match:
        print('-> ' + match.group())


-> s


#### The case-insensitive modifier

We did not match `S`. We can make the regex case-insensitive

In [69]:
for line in text:
    match = re.search('^s', line, re.I)  # m/^s/i
    if match:
        print('-> ' + match.group())


-> S
-> s


#### Case insensitive and multiline

The start anchor `^` normally matches only at the very start of a string. With the multiline modifier `re.M`, the position right after each `\n` also counts as a start position.

In [70]:
text = """Sing, O goddess, the anger of Achilles son
of Peleus, that brought countless ills upon the Achaeans.
""".strip()
text


'Sing, O goddess, the anger of Achilles son\nof Peleus, that brought countless ills upon the Achaeans.'

In [71]:
match = re.search('^s', text, re.I | re.M)  # m/^s/im
if match:
    print('-> ' + match.group())


-> S


In [72]:
match = re.search('^o', text, re.I)
if match:
    print('-> ' + match.group())


In [73]:
match = re.search('^o', text, re.I | re.M)
if match:
    print('-> ' + match.group())


-> o


#### Getting all the matches

In [74]:
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip()
text


'Sing, O goddess, the anger of Achilles \nson of Peleus, that brought countless ills upon the Achaeans.'

In [75]:
match_list = re.findall('^s', text, re.I | re.M)
print(match_list)


['S', 's']


#### Getting all the matches with `finditer()`

In [76]:
match_list = re.finditer('^s', text, re.I | re.M)
match_list


<_regex.Scanner at 0x1068047d0>

In [77]:
list(match_list)


[<regex.Match object; span=(0, 1), match='S'>,
 <regex.Match object; span=(40, 41), match='s'>]

## Substitution

#### Global replacement: `s/regex/replacement/g`

In [78]:
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip().split('\n')
text


['Sing, O goddess, the anger of Achilles ',
 'son of Peleus, that brought countless ills upon the Achaeans.']

In [79]:
# For each line containing a match of es+, print it before and after
# the substitution.
for line in text:
    if re.search('es+', line):
        print("Old: " + line)
        # Replaces all the occurrences
        line = re.sub('es+', 'ES', line)
        print("New: " + line)
# s/es+/ES/g


Old: Sing, O goddess, the anger of Achilles 
New: Sing, O goddES, the anger of AchillES 
Old: son of Peleus, that brought countless ills upon the Achaeans.
New: son of Peleus, that brought countlES ills upon the Achaeans.


 
#### Just one replacement: `s/regex/replacement/`

In [80]:
text = """Sing, O goddess, the anger of Achilles 
son of Peleus, that brought countless ills upon the Achaeans.
""".strip().split('\n')
text


['Sing, O goddess, the anger of Achilles ',
 'son of Peleus, that brought countless ills upon the Achaeans.']

In [81]:
# For each line containing a match of es+, print it before and after
# the substitution.
for line in text:
    if re.search('es+', line):
        print("Old: " + line)
        # Replaces only the first occurrence. `count` is passed by
        # keyword so that it cannot be mistaken for the `flags` argument.
        line = re.sub('es+', 'ES', line, count=1)
        print("New: " + line)
# s/es+/ES/


Old: Sing, O goddess, the anger of Achilles 
New: Sing, O goddES, the anger of Achilles 
Old: son of Peleus, that brought countless ills upon the Achaeans.
New: son of Peleus, that brought countlES ills upon the Achaeans.


## Backreferences

In [82]:
line = 'abbbcdeeef'


In [83]:
match = re.search('^(.)(b+)c+', line)


The whole pattern

In [84]:
match.group()


'abbbc'

Equivalent to

In [85]:
match.group(0)


'abbbc'

Backreference 1, `(.)` stored in `\1`

In [86]:
match.group(1)


'a'

Backreference 2, `(b+)` stored in `\2`

In [87]:
match.group(2)


'bbb'

Matching a sequence of three identical characters

In [88]:
match = re.search(r'(.)\1\1', line)
match.group(1)                # 'b'


'b'

 #### Substitutions `s/(.)\1\1/***/g`

In [89]:
re.sub(r'(.)\1\1', '***', 'abbbcdeeef')  # 'a***cd***f'


'a***cd***f'

#### Multiple backreferences `m/\$ *([0-9]+)\.?([0-9]*)/`

In [90]:
price = "We'll buy it for $72.40"


In [91]:
match = re.search(r'\$ *([0-9]+)\.?([0-9]*)', price)
match.group()  # ’$72.40’ The entire match


'$72.40'

In [92]:
match.group(1)  # ’72’ The first group


'72'

In [93]:
match.group(2)  # ’40’ The second group


'40'

#### Substitutions `s/\$ *([0-9]+)\.?([0-9]*)/\1 dollars and \2 cents/g`

In [94]:
re.sub(r'\$ *([0-9]+)\.?([0-9]*)',
       r'\1 dollars and \2 cents', price)
# We’ll buy it for 72 dollars and 40 cents


"We'll buy it for 72 dollars and 40 cents"

#### Why `r`

In [95]:
'\1'


'\x01'

In [96]:
'\141'


'a'

In [97]:
r'\1'


'\\1'

In [98]:
r'\141'


'\\141'

## Matching objects

In [99]:
price = "We'll buy it for $72.40"


In [100]:
match = re.search(r'\$ *([0-9]+)\.?([0-9]*)', price)
match


<regex.Match object; span=(17, 23), match='$72.40'>

#### Input

In [101]:
match.string            # We’ll buy it for $72.40


"We'll buy it for $72.40"

#### Groups

In [102]:
match.groups()          # (’72’, ’40’)


('72', '40')

In [103]:
match.group(0)          # '$72.40'


'$72.40'

In [104]:
match.group(1)


'72'

In [105]:
match.group(2)


'40'

#### Match objects: The indices

In [106]:
match.start(0)


17

In [107]:
match.end(0)


23

In [108]:
match.start(1)


18

In [109]:
match.end(1)


20

#### Example

In [110]:
line = """Tell me, O muse, of that ingenious hero
  who travelled far and wide after he had sacked
  the famous town of Troy.""".strip()
line


'Tell me, O muse, of that ingenious hero\n  who travelled far and wide after he had sacked\n  the famous town of Troy.'

In [111]:
match = re.search(',.*,', line, re.S)
match


<regex.Match object; span=(7, 16), match=', O muse,'>

In [112]:
line[0:match.start()]             # ’Tell me’


'Tell me'

In [113]:
line[match.start():match.end()]   # ’, O muse,’


', O muse,'

In [114]:
line[match.end():]   # ’of that ingenious hero
#  who travelled far and wide after he had sacked
#  the famous town of Troy.’


' of that ingenious hero\n  who travelled far and wide after he had sacked\n  the famous town of Troy.'

## Concordances: `.{0,15}Nils Holgersson.{0,15}`

In [129]:
pattern = 'Nils Holgersson'
width = 15


We build a regex from these parameters: `.{0,width}pattern.{0,width}`

In [130]:
('.{{0,{width}}}{pattern}.{{0,{width}}}'
 .format(pattern=pattern, width=width))


'.{0,15}Nils Holgersson.{0,15}'

In [131]:
file_name = '../../corpus/Selma.txt'
# The context manager closes the file as soon as it has been read.
# NOTE(review): the corpus is assumed to be UTF-8 encoded; stating the
# encoding keeps the result independent of the platform default. Confirm
# against the actual file.
with open(file_name, encoding='utf-8') as corpus_file:
    text = corpus_file.read()
# Shows the first 100 characters only
text[:100]


FileNotFoundError: [Errno 2] No such file or directory: '../../corpus/Selma.txt'

In [118]:
# spaces match tabs and newlines
pattern = re.sub(' ', r'\\s+', pattern)
pattern


'Nils\\s+Holgersson'

In [119]:
# Replaces newlines with spaces in the text
text = re.sub(r'\s+', ' ', text)


TypeError: expected string or buffer

In [120]:
# Regex template: the pattern surrounded by up to `width` characters on
# each side, the whole match being captured in group 1.
concordance_template = '(.{{0,{width}}}{pattern}.{{0,{width}}})'
concordance = concordance_template.format(width=width, pattern=pattern)
concordance


'(.{0,15}Nils\\s+Holgersson.{0,15})'

In [121]:
for match in re.finditer(concordance, text):
    print(match.group(1))


TypeError: expected string or buffer

## Min-edit

In [122]:
[source, target] = ('language', 'lineage')


In [123]:
# Table dimensions: one more than the length of each string
length_s, length_t = len(source) + 1, len(target) + 1

# Allocates the table: length_s independent rows of length_t empty cells
table = [[None] * length_t for _ in range(length_s)]

# Initialize the first column with the row indices ...
for i in range(length_s):
    table[i][0] = i
# ... and the first row with the column indices
for j in range(length_t):
    table[0][j] = j


In [124]:
table


[[0, 1, 2, 3, 4, 5, 6, 7],
 [1, None, None, None, None, None, None, None],
 [2, None, None, None, None, None, None, None],
 [3, None, None, None, None, None, None, None],
 [4, None, None, None, None, None, None, None],
 [5, None, None, None, None, None, None, None],
 [6, None, None, None, None, None, None, None],
 [7, None, None, None, None, None, None, None],
 [8, None, None, None, None, None, None, None]]

In [125]:
# Fills the table. Start index of rows and columns is 1
for i in range(1, length_s):
    for j in range(1, length_t):
        # A copy (identical characters) is free; a substitution costs 2
        cost = 2 if source[i - 1] != target[j - 1] else 0
        # Keeps the cheapest of the three ways to reach this cell:
        # diagonal step plus cost, or a step from either neighbour plus 1
        minimum = min(table[i - 1][j - 1] + cost,
                      table[i][j - 1] + 1,
                      table[i - 1][j] + 1)
        table[i][j] = minimum


In [126]:
table


[[0, 1, 2, 3, 4, 5, 6, 7],
 [1, 0, 1, 2, 3, 4, 5, 6],
 [2, 1, 2, 3, 4, 3, 4, 5],
 [3, 2, 3, 2, 3, 4, 5, 6],
 [4, 3, 4, 3, 4, 5, 4, 5],
 [5, 4, 5, 4, 5, 6, 5, 6],
 [6, 5, 6, 5, 6, 5, 6, 7],
 [7, 6, 7, 6, 7, 6, 5, 6],
 [8, 7, 8, 7, 6, 7, 6, 5]]

In [127]:
# Prints the table one column per output line, starting with the last
# column, so that each output line runs across the rows of `table`.
for j in reversed(range(length_t)):
    for row in table:
        print(row[j], " ", end='')
    # Ends the current output line
    print()


7  6  5  6  5  6  7  6  5  
6  5  4  5  4  5  6  5  6  
5  4  3  4  5  6  5  6  7  
4  3  4  3  4  5  6  7  6  
3  2  3  2  3  4  5  6  7  
2  1  2  3  4  5  6  7  8  
1  0  1  2  3  4  5  6  7  
0  1  2  3  4  5  6  7  8  


In [128]:
print('Minimum distance: ', table[length_s - 1][length_t - 1])


Minimum distance:  5
