# NLP in Python: Unleashing the power of spaCy

#### This is some magic to make things a bit prettier. Don't worry about it too much.

In [None]:
# !pip install -U pandas

import pandas as pd

# CSS properties applied to the header cells and the data cells of a table.
heading_properties = [('font-size', '14px')]
cell_properties = [('font-size', '14px')]

# One rule per HTML element: "th" gets the heading properties, "td" the
# cell properties. Passed to Styler.set_table_styles by display_as_table.
df_style = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

def display_as_table(tbl):
    """Show a list of rows as a styled pandas table.

    The first element of ``tbl`` supplies the column names and every
    remaining element becomes one row. The table styles come from the
    module-level ``df_style`` list.
    """
    header = tbl[0]
    rows = tbl[1:]
    frame = pd.DataFrame(rows, columns=header)
    styled = frame.style.set_table_styles(df_style)
    display(styled)

#### Install spaCy if necessary. 

In [None]:
# !pip install -U spacy
!python -m spacy validate

## Basic spaCy usage

In [None]:
from spacy.lang.es import Spanish

nlp = Spanish()

In [None]:
doc = nlp("Mi nombre completo es Mario García Armas.")

In [None]:
# Header row first, then one row of lexical attributes per token in `doc`.
token_tbl = [["TOKEN", "SHAPE", "ALPHA", "STOP", "PUNCT"]]
token_tbl.extend(
    [tok.text, tok.shape_, tok.is_alpha, tok.is_stop, tok.is_punct]
    for tok in doc
)
display_as_table(token_tbl)

In [None]:
print(f"Current spaCy pipeline: {nlp.pipe_names}")

## Introduction to spaCy models

#### Download spaCy models if necessary. See https://spacy.io/models for more info.

In [None]:
# !python -m spacy download en_core_web_sm

In [None]:
import spacy
from spacy import displacy

In [None]:
nlp = spacy.load('en_core_web_sm')

print(f"Current spaCy pipeline: {nlp.pipe_names}")

#### Text for analysis:
Jorge Luis Borges is one of the most influential writers of the 20th century. He was born in vibrant Buenos Aires, the capital of Argentina. Borges later moved with his family to Switzerland in 1914, where he studied at the Collège de Genève.

In [None]:
text = "Jorge Luis Borges is one of the most influential writers of the 20th century. He was born in vibrant Buenos Aires, the capital of Argentina. Borges later moved with his family to Switzerland in 1914, where he studied at the Collège de Genève."

doc = nlp(text)

In [None]:
sents = list(doc.sents)
for idx, sent in enumerate(sents, 1):
    print(f'{idx}.- {sent.text.strip()}\n')

In [None]:
# Header row first, then one row of annotations per token of the second
# sentence (sents[1]).
token_tbl = [["TOKEN", "LEMMA", "POS", "TAG", "DEP", "SHAPE", "ALPHA", "STOP", "PUNCT"]]
token_tbl.extend(
    [tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
     tok.shape_, tok.is_alpha, tok.is_stop, tok.is_punct]
    for tok in sents[1]
)
display_as_table(token_tbl)

In [None]:
displacy.render(sents[1], style='dep', options={'compact': False, 'distance': 120, 'bg': '#cef5ef'})

In [None]:
displacy.render(doc, style="ent")

In [None]:
# Distinct entity labels found in `doc`, listed alphabetically together with
# the description returned by spacy.explain.
labels = {ent.label_ for ent in doc.ents}
label_tbl = [['LABEL', 'EXPLANATION']] + [
    [label, spacy.explain(label)] for label in sorted(labels)
]
display_as_table(label_tbl)

## Customizing spaCy: best-in-class tokenizer

In [None]:
text = "A thought—How should a best-in-class tokenizer split this?"
print(text)

In [None]:
from spacy.lang.en import English

nlp_to_modify = English()

print(f"Current spaCy pipeline: {nlp_to_modify.pipe_names}")

In [None]:
doc = nlp_to_modify(text)
print(*(f'{token.text}' for token in doc), sep=' | ')

#### How about we modify the infix rules?

In [None]:
print(nlp_to_modify.Defaults.infixes[6])

#### Well... THAT WAS SCARY. Let's disentangle it.

In [None]:
from spacy.lang.char_classes import ALPHA, HYPHENS
print(f"ALPHA: {ALPHA}")
print()
print(f"HYPHENS: {HYPHENS}")

#### Magic regex: `(?<=[{ALPHA}])(?:{HYPHENS})(?=[{ALPHA}])`

In [None]:
assert nlp_to_modify.Defaults.infixes[6] == \
    rf'(?<=[{ALPHA}])(?:{HYPHENS})(?=[{ALPHA}])'

In [None]:
from spacy.util import compile_infix_regex

# Rebuild the infix rule list without the default rule at index 6 (the
# hyphen-between-letters rule asserted above) and append a variant of it
# built from HYPHENS[4:] instead of the full HYPHENS string.
infixes = [
    *nlp_to_modify.Defaults.infixes[:6],
    *nlp_to_modify.Defaults.infixes[7:],
    rf'(?<=[{ALPHA}])(?:{HYPHENS[4:]})(?=[{ALPHA}])',
]

# Compile the rules and make the tokenizer use them for infix splitting.
infix_re = compile_infix_regex(infixes)
nlp_to_modify.tokenizer.infix_finditer = infix_re.finditer

In [None]:
doc = nlp_to_modify(text)
print(*(f'{token.text}' for token in doc), sep=' | ')

## Customizing spaCy: simpler noun chunking

In [None]:
text = "Very beautiful British Columbia is home to the best and brightest yoga teachers."

In [None]:
nlp_to_modify = spacy.load('en_core_web_sm')

print(f"Current spaCy pipeline: {nlp_to_modify.pipe_names}")

#### Sometimes noun chunks can be long and complex

In [None]:
doc = nlp_to_modify(text)
print(*(f'{nc.text}' for nc in doc.noun_chunks), sep=' | ')

#### Set a custom attribute for docs called `custom_noun_chunks`

In [None]:
from spacy.tokens import Doc

# Register the extension only if it is not there yet, so the cell can be
# re-run without trying to register the same attribute twice.
if not Doc.has_extension("custom_noun_chunks"):
    Doc.set_extension("custom_noun_chunks", default=[])

#### Create the custom noun chunker. It matches an optional adjective followed by one or more consecutive nouns or proper nouns.

In [None]:
import re

class CustomNounChunker:
    """Pipeline component that extracts simple noun chunks from POS tags.

    A chunk is an optional adjective followed by one or more consecutive
    nouns or proper nouns. Matching is done with a regular expression over
    a string of space-terminated ``token.pos_`` values, and the matching
    slices of the doc are stored in ``doc._.custom_noun_chunks``.
    """

    name = 'custom_noun_chunker'

    def __init__(self):
        # Every tag in the POS string is followed by a single space, so the
        # pattern always consumes whole "TAG " units.
        self.noun_chunk_re = re.compile(r'(?:ADJ )?(?:NOUN |PROPN )+')

    def _get_pos_string(self, doc):
        """Build the POS string and the offset-to-token lookup tables.

        Returns:
            A tuple ``(pos_string, starts, ends)``. ``pos_string`` is the
            concatenation of ``"<pos_> "`` for every token. ``starts`` maps
            the character offset where a token's tag begins to ``token.i``;
            ``ends`` maps the offset just past that tag's trailing space to
            the same ``token.i``.
        """
        pos_list, starts, ends = [], {}, {}
        curr_pos = 0
        for token in doc:
            tag = f'{token.pos_} '
            pos_list.append(tag)
            starts[curr_pos] = token.i
            curr_pos += len(tag)
            ends[curr_pos] = token.i
        return ''.join(pos_list), starts, ends

    def __call__(self, doc):
        """Store the matched chunks on ``doc`` and return ``doc``.

        The attribute is assigned a freshly built list instead of being
        appended to. That way running the component again on the same doc
        does not duplicate chunks, and the result never ends up in a list
        object shared with other docs.
        """
        pos_string, starts, ends = self._get_pos_string(doc)
        doc._.custom_noun_chunks = [
            # `ends` holds the index of the last matched token; add 1 to
            # turn it into an exclusive slice bound.
            doc[starts[match.start()]:ends[match.end()] + 1]
            for match in self.noun_chunk_re.finditer(pos_string)
        ]
        return doc

#### Register the component at the end of the pipeline (as it depends on POS tagging).

In [None]:
# Create the component and add it as the last step of the pipeline.
custom_noun_chunker = CustomNounChunker()
try:
    nlp_to_modify.add_pipe(custom_noun_chunker, last=True)
except ValueError:
    # NOTE(review): presumably raised when this cell is re-run and a component
    # named "custom_noun_chunker" is already in the pipeline; in that case the
    # existing component is replaced by the new instance. Confirm against the
    # installed spaCy version.
    nlp_to_modify.replace_pipe("custom_noun_chunker", custom_noun_chunker)
    
print(f"Current spaCy pipeline: {nlp_to_modify.pipe_names}")

In [None]:
doc = nlp_to_modify(text)
print(*(f'{nc.text}' for nc in doc._.custom_noun_chunks), sep=' | ')