# Text Mining: Introduction spaCy

## Helper Functions

In [None]:
import json

Helper functions to load IMDB movie reviews.

In [None]:
def documents():
    """Yield the text of every review in ``review_polarity.train.json``.

    The file is expected to hold a JSON array of objects that each carry a
    ``'text'`` key. It is opened explicitly as UTF-8 so that the result does
    not depend on the platform's default encoding.

    Yields:
        The ``'text'`` value of each record, in file order.
    """
    with open('review_polarity.train.json', encoding='utf-8') as fp:
        for d in json.load(fp):
            yield d['text']

def tokens(doc):
    """Yield the whitespace-separated pieces of ``doc`` that are purely alphabetic.

    Pieces containing digits or punctuation (for example ``"good,"``) are
    skipped entirely rather than cleaned up. Case is left untouched.
    """
    yield from (piece for piece in doc.split() if piece.isalpha())

In [None]:
# Number of reviews in the training file, counted without building a list.
sum(1 for _ in documents())

## Exploration 1: Basic statistics

Load the Counter class, which is useful for statistics.

In [None]:
from collections import Counter

Count how many occurrences of each token the data contains.

In [None]:
# Tally every token of every document in a single pass.
counter = Counter(
    token
    for review in documents()
    for token in tokens(review)
)

Print the number of distinct tokens (the vocabulary size).

In [None]:
# len() of a Counter is the number of distinct keys, i.e. unique tokens,
# not the sum of all occurrence counts.
print(len(counter))

The token *movie* occurs quite often:

In [None]:
# Occurrence count of the exact, case-sensitive token "movie"
# (tokens are counted as they appear, without lower-casing).
print(counter['movie'])

Print the 10 most common words.

In [None]:
# Last expression of the cell: the notebook displays the (token, count)
# pairs with the 10 highest counts.
counter.most_common(10)

Plot the number of occurrences of the 100 most common words.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

labels, values = zip(*counter.most_common(100))
plt.bar(range(len(labels)), values, 1)
plt.show()

## Exploration 2: Information extraction

Load spaCy.

In [None]:
import spacy

Load the English language model.

In [None]:
# Load the model by its package name. The 'en' shortcut link only exists in
# spaCy 2.x and was removed in spaCy 3, whereas the package name works in both.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default download).
nlp = spacy.load('en_core_web_sm')

Define a short text.

In [None]:
# Sample sentence with two organisations and a monetary amount.
text = 'Apple Corp. buys Alphabet Inc. for $1 billion'

Process the text using the default pipeline (tokenizer, tensorizer, tagger, parser, entity recognizer, text classifier).

In [None]:
# Run the loaded pipeline on the sample text; the resulting `doc` is what the
# following cells inspect (tokens, dependency parse, entities).
doc = nlp(text)

Print the tokens together with their lemmas, part-of-speech tags, and stopword flags.

In [None]:
# One line per token: surface text, lemma, part-of-speech tag, stop-word flag.
for tok in doc:
    columns = (tok.text, tok.lemma_, tok.pos_, tok.is_stop)
    print(*columns)

Show the dependency parse.

In [None]:
from spacy import displacy
# Draw the dependency tree inline in the notebook.
# NOTE(review): "distance" presumably sets the spacing between words in the
# rendered graphic — confirm against the displaCy options documentation.
displacy.render(doc, style='dep', options={"distance": 110}, jupyter=True)

Show the named entities.

In [None]:
# Re-imported so that this cell can be run on its own.
from spacy import displacy
# Highlight the recognised named entities inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

Find the root word of an entity.

In [None]:
def root(ent):
    """Return the syntactic root of an entity span.

    Starting at the first token of ``ent``, follow ``head`` links for as long
    as the head is a different token that still lies inside the span
    (``ent.start <= head.i < ent.end``). The token at which the climb stops
    is returned.

    Args:
        ent: A span-like object that is iterable over its tokens and exposes
            ``start`` and ``end`` token indices; each token exposes ``i`` and
            ``head``.

    Returns:
        The root token of the span, or ``None`` if the span has no tokens.
    """
    token = next(iter(ent), None)
    if token is None:
        return None
    while True:
        head = token.head
        # A sentence root is its own head. Compare by index instead of object
        # identity, so termination does not depend on the library handing
        # back the very same Python object on every attribute access.
        if head.i == token.i:
            return token
        if not (ent.start <= head.i < ent.end):
            return token
        token = head

Extract semantic relations.

In [None]:
# Pair every entity with its root token once, then look for
# subject-verb-object triples: a subject entity and an object entity whose
# roots hang off the same verb.
entity_roots = [(ent, root(ent)) for ent in doc.ents]
for ent1, root1 in entity_roots:
    # Only an entity that is the subject of a verb can start a relation.
    if root1.dep_ != 'nsubj' or root1.head.pos_ != 'VERB':
        continue
    for ent2, root2 in entity_roots:
        if root2.dep_ == 'dobj' and root1.head == root2.head:
            print("[{}]-[{}]-[{}]".format(ent1, root1.head.lemma_, ent2))

## Exploration 3: Topic modelling

Import gensim.

In [None]:
import gensim

Build the vocabulary.

In [None]:
# Build the vocabulary from lower-cased tokens so that it matches the
# lower-cased tokens that MyCorpus passes to doc2bow(). With a case-sensitive
# vocabulary, words that only occur capitalised would never be found in the
# dictionary and would be silently dropped from the corpus.
dictionary = gensim.corpora.Dictionary(
    (t.lower() for t in tokens(d)) for d in documents()
)

Use spaCy to filter out stop words.

In [None]:
# Collect the ids of all dictionary entries that spaCy flags as stop words...
del_ids = [
    token_id
    for token_id, word in dictionary.items()
    if nlp.vocab[word].is_stop
]

# ...and remove them from the vocabulary.
dictionary.filter_tokens(bad_ids=del_ids)

Create an iterable that streams the data one document at a time (for memory efficiency).

In [None]:
class MyCorpus:
    """Re-iterable corpus that streams one bag-of-words per review.

    Every call to ``__iter__`` re-reads the documents, so the corpus can be
    traversed several times without holding all vectors in memory.
    """

    def __iter__(self):
        for review in documents():
            lowered = (word.lower() for word in tokens(review))
            yield dictionary.doc2bow(lowered)

Build the LDA model (takes a while).

In [None]:
# Train an LDA topic model on the streamed bag-of-words corpus.
# NOTE(review): the parameter meanings below are taken from their names —
# confirm against the gensim LdaModel documentation before tuning.
lda = gensim.models.ldamodel.LdaModel(
    corpus=MyCorpus(),  # yields one bag-of-words per document on each iteration
    id2word=dictionary,  # the vocabulary built above (stop words removed)
    num_topics=4,  # same number of topics that is printed afterwards
    chunksize=5,  # presumably documents per training chunk; very small — TODO confirm
    passes=5,  # presumably full passes over the corpus, hence a re-iterable corpus
    update_every=1
)

Print the most common topics.

In [None]:
# Last expression of the cell: the notebook displays the returned topics.
# The argument 4 equals num_topics used to build the model.
lda.print_topics(4)

Visualise the model using the pyLDAvis library.

In [None]:
import warnings

import pyLDAvis.gensim

# Let pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

# Hide FutureWarning messages raised from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

Visualise using Principal Coordinates Analysis.

In [None]:
# The corpus is materialised into a list before being handed to pyLDAvis.
# No `mds` argument is given, so the library's default projection is used.
pyLDAvis.gensim.prepare(lda, list(MyCorpus()), dictionary)

Visualise using t-SNE.

In [None]:
# Same visualisation, but with the topic projection method set to 'tsne'.
pyLDAvis.gensim.prepare(lda, list(MyCorpus()), dictionary, mds='tsne')

That’s all, folks!