# Class 6: Text Basics - Tutorial

https://spacy.io/models/da

In [None]:
import numpy as np

import spacy
from spacy.lang.da.examples import sentences 

In [None]:
#!python -m spacy download da_core_news_sm
#!python -m spacy download en_core_web_sm

## Spacy Models

In [None]:
spacy_pipeline = spacy.load("da_core_news_sm")

In [None]:
type(spacy_pipeline)

## Tokenization

We pass whatever text we want to process to `spacy_pipeline`, which returns a `Doc` container object (https://spacy.io/api/doc) containing the tokenized text and a number of annotations for each token. 


In [None]:
# Generate sample text
sample_text = sentences[0]

In [None]:
# Run the spaCy pipeline on the sample text to obtain a Doc
doc = spacy_pipeline(sample_text)

In [None]:
# Print doc
print(doc)

In [None]:
# Looks like a standard string, but it's not - check the type
type(doc)

In [None]:
# We can iterate over the Doc object to access the tokens - note that we access the token by the .text attribute
tokens0 = [t.text for t in doc]

In [None]:
# Alternative tokenizer: .split()
tokens1 = sample_text.split(' ')

In [None]:
len(tokens0), len(tokens1)

In [None]:
tokens1

In [None]:
tokens0

In [None]:
# We can view an individual token by indexing into the Doc object
print(doc[0])

In [None]:
# Also looks like a string, but it's not -- Check type
print(type(doc[0]))

In [None]:
# Slicing a Doc object returns a Span object.
print(doc[0:3])
print(type(doc[0:3]))

In [None]:
# Access a token's index in a sentence
print([(t.text, t.i) for t in doc])

In [None]:
# Spacy's tokenization is _non-destructive_, which means the original input can be reconstructed from the tokens.
# You can view the original input like so:
print(doc.text)

In [None]:
# And by reconstructing, we also now have a string object
print(type(doc.text))

In [None]:
# The full original text can even be recovered from an individual token, via its parent Doc
print(doc[0].doc.text)

In [None]:
print(type(doc[0].doc.text))

In [None]:
# It is also possible to tokenize multiple sentences at once - but spaCy requires a single string as input
s = sentences[0] + ' ' + sentences[1]
# s = [sentences[0]] + [sentences[1]]
doc = spacy_pipeline(s)

In [None]:
doc

In [None]:
# Look at individual sentences (there should be two 'Span' objects).
print([sent for sent in doc.sents])

In [None]:
# We can also access individual tokens, but where the sentence structure is hidden
print([t.text for t in doc])

In [None]:
# Use a nested list comprehension to maintain the sentence structure while looking at individual tokens
[[t.text for t in sent] for sent in doc.sents]

In [None]:
# Why choose a pretrained pipeline over the .split() method?

s = 'Toronto ligger 159km fra Buffalo.'

doc = spacy_pipeline(s)

# Consider the spacy result:
tokens0 = [t.text for t in doc]
print(tokens0)

In [None]:
# And the .split() result
tokens1 = s.split()
print(tokens1)

In [None]:
# So far we have tokenized one or at most two sentences at a time. Imagine we have a whole corpus.
# tokens = [spacy_pipeline(x) for x in sentences]

## Preprocessing

* Stopwords/digits
* Casing
* Word reduction (stemming and lemmatization)

In [None]:
# Define an alphabetically sorted list of the Danish stopwords shipped with
# the pipeline. sorted() accepts any iterable and always returns a new list,
# so no intermediate list() is needed.
stop_words = sorted(spacy_pipeline.Defaults.stop_words)

In [None]:
# Print stopwords
print(stop_words)

In [None]:
# Compute length of stopwords
len(stop_words)

In [None]:
print(tokens0)

In [None]:
# Removal of stopwords using list comprehension
[x for x in tokens0 if x not in stop_words]

In [None]:
# Removal of digits
[x for x in tokens0 if not x.isdigit()]

### Casing

In [None]:
# Case-folding using the built-in str.lower() method
s.lower().split()

In [None]:
# Case-folding using the .lower_ attribute
print([t.lower_ for t in doc])

In [None]:
# Conditional lowering: lower-case every token except those that start a sentence.
# We emit t.text (not the Token object itself) so the result is a list of plain strings.
print([t.lower_ if not t.is_sent_start else t.text for t in doc])

In [None]:
# SpaCy performs advanced preprocessing steps under the hood such as NER, POS, and Parsing
s = 'Toronto ligger 159km fra Buffalo.'
[(t.text, t.ent_type_) for t in spacy_pipeline(s)]

In [None]:
# The results are not always what we want. Try replacing 'Toronto' with 'København'
s = 'København ligger 159km fra Buffalo.'
[(t.text, t.ent_type_) for t in spacy_pipeline(s)]

In [None]:
# Load English pipeline
spacy_pipeline_en = spacy.load('en_core_web_sm')

In [None]:
s = 'Toronto ligger 159km fra Buffalo.'
[(t.text, t.ent_type_) for t in spacy_pipeline_en(s)]

In [None]:
# We can get SpaCy to explain its abbreviations
spacy.explain('GPE')

In [None]:
# Conditional lowering using NER as exceptions: tokens tagged GPE or LOC keep their casing.
# We emit t.text (not the Token object itself) so the result is a list of plain strings.
print([t.lower_ if t.ent_type_ not in ('GPE', 'LOC') else t.text for t in spacy_pipeline_en(s)])

### Word Reduction

In [None]:
from nltk.stem.snowball import DanishStemmer
stemmer = DanishStemmer()

In [None]:
# s = 'Udlændinge kommer herop og begår kriminalitet'
s = 'Toronto ligger 159km fra Buffalo.'

In [None]:
# Stemming using NLTK
[stemmer.stem(t.text) for t in spacy_pipeline(s)]

In [None]:
# Lemmatization using SpaCy
[t.lemma_ for t in spacy_pipeline(s)]

## Vectorization

* Binary and count
* TF-IDF

### Binary and Count Vectorization

In [None]:
# Generate a corpus of sentences.
corpus = [
    "Red Bull drops hint on F1 engine.",
    "Honda exits F1, leaving F1 partner Red Bull.",
    "Hamilton eyes record eighth F1 title.",
    "Aston Martin announces sponsor."]

In [None]:
# Import classes and functions from sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Instantiate a binary BoW vectorizer
vectorizer = CountVectorizer(binary=True)

In [None]:
# Build vocabulary
vectorizer.fit(corpus)

In [None]:
# See vocab
print(vectorizer.get_feature_names_out())
vectorizer.vocabulary_

In [None]:
# Sort vocab
dict(sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1]))

In [None]:
# Apply the vocab to the corpus
bow = vectorizer.transform(corpus)

In [None]:
# Convert sparse matrix to np array
bow_array = bow.toarray()
print(bow_array)

In [None]:
# Define custom tokenizer (more steps can easily be added)
def spacy_tokenizer(doc):
    """Tokenize the raw string `doc` with the English pipeline.

    Punctuation tokens are dropped; the remaining tokens are returned as a
    list of plain strings (their `.text`).
    """
    return [tok.text for tok in spacy_pipeline_en(doc) if not tok.is_punct]

In [None]:
# Instantiate CountVectorizer and apply fit_transform
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, lowercase=False, binary=False, decode_error='ignore', token_pattern=None)
bow = vectorizer.fit_transform(corpus)

In [None]:
bow_array = bow.toarray()

In [None]:
# Compute pairwise cosine similarity
cosine_similarity(bow_array)

In [None]:
# Manual computation of cosine similarity
np.dot(bow_array[0], bow_array[1]) / (np.linalg.norm(bow_array[0]) * np.linalg.norm(bow_array[1]))

### TF-IDF

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

We'll use the **20 newsgroups** dataset, which is a collection of roughly 18,000 newsgroup posts across 20 topics.<br>
https://scikit-learn.org/stable/datasets/real_world.html#the-20-newsgroups-text-dataset
<br><br>
List of datasets available:<br>
https://scikit-learn.org/stable/datasets.html#datasets

The **datasets** module includes fetchers for each dataset in scikit-learn. For our purposes, we'll fetch only the posts from the *sci.space* topic, and strip out headers, footers, and text quoted from other posts.<br>
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups
<br><br>
By default, the fetcher retrieves the *training* subset of the data only. If you don't know what that means, it'll become clear later in the course when we discuss modelling. For now, it doesn't matter for our purposes.

In [None]:
corpus = fetch_20newsgroups(categories=['sci.space'],
                            remove=('headers', 'footers', 'quotes'))

In [None]:
# We don't need named-entity recognition nor dependency parsing for
# this so these components are disabled. This will speed up the
# pipeline. We do need part-of-speech tagging however.
unwanted_pipes = ["ner", "parser"]

# For this exercise, we'll remove punctuation and spaces (which
# includes newlines), keep only tokens consisting of alphabetic
# characters, and return the lemma (which requires POS tagging).
def spacy_tokenizer(doc):
    """Return the lemmas of the alphabetic tokens in the raw string `doc`.

    The text is processed with the English pipeline `spacy_pipeline_en`
    while the components named in `unwanted_pipes` are temporarily
    disabled. Punctuation, whitespace and non-alphabetic tokens are
    filtered out, and the lemma of each remaining token is returned as a
    plain string.
    """
    # The pipeline must be referenced by the name it was loaded under in this
    # notebook (`spacy_pipeline_en`); no object called `nlp` is ever defined.
    # NOTE(review): in spaCy v3 `disable_pipes` is superseded by
    # `select_pipes(disable=...)` - confirm the installed version before switching.
    with spacy_pipeline_en.disable_pipes(*unwanted_pipes):
        return [
            t.lemma_
            for t in spacy_pipeline_en(doc)
            if not t.is_punct and not t.is_space and t.is_alpha
        ]

In [None]:
%%time
# Use the default settings of TfidfVectorizer, apart from plugging in our custom tokenizer.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)
features = vectorizer.fit_transform(corpus.data)

In [None]:
# The number of unique tokens.
print(len(vectorizer.get_feature_names_out()))

In [None]:
# The dimensions of our feature matrix. X rows (documents) by Y columns (tokens).
print(features.shape)

In [None]:
# View first two posts.
corpus.data[:2]

In [None]:
# Look up the column index assigned to the token 'satellite' in the fitted vocabulary.
vectorizer.vocabulary_['satellite']

In [None]:
# What the encoding of the first document looks like in sparse format.
print(features[0])

In [None]:
# Transform the query into a TF-IDF vector.
query = ["lunar orbit"]
query_tfidf = vectorizer.transform(query)

In [None]:
# Calculate the cosine similarities between the query and each document.
# We're calling flatten() here because cosine_similarity returns a 2-D
# array of shape (n_documents, 1) and we just want a 1-D array.
cosine_similarities = cosine_similarity(features, query_tfidf).flatten()

In [None]:
def top_k(arr, k):
    """Return the indices of the `k` largest values in `arr`, largest first.

    If `k` exceeds the number of elements, all indices are returned.
    """
    # argsort orders indices by ascending value; reverse to get descending
    # order, then keep the leading k entries.
    descending = np.argsort(arr)[::-1]
    return descending[:k]

In [None]:
# So for our query above, these are the top five documents.
top_related_indices = top_k(cosine_similarities, 5)
print(top_related_indices)

In [None]:
# Let's take a look at their respective cosine similarities.
print(cosine_similarities[top_related_indices])

In [None]:
# Top match.
print(corpus.data[top_related_indices[0]])

In [None]:
# Second-best match.
print(corpus.data[top_related_indices[1]])

In [None]:
# Try a different query
query = ["satellite"]
query_tfidf = vectorizer.transform(query)

cosine_similarities = cosine_similarity(features, query_tfidf).flatten()
top_related_indices = top_k(cosine_similarities, 5)

print(top_related_indices)
print(cosine_similarities[top_related_indices])

In [None]:
print(corpus.data[top_related_indices[0]])