In [3]:
import spacy

from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch only the 'sci.space' newsgroup. subset='train' is the fetcher's
# default; it is spelled out here so it is obvious that only the training
# split is loaded. Headers, footers and quoted text are stripped from
# every post.
corpus = fetch_20newsgroups(
    subset='train',
    categories=['sci.space'],
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Report the container type and the number of posts in our dataset;
# sep='\n' keeps each value on its own line.
print(type(corpus), len(corpus.data), sep='\n')

# View first two posts (shown as the cell's result).
corpus.data[:2]

<class 'sklearn.utils._bunch.Bunch'>
593


["\nAny lunar satellite needs fuel to do regular orbit corrections, and when\nits fuel runs out it will crash within months.  The orbits of the Apollo\nmotherships changed noticeably during lunar missions lasting only a few\ndays.  It is *possible* that there are stable orbits here and there --\nthe Moon's gravitational field is poorly mapped -- but we know of none.\n\nPerturbations from Sun and Earth are relatively minor issues at low\naltitudes.  The big problem is that the Moon's own gravitational field\nis quite lumpy due to the irregular distribution of mass within the Moon.",
 '\nGlad to see Griffin is spending his time on engineering rather than on\nritual purification of the language.  Pity he got stuck with the turkey\nrather than one of the sensible options.']

In [5]:
# Pipeline components the tokenizer does not rely on: named-entity
# recognition and dependency parsing. Switching them off speeds up
# the pipeline. Part-of-speech tagging stays enabled because we do
# need it.
unwanted_pipes = ["ner", "parser"]

# Like before, using spaCy's tokenizer with scikit-learn requires a
# callback, so load the small English pipeline once up front. Remember
# to upgrade spaCy if you need to (refer to the beginning of the file
# for commentary and instructions).
nlp = spacy.load('en_core_web_sm')

# Tokenizer callback for this exercise: drop punctuation and whitespace
# (newlines included), keep only purely alphabetic tokens, and emit each
# surviving token's lemma (lemmatization requires POS tagging).
def spacy_tokenizer(doc):
  """Return the lemmas of the alphabetic tokens found in the text `doc`."""
  lemmas = []
  # Run the pipeline with the unneeded components temporarily disabled.
  # NOTE(review): spaCy v3 documents disable_pipes as deprecated in favor
  # of select_pipes(disable=...) -- confirm the installed version before
  # switching.
  with nlp.disable_pipes(*unwanted_pipes):
    for token in nlp(doc):
      if token.is_punct or token.is_space:
        continue
      if token.is_alpha:
        lemmas.append(token.lemma_)
  return lemmas
  
# Use the default settings of TfidfVectorizer apart from the tokenizer.
# token_pattern is set to None because it is ignored once a custom
# tokenizer is supplied; leaving it at its default makes scikit-learn emit
# a UserWarning saying the parameter will not be used.
# Note: the default lowercase=True lowercases each document before it
# reaches spacy_tokenizer.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)
features = vectorizer.fit_transform(corpus.data)





In [6]:
# The number of unique tokens, i.e. the size of the vocabulary the
# vectorizer learned during fit_transform (one feature name per token).
print(len(vectorizer.get_feature_names_out()))

9440


In [7]:
# Type and dimensions of our feature matrix: X rows (documents) by
# Y columns (tokens). sep='\n' prints each value on its own line.
print(type(features), features.shape, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
(593, 9440)


# 2 | Simple querying of the documents


The similarity measuring techniques we learned previously can be used here in the same way. In effect, we can query our data using this sequence:

1. Transform our query using the same vocabulary that was learned in the fit step on our corpus.
2. Calculate the pairwise cosine similarities between each document in our corpus and our query.
3. Sort the documents in descending order by score.

In [8]:
# Transform the query into a TF-IDF vector. transform() (rather than
# fit_transform()) reuses the vocabulary already fitted on the corpus.
query = ["lunar orbit"]
query_tfidf = vectorizer.transform(query)

# Calculate the cosine similarities between the query and each document.
# cosine_similarity returns a 2-D array of shape (n_documents, n_queries);
# with a single query that is one column, so we call flatten() to get a
# 1-D array holding one score per document.
cosine_similarities = cosine_similarity(features, query_tfidf).flatten()


import numpy as np

# numpy's argsort() method returns an array of *indices* that
# would sort an array:
# https://numpy.org/doc/stable/reference/generated/numpy.argsort.html
#
# There are faster ways to pick the top k using things like argpartition
# but this is more succinct.
def top_k(arr, k):
  """Return the indices of the k largest values in arr, largest first.

  Args:
    arr: Array-like of scores (expected to be 1-D).
    k: Number of indices to return. Zero or a negative value yields an
      empty result; a value larger than len(arr) returns every index.

  Returns:
    A NumPy integer array of at most k indices into arr, ordered from the
    highest score to the lowest.
  """
  # Guard against non-positive k: without it, k = -1 turns the stop bound
  # below into 0 and the slice returns all but one index instead of none.
  if k <= 0:
    return np.array([], dtype=np.intp)
  # The sort is ascending, so the k largest scores sit at the end. Walk
  # backwards from the last index and stop just before the (k+1)-th entry
  # from the end.
  kth_largest = (k + 1) * -1
  return np.argsort(arr)[:kth_largest:-1]



In [10]:
# One similarity score per document, in corpus order. A score of 0 means
# the document shares no vocabulary term with the query.
print(cosine_similarities)

[0.2736328  0.         0.         0.         0.         0.09945553
 0.         0.         0.         0.         0.05161722 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.0392964  0.         0.
 0.         0.         0.         0.00792869 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.10750917 0.         0.
 0.         0.         0.         0.04538261 0.         0.
 0.         0.         0.         0.         0.         0.05675789
 0.         0.         0.04197194 0.         0.         0.08096857
 0.         0.         0.         0.         0.         0.00815123
 0.         0.         0.09558718 0.02529751 0.         0.
 0.04246228 0.         0.         0.         0.         0.03791222
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.  

In [9]:
# So for our query above, these are the top five documents: top_k returns
# their indices into the corpus, ordered from the best match downwards.
top_related_indices = top_k(cosine_similarities, 5)
print(top_related_indices)

[249 108   0 312 509]


In [11]:
# Index the score array with the ranked indices to show each hit's
# similarity score, in the same best-first order.
print(cosine_similarities[top_related_indices])

[0.47855355 0.4292246  0.2736328  0.19486489 0.19125175]


In [12]:
# Top match: position 0 of the ranking holds the index of the
# highest-scoring document.
print(corpus.data[top_related_indices[0]])


Actually, Hiten wasn't originally intended to go into lunar orbit at all,
so it indeed didn't have much fuel on hand.  The lunar-orbit mission was
an afterthought, after Hagoromo (a tiny subsatellite deployed by Hiten
during a lunar flyby) had a transmitter failure and its proper insertion
into lunar orbit couldn't be positively confirmed.

It should be noted that the technique does have disadvantages.  It takes
a long time, and you end up with a relatively inconvenient lunar orbit.
If you want something useful like a low circular polar orbit, you do have
to plan to expend a certain amount of fuel, although it is reduced from
what you'd need for the brute-force approach.


In [None]:
# Still far from perfect.

So here we have the beginnings of a simple search engine but we're a far cry from competing with commercial off-the-shelf search engines, let alone Google.

- For each query, we're scanning through our entire corpus, but in practice, you'll want to create an inverted index. Search applications such as Elasticsearch do that under the hood.
- You'd also want to evaluate the efficacy of your search using metrics like precision and recall.
- Document ranking also tends to be more sophisticated, using different ranking functions like Okapi BM25. With major search engines, ranking also involves hundreds of variables, such as what the user searched for previously, what they tend to click on, where they are physically located, and so on. These variables are part of the "secret sauce" and are closely guarded by companies.
- Beyond word presence, intent and meaning are playing a larger role.


Information Retrieval is a huge, rich topic and beyond search, it's also key in tasks such as question-answering.