In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.metrics import pairwise_distances
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# define newsgroup categories
categories = [
    'sci.space'
]

# Number of documents kept for this demo; a small sample keeps computation
# time low. Change it here rather than editing slice literals further down.
N_DOCUMENTS = 100

# Fetch the train and test posts for the selected categories, shuffled with a
# fixed seed so the same documents are selected on every run.
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=24)

# take the first N_DOCUMENTS records in the data set to reduce computation time
document_corpus = dataset.data[:N_DOCUMENTS]

In [4]:
# Report the size of the working sample itself (not a re-sliced copy of the
# raw data), so the count stays correct if the sample size changes upstream.
print("%d documents" % len(document_corpus))

print("%d categories" % len(dataset.target_names))

print()

# display the first 200 characters of the first document
document_corpus[0][:200]

100 documents
1 categories



'From: wallacen@CS.ColoState.EDU (nathan wallace)\nSubject:  Level 5\nReply-To: wallacen@CS.ColoState.EDU\nNntp-Posting-Host: sor.cs.colostate.edu\nOrganization: Colorado State University -=- Computer Scie'

In [5]:
# TF-IDF weighting with English stop words removed.
# (A stray trailing line-continuation backslash used to follow this statement;
# it only parsed because the next line was blank.)
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)

# Reduce the sparse TF-IDF matrix to 50 dense components; random_state is
# fixed so the randomized solver gives repeatable results.
svd_model = TruncatedSVD(n_components=50, algorithm='randomized', n_iter=10, random_state=42)

# Chain both steps so queries can later be projected with a single transform().
svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])

# Fit on the corpus; the result has one row per document and one column per component.
svd_matrix = svd_transformer.fit_transform(document_corpus)

print(svd_matrix.shape)

(100, 50)


In [19]:
query = ["space"]

# Project the query with the already-fitted pipeline so it lands in the same
# reduced space as the corpus rows in svd_matrix.
query_vector = svd_transformer.transform(query)

# Cosine distance from the single query (row 0) to every document.
distance_matrix = pairwise_distances(query_vector, svd_matrix, metric='cosine', n_jobs=-1)

# Compute the smallest distance once, with numpy's vectorized reduction,
# instead of re-running the Python builtin min() for every use.
query_distances = distance_matrix[0, :]
min_distance = query_distances.min()

print(min_distance)

# A minimum distance of 1.0 means the query is orthogonal to every document.
# Compare with a tolerance rather than exact float equality.
if np.isclose(min_distance, 1.0):
    print("Query string not found.")
else:
    # Kept as the np.where() tuple so any later use of best_match_index[0][0]
    # keeps working; the first hit is the document reported below.
    best_match_index = np.where(query_distances == min_distance)
    first_match = best_match_index[0][0]

    print("best match index: %d " % first_match)

    print()

    print("-- document at index %d --" % first_match)

    print(document_corpus[first_match])

0.303722598788
best match index: 16 

-- document at index 16 --
From: mwm+@cs.cmu.edu (Mark Maimone)
Subject: How to read sci.space without netnews
Summary: Space Digest address
Nntp-Posting-Host: a.gp.cs.cmu.edu
Organization: School of Computer Science, Carnegie Mellon
Lines: 36

In article <734975852.F00001@permanet.org> Mark.Prado@p2.f349.n109.z1.permanet.org (Mark Prado) writes:
>If anyone knows anyone else who would like to get sci.space,
>but doesn't have an Internet feed (or has a cryptic Internet
>feed), I would be willing to feed it to them.	

	Kudos to Mark for his generous offer, but there already exists a
large (email-based) forwarding system for sci.space posts:  Space Digest.
It mirrors sci.space exactly, and provides simple two-way communication.

	TO SUBSCRIBE:
	   Send the following message in the *body* (not subject) of an
	   email message:

		subscribe space John Q Public

	   to one of these addresses:

		listserv@uga
		listserv@uga.cc.uga.edu
		listserv@finhutc
	