In [1]:
# Latent Semantic Analysis Lab
# Matthew T. Dearing

In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.space']
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [31]:
stopset = set(stopwords.words('english'))
stopset.update(['nasa gov', 'edu', 'com', 'gov', 'net', 'host', 'NNTP', 'nntp', 'digex']) # removed after initial runs of LSA results

In [None]:
# Note: Had a lot of trouble getting nltk installed -- locked up the computer each time; tried repeatedly, and didn't seem to finish, but remaining started to work.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [32]:
vectorizer = TfidfVectorizer(stop_words=stopset,use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [33]:
# The Shape of X -- there are lot of docs here
X.shape

(987, 257660)

In [22]:
# Sample output to confirm data exists.
print corpus[1]

From: wingo%cspara.decnet@Fedex.Msfc.Nasa.Gov
Subject: Re: NASA "Wraps"
Organization: University of Houston
Lines: 160
Distribution: world
NNTP-Posting-Host: judy.uh.edu
News-Software: VAX/VMS VNEWS 1.41    

In article <1993Apr18.034101.21934@iti.org>, aws@iti.org (Allen W. Sherzer) writes...
>In article <17APR199316423628@judy.uh.edu> wingo%cspara.decnet@Fedex.Msfc.Nasa.Gov writes:
> 
>>I don't care who told you this it is not generally true. I see EVERY single
>>line item on a contract and I have to sign it. There is no such thing as
>>wrap at this university. 
> 
>Dennis, I have worked on or written proposals worth tens of millions
>of $$. Customers included government (including NASA), for profit and
>non-profit companies. All expected a wrap (usually called a fee). Much
>of the work involved allocating and costing the work of subcontractors.
>The subcontractors where universities, for-profits, non-profits, and
>even some of the NASA Centers for the Commercialization of Space. ALL

In [34]:
X[0]

<1x257660 sparse matrix of type '<type 'numpy.float64'>'
	with 241 stored elements in Compressed Sparse Row format>

In [35]:
# Scored words
print X[0]

  (0, 90788)	0.0526015849478
  (0, 65713)	0.0526015849478
  (0, 14557)	0.0722114880303
  (0, 74649)	0.0722114880303
  (0, 253687)	0.0722114880303
  (0, 250955)	0.0537832364734
  (0, 135639)	0.0537832364734
  (0, 51350)	0.0537832364734
  (0, 53299)	0.0537832364734
  (0, 173825)	0.0537832364734
  (0, 254107)	0.0537832364734
  (0, 110162)	0.0537832364734
  (0, 207104)	0.0537832364734
  (0, 32853)	0.0537832364734
  (0, 124197)	0.0537832364734
  (0, 24233)	0.0544273072111
  (0, 198700)	0.0765191142632
  (0, 76889)	0.0765191142632
  (0, 39264)	0.0765191142632
  (0, 96724)	0.0765191142632
  (0, 50723)	0.0765191142632
  (0, 136870)	0.0765191142632
  (0, 223026)	0.0765191142632
  (0, 211135)	0.0765191142632
  (0, 186679)	0.0765191142632
  :	:
  (0, 178749)	0.0348524088176
  (0, 40246)	0.0594205851095
  (0, 254755)	0.0161553377471
  (0, 215091)	0.0334733014864
  (0, 106845)	0.064341570045
  (0, 234549)	0.0676988169938
  (0, 257576)	0.0688795555767
  (0, 120077)	0.0617912428951
  (0, 45439)	0.076

In [36]:
# Now for the LSA...
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [37]:
# The first row for the V matrix (the term by concept matrix):
lsa.components_[0]

array([ 0.01016643,  0.0005223 ,  0.00024953, ...,  0.00234562,
        0.00234562,  0.00234562])

In [38]:
# So what terms are included in the above array for the first concept?
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10] #only first 10 terms
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
henry
space
toronto
nasa
would
access
zoo
zoo toronto
writes
alaska
 
Concept 1:
henry
toronto
zoo toronto
zoo
spencer
toronto henry
zoo toronto henry
see
high
henry zoo
 
Concept 2:
pat
access
also
like
lines
would
first
enough
fred
better
 
Concept 3:
think
henry
subject
lines
zoo
toronto henry
zoo toronto henry
might
ti
spencer
 
Concept 4:
venus
moon
planet
kilometers
solar system
orbit
miles
could
think
spacecraft
 
Concept 5:
would
us
writes
launch
like
long
article
aurora
could
university
 
Concept 6:
writes
space
one
launch
toronto
satellite
solar
better
time
station
 
Concept 7:
space
orbit
organization
billion
nsmca
henry spencer
nsmca aurora
nsmca aurora alaska
solar
jsc
 
Concept 8:
one
toronto
things
alaska
toronto zoology
zoology
year
00 00 gmt
man
like
 
Concept 9:
nasa
prb access pat
hst
university
one
time
sky
around
moon
see
 
Concept 10:
nasa
also
launch
like
zoo
writes
henry spencer
moon
need
writes article
 
Concept 11:
access
mission
organization
hst
kn

In [None]:
# a great deal of discussion revolving around launching spacecraft; note: the email addresses provide a lot of messy data that 
# could be difficult to filter out because some email accounts include important names that might also be a key part
# of the discussion (e.g., NASA).