Author: Melanie Klein

Date: 6 February 2017

CSC570R Week 4 Lab: Latent Semantic Analysis

In [12]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [13]:
#download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mmcla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
#download newsgroups dataset
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [51]:
stopset = set(stopwords.words('english'))
stopset.update(['000', '00', '01', '02', '03', '04', '05', 'com', 'edu', 'vb30', 'netcom', 'mail'])

In [52]:
#Peek at raw data
corpus[0]

"From: writingctr@leo.bsuvc.bsu.edu\nSubject: Re: CUB fever.\nOrganization: Ball State University, Muncie, In - Univ. Computing Svc's\nLines: 21\n\n\nIn article <kingoz.735285670@camelot>, kingoz@camelot.bradley.edu (Orin Roth) writes:\n> \n>    CUB fever is hitting me again. I'm beginning to think they have a \n>    chance this year. (what the heck am i thinking?)\n>    Sorry. Just a moment of incompetence.\n>    I'll be ok. Really. \n>    Orin.\n>    Bradley U.\n> \n> --\n> I'm really a jester in disguise!                                   \nI hear ya!  Then again, we must remember that we are indeed Cub fans, and\nthat the Cubs will eventually blow it.  After all, the Cubs are the easiest\nteam in the National League to root for.  No Pressure.  You know they will\nlose eventually.  Oh well, I suppose we must have faith.  After all, they\ndo look pretty good, and they don't even have Sandberg back yet.  \n\nCUBS IN '93!!!!!\n\nCHA\n"

In [53]:
#Calculate TF-IDF
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [54]:
#TF-IDF Vectorized data
print(X[0])

  (0, 51330)	0.073941735884
  (0, 187940)	0.073941735884
  (0, 28990)	0.073941735884
  (0, 144818)	0.073941735884
  (0, 62636)	0.073941735884
  (0, 76423)	0.073941735884
  (0, 132020)	0.073941735884
  (0, 101823)	0.073941735884
  (0, 65024)	0.073941735884
  (0, 112984)	0.073941735884
  (0, 161836)	0.073941735884
  (0, 179889)	0.073941735884
  (0, 118386)	0.073941735884
  (0, 62774)	0.073941735884
  (0, 102315)	0.073941735884
  (0, 94049)	0.073941735884
  (0, 131935)	0.073941735884
  (0, 141963)	0.073941735884
  (0, 96679)	0.073941735884
  (0, 113626)	0.073941735884
  (0, 164233)	0.073941735884
  (0, 59253)	0.073941735884
  (0, 51370)	0.073941735884
  (0, 35799)	0.073941735884
  (0, 62760)	0.073941735884
  :	:
  (0, 184463)	0.0156508441505
  (0, 142211)	0.0555654199865
  (0, 121069)	0.111130839973
  (0, 37607)	0.0996519817033
  (0, 40840)	0.105229895418
  (0, 16503)	0.073941735884
  (0, 93435)	0.111130839973
  (0, 25785)	0.0161879311096
  (0, 7090)	0.0377753217633
  (0, 99514)	0.0103180

In [55]:
#Perform LSA using singular value decomposition
lsa = TruncatedSVD(n_components=50, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=100,
       random_state=None, tol=0.0)

In [56]:
#The first row of matrix V
lsa.components_[0]

array([ 0.00054257,  0.00054257,  0.00054257, ...,  0.00113271,
        0.00113271,  0.00113271])

In [57]:
#Print the top 10 terms for each concept
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip (terms,comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
year
team
would
game
writes
cs
article
baseball
players
games
 
Concept 1:
jewish
come
baseball
mets
baseball players
good
reply
see
lafayette
lafibm
 
Concept 2:
ibm
go
runs
team
aix
baseball
come
run
morris
mjones
 
Concept 3:
team
morris
news
many
nntp posting
game
john
alomar
roger
home
 
Concept 4:
clutch
team
runs
000th career win
roger
say
two
players
could
hit
 
Concept 5:
team
games
win
braves
least
anyone
pitching
look
baseball
better
 
Concept 6:
first
win
know
game
games
may
runs
news
might
subject
 
Concept 7:
would
even
games
player
go
time
pitch
roger
run
alomar
 
Concept 8:
year
say
back
mets
make
last
gant
stats
hcf jhu
000th career win
 
Concept 9:
games
ball
game
players
morris
good
well
time
lost
runs
 
Concept 10:
world
first
baseball
better
like
alomar
era
001 100
day
say
 
Concept 11:
first
last
distribution
hitter
writes article
pitchers
many
know
john
base
 
Concept 12:
two
still
go
nntp
pitch
play
host
nntp posting
university
players
 
Concept 13:
t