In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


In [2]:
import re
import pandas as pd
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\millm_000\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Set letters to lowercase

In [4]:
corpus = [x.lower() for x in corpus]

## Remove email addresses

In [10]:
corpus = [re.sub(r'(\s)(\S+\@\S+)(\s)', r'\1\3', corpus[x]) for x in range(len(corpus))]

In [21]:
stopset = set(stopwords.words('english'))
stopset.update(['\n', '--', '+', '.', '|', '20', 'would', 'edu', 
                'com', 'net', 'eng', 'umd', 'sysmgr', 'ti', 'uiuc', 
                'ti', 'msu', 'jpl', 'vnet', 'ibm', 're', '_', 'hst', 
                'subject', '__', '___', '____', '22', '575', '3539', 
                'nntp', 'posting', 'host', '10', '30', '50', '14', 
               'henry', 'spencer', 'pat', 'digex', 'access', 'some'])

# TF-IDF Vectorizing

Utilizing TF-IDF vectorizer to take the corpus and convert each document into a sparse matrix of TFIDF features

In [22]:
#Before
corpus[0]

"from: \nsubject: re: cub fever.\norganization: ball state university, muncie, in - univ. computing svc's\nlines: 21\n\n\nin article   (orin roth) writes:\n> \n>    cub fever is hitting me again. i'm beginning to think they have a \n>    chance this year. (what the heck am i thinking?)\n>    sorry. just a moment of incompetence.\n>    i'll be ok. really. \n>    orin.\n>    bradley u.\n> \n> --\n> i'm really a jester in disguise!                                   \ni hear ya!  then again, we must remember that we are indeed cub fans, and\nthat the cubs will eventually blow it.  after all, the cubs are the easiest\nteam in the national league to root for.  no pressure.  you know they will\nlose eventually.  oh well, i suppose we must have faith.  after all, they\ndo look pretty good, and they don't even have sandberg back yet.  \n\ncubs in '93!!!!!\n\ncha\n"

In [23]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [24]:
X[0]

<1x177684 sparse matrix of type '<class 'numpy.float64'>'
	with 197 stored elements in Compressed Sparse Row format>

In [25]:
#After
print(X[0])

  (0, 46723)	0.0795476292247
  (0, 176983)	0.0795476292247
  (0, 25633)	0.0795476292247
  (0, 136768)	0.0795476292247
  (0, 57639)	0.0795476292247
  (0, 71238)	0.0795476292247
  (0, 124297)	0.0795476292247
  (0, 95419)	0.0795476292247
  (0, 60010)	0.0795476292247
  (0, 106118)	0.0795476292247
  (0, 152674)	0.0795476292247
  (0, 169953)	0.0795476292247
  (0, 111183)	0.0795476292247
  (0, 57780)	0.0795476292247
  (0, 95907)	0.0795476292247
  (0, 87606)	0.0795476292247
  (0, 124212)	0.0795476292247
  (0, 133970)	0.0795476292247
  (0, 90164)	0.0795476292247
  (0, 106738)	0.0795476292247
  (0, 155042)	0.0795476292247
  (0, 54363)	0.0795476292247
  (0, 46763)	0.0795476292247
  (0, 32320)	0.0795476292247
  (0, 57766)	0.0795476292247
  :	:
  (0, 104393)	0.0566039504214
  (0, 146162)	0.0420202553086
  (0, 157660)	0.048956057487
  (0, 75432)	0.0541418788191
  (0, 175371)	0.0242992378624
  (0, 39259)	0.0423943106965
  (0, 156963)	0.0243362013047
  (0, 29437)	0.0580772793519
  (0, 77715)	0.0370667

### LSA

Input:  X, a matrix where m is the number of documents I have, and n is the number of terms.

Process:   I'm going to decompose X into three matricies called U, S, and T.  When we do the decomposition, we have to pick a value k, that's how many concepts we are going to keep.  

$$X \approx USV^{T}$$

U will be a m x k matrix.  The rows will be documents and the columns will be 'concepts'

S will be a k x k diagnal matrix.   The elements will be the amount of variation captured from each concept.

V will be a n x k (mind the transpose) matrix.   The rows will be terms and the columns will be conepts.

In [26]:
X.shape

(994, 177684)

In [27]:
lsa = TruncatedSVD(n_components=100, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=100,
       random_state=None, tol=0.0)

In [28]:
#This is the first row for V
lsa.components_[0]

array([ 0.02011662,  0.00673544,  0.0010296 , ...,  0.00118638,
        0.00118638,  0.00118638])

In [31]:
import sys
print (sys.version)

3.5.2 |Anaconda 4.2.0 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]


In [30]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print( "Concept %d:" % i)
    for term in sortedTerms:
        print( term[0])
    print( " ")

Concept 0:
year
team
game
writes
article
baseball
players
games
one
good
 
Concept 1:
jewish
baseball players
come
players
jewish baseball
jewish baseball players
anyone
vb30
koufax
sandy koufax
 
Concept 2:
hall
morris
dave
fame
team
hall fame
jewish
give
little
02
 
Concept 3:
02
03
00
04
lost
morris
last year
05
distribution
batting
 
Concept 4:
clutch
year
better
since
years
could
past
say
sabo
jewish
 
Concept 5:
morris
gant
clutch
hirschbeck
let
hit
big
get
go
12
 
Concept 6:
morris
team
last
year
come
jays
al
12
big
list
 
Concept 7:
home
batting
hit
bonds
game
clutch
reply
00
league
least
 
Concept 8:
might
team
batting
season
clutch
fan
good
go
braves
1988
 
Concept 9:
time
new
article
bonds
ted
alomar
writes
williams
win
best
 
Concept 10:
year
games
like
hitter
much
00
last year
mets
still
clutch
 
Concept 11:
year
well
lopez
really
good
braves
players
season
ever
also
 
Concept 12:
team
games
game
00
hirschbeck
winning
cubs
look
think
big
 
Concept 13:
team
back
pitcher
00


In [32]:
lsa.components_

array([[  2.01166186e-02,   6.73543955e-03,   1.02960172e-03, ...,
          1.18638032e-03,   1.18638032e-03,   1.18638032e-03],
       [ -7.16869137e-03,  -1.13314343e-03,  -2.18395639e-02, ...,
          1.01575629e-03,   1.01575629e-03,   1.01575629e-03],
       [  6.29191578e-03,   2.97821406e-03,  -6.16063379e-03, ...,
          1.61172324e-03,   1.61172324e-03,   1.61172324e-03],
       ..., 
       [ -1.52170764e-02,   1.38118585e-02,  -3.71444167e-02, ...,
         -5.05643163e-05,  -5.05643163e-05,  -5.05643163e-05],
       [ -5.18977282e-02,   2.15349212e-02,   8.20529015e-02, ...,
         -9.22714828e-04,  -9.22714828e-04,  -9.22714828e-04],
       [  2.74179684e-02,  -1.19550227e-02,  -3.94522781e-02, ...,
          2.32897379e-03,   2.32897379e-03,   2.32897379e-03]])