In [1]:
import numpy as np

In [13]:
X = np.random.randint(0, 10, (10, 3))

In [14]:
XXt = X.dot(X.T)

In [15]:
eigVals, eigVecs = np.linalg.eig(XXt)
eigValsIndices = eigVals.argsort()


In [19]:
eigValsIndices

array([3, 7, 6, 8, 9, 5, 4, 2, 1, 0])

In [22]:
eigValsIndices = eigValsIndices[:-3:-1]

In [23]:
eigVals

array([  8.54173420e+02 +0.00000000e+00j,
         1.01332945e+02 +0.00000000e+00j,
         6.64936354e+01 +0.00000000e+00j,
        -4.33857741e-14 +0.00000000e+00j,
         1.23812381e-14 +4.69926005e-15j,
         1.23812381e-14 -4.69926005e-15j,
        -4.01756745e-15 +8.67758846e-15j,
        -4.01756745e-15 -8.67758846e-15j,
        -2.69588189e-15 +0.00000000e+00j,   2.02267113e-15 +0.00000000e+00j])

In [24]:
U = eigVecs[:, eigValsIndices]

In [25]:
U

array([[ 0.18723896+0.j,  0.25285278+0.j],
       [-0.50011632+0.j,  0.27028291+0.j],
       [ 0.06263949+0.j,  0.41450369+0.j],
       [-0.53057684+0.j,  0.25119143+0.j],
       [-0.07438622+0.j,  0.27964501+0.j],
       [ 0.24330675+0.j,  0.39014237+0.j],
       [ 0.05569041+0.j,  0.37470013+0.j],
       [-0.14710959+0.j,  0.30076512+0.j],
       [ 0.57793946+0.j,  0.20271356+0.j],
       [ 0.07682183+0.j,  0.35459434+0.j]])

In [26]:
S = np.mat(np.diag(np.sqrt(eigVals[eigValsIndices])))

In [28]:
XtX = X.T.dot(X)
eigVals, eigVecs = np.linalg.eig(XtX)
eigValsIndices = eigVals.argsort()
eigValsIndices = eigValsIndices[:-3:-1]
V = eigVecs[:, eigValsIndices]

In [29]:
V

array([[ 0.55797225,  0.30662856],
       [ 0.63498108, -0.75602257],
       [ 0.53429018,  0.57828089]])

In [31]:
1+3j - 6

(-5+3j)

In [32]:
a = np.random.randint(0, 5, 5)

In [33]:
a

array([1, 0, 4, 0, 3])

In [34]:
a.argsort()

array([1, 3, 0, 4, 2])

In [42]:
a[:-3:-1]

array([3, 0])

In [6]:
# analyze the NG20 dataset using the class sklearn LDA using 10 topics
from __future__ import  print_function
from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

In [8]:
n_sampes = 2000
n_features = 1000
n_topics = 20
n_top_words = 10

In [7]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print('Topic #%d' % topic_idx)
        print(' '.join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    
    print()

In [4]:
print('load dataset')
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
print('done in %0.3fs' % (time() - t0))

load dataset
done in 1.254s


In [5]:
# use tf (raw term count) features for LDA
print('extracting tf features for LDA')
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print('done in %0.3fs' % (time() - t0))

extracting tf features for LDA
done in 1.550s


In [9]:
# use tfidf feature for NMF
print('extracting tf-idf features for NMF')
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print('done in %0.3fs' % (time() - t0))

extracting tf-idf features for NMF
done in 1.821s


In [14]:
print('fitting LDA models with tf features, n_sampes=%d and n_features=%d' % (n_sampes, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
t0 = time()
lda.fit(tf)
print('done in %0.3fs' % (time() - t0))

fitting LDA models with tf features, n_sampes=2000 and n_features=1000




done in 13.892s


In [10]:
print('fitting NMF models with tfidf features, n_sampes=%d and n_features=%d' % (n_sampes, n_features))
nmf = NMF(n_components=n_topics, random_state=1, alpha=0.1, l1_ratio=.5)
t0 = time()
nmf.fit(tfidf)
print('done in %0.3fs' % (time() - t0))

fitting NMF models with tfidf features, n_sampes=2000 and n_features=1000
done in 7.846s


In [15]:
print('topics in LDA model:')
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

topics in LDA model:
Topic #0
people gun state control right guns crime states law police
Topic #1
time question book years did like don space answer just
Topic #2
mr line rules science stephanopoulos title current define int yes
Topic #3
key chip keys clipper encryption number des algorithm use bit
Topic #4
edu com cs vs w7 cx mail uk 17 send
Topic #5
use does window problem way used point different case value
Topic #6
windows thanks know help db does dos problem like using
Topic #7
bike water effect road design media dod paper like turn
Topic #8
don just like think know people good ve going say
Topic #9
car new price good power used air sale offer ground
Topic #10
file available program edu ftp information files use image version
Topic #11
ax max b8f g9v a86 145 pl 1d9 0t 34u
Topic #12
government law privacy security legal encryption court fbi technology information
Topic #13
card bit memory output video color data mode monitor 16
Topic #14
drive scsi disk mac hard apple drives contr

In [11]:
print('topics in NMF model:')
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

topics in NMF model:
Topic #0
don people just think like time good know right ve
Topic #1
use program software edu available graphics pc ftp using server
Topic #2
god jesus bible christ faith believe christians christian heaven sin
Topic #3
geb dsl chastity n3jxp cadre shameful pitt intellect skepticism surrender
Topic #4
key chip encryption clipper keys escrow government algorithm security secure
Topic #5
drive disk drives hard floppy ide boot controller cd internal
Topic #6
mail address list send mailing post info edu com reply
Topic #7
thanks advance hi looking info appreciated help email anybody information
Topic #8
sale offer shipping price condition new asking sell best email
Topic #9
card video monitor vga bus drivers cards driver ati color
Topic #10
game team games players hockey year season play win league
Topic #11
windows dos ms microsoft os running nt version drivers driver
Topic #12
window manager application display motif xterm root position tvtwm expose
Topic #13
car car

In [16]:
for topicidx, topic in enumerate(nmf.components_):
    print(topicidx, ' ', topic)

0   [ 0.          0.10522514  0.         ...,  0.          0.          0.        ]
1   [ 0.  0.  0. ...,  0.  0.  0.]
2   [ 0.  0.  0. ...,  0.  0.  0.]
3   [ 0.  0.  0. ...,  0.  0.  0.]
4   [ 0.  0.  0. ...,  0.  0.  0.]
5   [ 0.  0.  0. ...,  0.  0.  0.]
6   [ 0.  0.  0. ...,  0.  0.  0.]
7   [ 0.  0.  0. ...,  0.  0.  0.]
8   [ 0.00112823  0.07013121  0.         ...,  0.          0.          0.        ]
9   [ 0.  0.  0. ...,  0.  0.  0.]
10   [ 0.          0.01015273  0.         ...,  0.          0.          0.        ]
11   [ 0.  0.  0. ...,  0.  0.  0.]
12   [ 0.  0.  0. ...,  0.  0.  0.]
13   [ 0.          0.03634624  0.         ...,  0.          0.          0.        ]
14   [ 0.  0.  0. ...,  0.  0.  0.]
15   [ 0.         0.0145849  0.        ...,  0.         0.         0.       ]
16   [ 2.73515411  0.03825862  0.         ...,  0.          0.          0.        ]
17   [ 0.  0.  0. ...,  0.  0.  0.]
18   [ 0.  0.  0. ...,  0.  0.  0.]
19   [ 0.          0.09138045  0.         ..

In [19]:
import numpy as np
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
from sklearn.decomposition import NMF
model = NMF(n_components=2, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [21]:
H

array([[ 2.09783018,  0.30560234],
       [ 2.13443044,  2.13171694]])

In [22]:
W

array([[ 0.        ,  0.46880684],
       [ 0.55699523,  0.3894146 ],
       [ 1.00331638,  0.41925352],
       [ 1.6733999 ,  0.22926926],
       [ 2.34349311,  0.03927954],
       [ 2.78981512,  0.06911798]])

In [23]:
W.dot(H)

array([[ 1.00063558,  0.99936347],
       [ 1.99965977,  1.00034074],
       [ 2.99965485,  1.20034566],
       [ 3.9998681 ,  1.0001321 ],
       [ 5.00009002,  0.79990984],
       [ 6.00008587,  0.999914  ]])