# SD/TSIA_214  TP4
## WANG Yuqing

In [1]:
from __future__ import print_function
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups
import numpy as np

In [2]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

In [3]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1] ])
        print(message)
    print()

In [4]:
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 1.997s.


In [5]:
print("Extracting tf-idf features...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features...
done in 0.493s.


In [6]:
print("Fitting the NMF model (Frobenius norm) with tf-idf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.303s.


In [7]:
print("Topics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp mail cc pub article information hope program mac email home contact blood
Topic #6: file problem files format win sound ftp pub read save site help image available create copy running memory self version
Topic #7: game team games year win play season player

## 1. Test and comment on the effect of varying the initialisation, especially using random nonnegative values as initial guesses.

In [8]:
print("Topics in NMF model (random initalisation):")
nmf2 = NMF(n_components=n_components, init='random', random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print_top_words(nmf2, tfidf_feature_names, n_top_words)

Topics in NMF model (random initalisation):
Topic #0: just like don people time know good make way use right say ve really want government ll new did going
Topic #1: christian bible true christians faith jesus religion people does christ belief church life truth read reading believe statement atheism claim
Topic #2: god jesus sin heaven lord christ believe mary does bible love human knowledge life marriage faith say children atheism knows
Topic #3: think don people win extra just early sold sex need actually happen means pretty toronto wasn agree david statement mike
Topic #4: drive drives hard disk software floppy card mac 00 computer scsi controller power apple mb pc sale rom monitor internal
Topic #5: thanks know does mail advance hi info interested email anybody looking card help like appreciated list send information video need
Topic #6: windows file dos files program using use window problem os help running drivers application pc ms ftp version screen available
Topic #7: game tea

We can see that with the change of the different kinds of initialisation, top words may belong to different topics. Topic 1 totally changed it's meaning, it has almost the same topic as Topic 2.

## 2. Compare and comment on the difference between the results obtained with l2 cost compared to the generalised Kullback-Liebler cost.

In [9]:
print("Topics in NMF model (random initalisation):")
nmf3 = NMF(n_components=n_components, solver='mu', beta_loss='kullback-leibler', random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print_top_words(nmf3, tfidf_feature_names, n_top_words)

Topics in NMF model (random initalisation):
Topic #0: people don just like think did say time make know really right said things way ve course didn question probably
Topic #1: windows help thanks using hi looking info video dos pc does anybody ftp appreciated mail know advance available use card
Topic #2: god does jesus true book christian bible christians religion faith believe life church christ says know read exist lord people
Topic #3: thanks know bike interested mail like new car edu heard just price list email hear want cars thing sounds reply
Topic #4: 10 00 sale time power 12 new 15 year 30 offer condition 14 16 model 11 monitor 100 old 25
Topic #5: space government number public data states earth security water research nasa general 1993 phone information science technology provide blood internet
Topic #6: edu file com program soon try window problem remember files sun send library article mike wrong think code win manager
Topic #7: game team year games play win season points 

With the kullback-leibler loss function, we can see that the accuracy is improved. Some topics changed their meaning, but the words in a topic are more relatied.

## 3. Test and comment on the results obtained using a simpler term-frequency representation as input when considering the Kullback-Liebler cost.

In [10]:
print("Topics in NMF model (simpler term-frequency):")
count_vectorizer = CountVectorizer(max_features=n_features, stop_words='english' )
count = count_vectorizer.fit_transform(data_samples)
count_feature_names = count_vectorizer.get_feature_names()
print_top_words(nmf3, count_feature_names, n_top_words)

Topics in NMF model (simpler term-frequency):
Topic #0: people don kept like think did save time make know real results runs things way various couple didn quality pro
Topic #1: windows hi thanks uses high looking information vga dos pc does anti function apply mail know administration automatic unless canada
Topic #2: goes does jewish true bob choice best christ related family begin life christian chip say know ray expect lord people
Topic #3: thanks know bible interesting mail like new came edu heart kept price list email heard vs care thing sound religious
Topic #4: 10 00 safety time power 12 new 15 year 30 offer completely 14 16 model 11 monitor 100 old 25
Topic #5: soviet graphics number pub dangerous states earth section water require nasa generally 1993 phone input school technology prove black involved
Topic #6: edu files cold product soldiers try window probes religion final sun self library army mike wrong think clock win manager
Topic #7: games team year gas play win scsi po

After using a simpler term-frequency representation as input, there are some slightly change in the result.We can see from the topic 9 that the top words changed a lot, but the number remains the same.

# CUSTOM NMF IMPLEMENTATION

In [11]:
def b_divergence(b,x,y):
    if b == 0:
        return x/y-np.log(x/y)-1
    if b == 1:
        return x*np.log(x/y)-x+y
    else:
        return 1/(b*(b-1))*(x**b+(b-1)*y**b-b*x*y**(b-1))

In [12]:
order = 6
rank = 4
iteration = 10
b = 1 #Kullback-Liebler divergence

w = np.random.randint(0,9,size=(rank, order))
h = np.random.randint(0,9,size=(order, rank))
v = np.dot(w, h)

for i in range(iteration):
    sum_1 = 0
    for j in range(v.shape[0]):
        vn = v[:][j]
        vn_hat = np.dot(np.dot(w, w.T), vn)
        sum_2 = 0
        for k in range(v.shape[1]):
            sum_2 += b_divergence(b,vn[k],vn_hat[k])
    sum_1 += sum_2
            

  """
