# NONNEGATIVE MATRIX FACTORISATION FOR TOPIC EXTRACTION
## - TOPIC EXTRACTION FROM DOCUMENTS -

The goal is to study the use of nonnegative matrix factorisation (NMF) for topic extraction from a dataset of text documents. The rationale is to interpret each extracted NMF component as being associated with a specific topic.


Study and test the following script (introduced  on http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html)

In [1]:
from __future__ import print_function
from time import time

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

In [13]:
def featuresToVector(vectorizer=None, random_state=None, n_samples=2000):
    """Load the first documents of 20 newsgroups and turn them into features.

    Parameters
    ----------
    vectorizer : object, optional
        Vectorizer exposing ``fit_transform`` and a feature-name accessor.
        When omitted, a ``TfidfVectorizer(max_df=0.95, min_df=2,
        max_features=1000, stop_words='english')`` is used.
    random_state : int, optional
        Seed passed to ``fetch_20newsgroups`` for shuffling. Defaults to 1
        so that results can be replicated.
    n_samples : int, optional
        Number of leading documents of the shuffled dataset to keep.
        Defaults to 2000.

    Returns
    -------
    features
        Whatever ``vectorizer.fit_transform`` returns for the documents.
    names : list
        Feature names, in the same order as the columns of ``features``.
    """
    # Take default parameters from assignment script
    if vectorizer is None:
        n_features = 1000
        _vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                      max_features=n_features,
                                      stop_words='english')
    else:
        _vectorizer = vectorizer

    # For results replicability
    _random_state = 1 if random_state is None else random_state

    # Load data
    print("Loading dataset...")
    t0 = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=_random_state,
                                 remove=('headers', 'footers', 'quotes'))
    data_samples = dataset.data[:n_samples]
    print("done in %0.3fs." % (time() - t0))

    # Vectorize
    print("Vectorizing...")
    t0 = time()
    features = _vectorizer.fit_transform(data_samples)
    # Recent scikit-learn releases only provide get_feature_names_out();
    # older ones only provide get_feature_names(). Support both and always
    # hand back a plain list.
    if hasattr(_vectorizer, "get_feature_names_out"):
        names = list(_vectorizer.get_feature_names_out())
    else:
        names = _vectorizer.get_feature_names()
    print("done in %0.3fs." % (time() - t0))
    return features, names
    
def print_top_words(model, feature_names, n_top_words):
    """Print, for every component of ``model``, its highest-weighted features.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row of weights per topic).
    feature_names : sequence of str
        Name of each feature, indexed like the columns of ``components_``.
    n_top_words : int
        How many feature names to list per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading indices.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in ranked)
        print("Topic #%d: %s" % (topic_idx, top_terms))
    # Blank line separating this listing from whatever is printed next.
    print()
    
def NmfModel(features, vectorizer=None, random_state=None,
             beta_loss=None, init=None, W=None, H=None, K=None,
             verbose=False):
    """Fit a scikit-learn ``NMF`` estimator on ``features`` and return it.

    Parameters
    ----------
    features : matrix with a ``shape`` attribute
        Document/feature matrix to factorise.
    vectorizer : str, optional
        Label of the feature representation, used only in the progress
        message. Defaults to ``'tf_idf'``.
    random_state : int, optional
        Seed given to ``NMF``. Defaults to 1.
    beta_loss : str, optional
        Loss given to ``NMF``. When omitted, ``'frobenius'`` is used with the
        ``'cd'`` solver; any explicit value selects the ``'mu'`` solver.
    init : str, optional
        Initialisation scheme given to ``NMF``. Defaults to ``'random'``.
    W, H : array-like, optional
        Forwarded to ``fit_transform`` only when ``init`` is given.
    K : int, optional
        Number of components. Defaults to 10.
    verbose : bool, optional
        Forwarded to ``NMF``.

    Returns
    -------
    The fitted ``NMF`` estimator.
    """
    # Resolve every optional argument to the value actually used.
    topic_count = 10 if K is None else K
    feature_label = 'tf_idf' if vectorizer is None else vectorizer
    seed = 1 if random_state is None else random_state
    init_mode = 'random' if init is None else init
    # Solver and loss are decided together: coordinate descent is used only
    # when the caller did not ask for a specific loss.
    if beta_loss is None:
        solver_name, loss_name = 'cd', 'frobenius'
    else:
        solver_name, loss_name = 'mu', beta_loss

    doc_count, term_count = features.shape[0], features.shape[1]
    print("Fitting the NMF model (" + loss_name + " norm) with "
          + feature_label
          + " features, n_samples=%d and n_features=%d..."
          % (doc_count, term_count))

    started = time()
    nmf = NMF(n_components=topic_count,
              random_state=seed,
              init=init_mode,
              solver=solver_name,
              beta_loss=loss_name,
              alpha=.1, l1_ratio=.5, verbose=verbose)
    if init is None:
        nmf.fit(features)
    else:
        # Only this path hands the caller-supplied factors to the estimator.
        nmf.fit_transform(features, W=W, H=H)
    print("done in %0.3fs." % (time() - started))

    return nmf

def runNmf(vectorizer=None, vectorizerName=None, random_state=None, beta_loss=None,
           init=None, W=None, H=None, K=None, verbose=False, print_results=False,
           n_top_words=10):
    """Vectorize the dataset, fit an NMF model on it and optionally print topics.

    Parameters
    ----------
    vectorizer : object, optional
        Vectorizer forwarded to ``featuresToVector``.
    vectorizerName : str, optional
        Label of the feature representation, forwarded to ``NmfModel`` as its
        ``vectorizer`` argument.
    random_state, beta_loss, init, W, H, K, verbose
        Forwarded unchanged to ``NmfModel``.
    print_results : bool, optional
        When true, print the top words of every topic after fitting.
    n_top_words : int, optional
        Number of words listed per topic when ``print_results`` is true.

    Returns
    -------
    nmf
        The fitted model returned by ``NmfModel``.
    names
        The feature names returned by ``featuresToVector``.
    """
    # NOTE: random_state is only forwarded to the model; the dataset is
    # loaded with featuresToVector's own default seed.
    features, names = featuresToVector(vectorizer=vectorizer)
    nmf = NmfModel(features, vectorizer=vectorizerName,
                   random_state=random_state,
                   beta_loss=beta_loss, init=init,
                   W=W, H=H, K=K, verbose=verbose)
    if print_results:
        # The loss label has to be resolved here: no _beta_loss name exists
        # in this scope, so referencing it raised NameError.
        loss_label = 'frobenius' if beta_loss is None else beta_loss
        print("\nTopics in NMF model (%s norm):" % loss_label)
        print_top_words(nmf, names, n_top_words)

    return nmf, names

### 1. Test and comment on the effect of varying the initialisation, especially using random nonnegative values as initial guesses (for W and H coefficients, using the notations introduced during the lecture).

In [14]:
nmf, names = runNmf(random_state=0, beta_loss='frobenius',
                    init='nndsvda', n_top_words=20)
print('Converged after ' + str(nmf.n_iter_) + ' iterations.')
print('Frobenius norm of matrix difference: ' 
      + str(nmf.reconstruction_err_))

Loading dataset...
done in 1.150s.
Vectorizing...
done in 0.271s.
Fitting the NMF model (frobenius norm) with tf_idf features, n_samples=2000 and n_features=1000...
done in 0.068s.
Converged after 30 iterations.
Frobenius norm of matrix difference: 42.14133796612238


In [15]:
nmf, names = runNmf(random_state=0, beta_loss='frobenius',
                    init='nndsvdar', n_top_words=20)
print('Converged after ' + str(nmf.n_iter_) + ' iterations.')
print('Frobenius norm of matrix difference: ' 
      + str(nmf.reconstruction_err_))

Loading dataset...
done in 1.157s.
Vectorizing...
done in 0.263s.
Fitting the NMF model (frobenius norm) with tf_idf features, n_samples=2000 and n_features=1000...
done in 0.086s.
Converged after 40 iterations.
Frobenius norm of matrix difference: 42.14711940431842


In [16]:
nmf, names = runNmf(random_state=0, beta_loss='frobenius', 
                    init='random', n_top_words=20)
print('Converged after ' + str(nmf.n_iter_) + ' iterations.')
print('Frobenius norm of matrix difference: '
      + str(nmf.reconstruction_err_))

Loading dataset...
done in 1.171s.
Vectorizing...
done in 0.267s.
Fitting the NMF model (frobenius norm) with tf_idf features, n_samples=2000 and n_features=1000...
done in 0.084s.
Converged after 50 iterations.
Frobenius norm of matrix difference: 42.17995382807882


Three different initialisation configurations were tested, while maintaining the same cost function (Frobenius norm) and using multiplicative update rules. The 'nndsvda' initialization, as described in the scikit-learn documentation, performs a Nonnegative Double Singular Value Decomposition on the features and then fills zero values with the average value of the features matrix. The 'nndsvdar' initialization performs the same operation, but fills zeros with very small random values. Finally, the random initialization creates two non-negative random matrices properly scaled.

Of the three approaches, the one that showed the best results was undoubtedly 'nndsvda', which converged in only 30 iterations and provided the lowest error.

### 2. Compare and comment on the difference between the results obtained with $l_{2}$ cost compared to the generalised Kullback-Leibler cost.

The $l_{2}$ cost and the Frobenius norm denote the same cost function here: the Frobenius norm of the matrix $V - WH$ is the $l_{2}$ norm of its entries taken as a single vector.

In [17]:
nmf_f, names_f = runNmf(random_state=0, beta_loss='frobenius',
                    init='nndsvda')
print('Converged after ' + str(nmf_f.n_iter_) + ' iterations.')
print('Frobenius norm of matrix difference: ' 
      + str(nmf_f.reconstruction_err_))

Loading dataset...
done in 1.161s.
Vectorizing...
done in 0.258s.
Fitting the NMF model (frobenius norm) with tf_idf features, n_samples=2000 and n_features=1000...
done in 0.070s.
Converged after 30 iterations.
Frobenius norm of matrix difference: 42.14133796612238


In [18]:
# Fit NMF with the generalised Kullback-Leibler cost and 'nndsvda' initialisation.
nmf_k, names_k = runNmf(random_state=0, beta_loss='kullback-leibler',
                    init='nndsvda')
print('Converged after ' + str(nmf_k.n_iter_) + ' iterations.')
# With a Kullback-Leibler loss the reported error is a beta-divergence,
# not a Frobenius norm, so it is labelled accordingly.
print('Beta-divergence reconstruction error: '
      + str(nmf_k.reconstruction_err_))

Loading dataset...
done in 1.144s.
Vectorizing...
done in 0.264s.
Fitting the NMF model (kullback-leibler norm) with tf_idf features, n_samples=2000 and n_features=1000...
done in 1.981s.
Converged after 110 iterations.
Frobenius norm of matrix difference: 211.17742134247516


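The Kullback-Leibler cost seems to perform worse than the $ l_{2} $ cost: it needs about 3.7 times as many iterations to converge (110 versus 30), and its reported error is five times higher. Note, however, that the two reported errors are not directly comparable, because each run reports the value of its own cost function (a Kullback-Leibler-based divergence in one case, a Frobenius norm in the other). Let us also print the topics: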

In [19]:
print('Topics obtained from NFM with l2 cost:')
print_top_words(nmf_f, names_f, 15)

Topics obtained from NFM with l2 cost:
Topic #0: just people don think like know good time make way really say right ve did
Topic #1: windows use dos using window program card help software pc drivers os application video running
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary
Topic #3: thanks know does advance mail info hi interested email anybody like list send information appreciated
Topic #4: 00 car sale 10 price condition new card cars offer 250 12 asking 15 sell
Topic #5: edu soon com send university internet mit ftp mail cc article information pub hope mac
Topic #6: file files problem format win sound read pub ftp save create site running self copy
Topic #7: game team games year win play season players nhl runs goal hockey toronto division flyers
Topic #8: drive drives hard disk floppy mac software mb controller scsi computer rom apple internal power
Topic #9: key chip clipper keys encryption government public use secure 

In [20]:
print('Topics obtained from NFM with Kullback-Leibler cost:')
print_top_words(nmf_k, names_k, 15)

Topics obtained from NFM with Kullback-Leibler cost:
Topic #0: people did right like time things said look say don just know want real work
Topic #1: using windows help work use need hi thanks looking software pc video running card used
Topic #2: does true god mean say says believe read matter fact point people word example particular
Topic #3: know thanks like mail interested does post send list new don want wondering hear email
Topic #4: new 10 old sale 20 offer 15 30 weeks 16 power test check 25 11
Topic #5: number states com control free university government space research including general white women data used
Topic #6: edu remember file try soon sun problem article reading available short couldn written copy mentioned
Topic #7: year won second world team time win game maybe news play season games bad series
Topic #8: think people make hard means drive write don read problems need similar problem case actually
Topic #9: just use sure way good got doesn like wrong want going don 

The topics found seem to be similar (religion, business, games...), but this representation also shows that the l2 cost produces more accurate results: in the topics obtained with the l2 cost, all words within the same topic seem to be specific to it, whilst in the topics obtained with the Kullback-Leibler cost there seems to be less precision. As an example, we can look at topic #2, where both algorithms find the same topic, but the most important words found by using the l2 cost are much more representative than the ones found using the Kullback-Leibler one.

### 3. Test and comment on the results obtained using a simpler term-frequency representation as input (as opposed to the TF-IDF representation considered in the code above) when considering the Kullback-Leibler cost.

In [32]:
# Term-frequency features: the TF-IDF vectorizer with IDF re-weighting switched off.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english', use_idf=False)
nmf, names = runNmf(random_state=0, beta_loss='kullback-leibler',
                    vectorizer=vectorizer, vectorizerName='TermFrequencyVectorizer', init='nndsvda')
print('Converged after ' + str(nmf.n_iter_) + ' iterations.')
# With a Kullback-Leibler loss the reported error is a beta-divergence,
# not a Frobenius norm, so it is labelled accordingly.
print('Beta-divergence reconstruction error: '
      + str(nmf.reconstruction_err_))

Loading dataset...
done in 1.154s.
Vectorizing...
done in 0.282s.
Fitting the NMF model (kullback-leibler norm) with TermFrequencyVectorizer features, n_samples=2000 and n_features=1000...
done in 2.174s.
Converged after 120 iterations.
Frobenius norm of matrix difference: 213.63610577925184


In [33]:
# Raw token-count features.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
nmf, names = runNmf(random_state=0, beta_loss='kullback-leibler',
                    vectorizer=vectorizer, vectorizerName='CountVectorizer', init='nndsvda')
print('Converged after ' + str(nmf.n_iter_) + ' iterations.')
# With a Kullback-Leibler loss the reported error is a beta-divergence,
# not a Frobenius norm, so it is labelled accordingly.
print('Beta-divergence reconstruction error: '
      + str(nmf.reconstruction_err_))

Loading dataset...
done in 1.146s.
Vectorizing...
done in 0.260s.
Fitting the NMF model (kullback-leibler norm) with CountVectorizer features, n_samples=2000 and n_features=1000...
done in 1.745s.
Converged after 120 iterations.
Frobenius norm of matrix difference: 592.1368392281813


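Two simpler representations were tested: the plain Term Frequency representation and the plain count of tokens. However, neither of them seems to produce better results: compared to the previous Tf-Idf representation, both of them need more iterations to converge (120 versus 110) and end with a higher reconstruction error (about 213.6 and 592.1 respectively, versus 211.2).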

## - CUSTOM NMF IMPLEMENTATION -
Implement the multiplicative update rules (derived from the majorisation-minimisation approach) for NMF estimation with β divergences, including the case β = 1 (generalised Kullback-Leibler divergence). Ensure that:
- You can easily choose a custom initialisation for the W and H matrices;
- You can set a custom number of iterations;
- You can monitor the behaviour of the loss function across the iterations and that it is readily decreasing.

Compare your implementation with the one offered by scikit-learn.

In [11]:
import numpy

def custom_NMF(V, k=2, W=None, H=None, max_iter=50, beta=0, tol=0.1, verbose=False):
    """Factorise ``V`` as ``W . H`` with multiplicative updates for a beta-divergence.

    Parameters
    ----------
    V : array-like, shape (n, m)
        Dense nonnegative matrix to factorise.
    k : int, optional
        Inner dimension used when ``W`` and/or ``H`` are drawn at random.
    W : array-like, shape (n, k), optional
        Custom initial left factor. Drawn uniformly in [0, 1) when omitted.
    H : array-like, shape (k, m), optional
        Custom initial right factor. Drawn uniformly in [0, 1) when omitted.
    max_iter : int, optional
        Maximum number of update sweeps.
    beta : number, optional
        Divergence parameter: 1 is the generalised Kullback-Leibler
        divergence, 0 the Itakura-Saito divergence, any other value the
        generic beta-divergence formula.
    tol : float, optional
        Iterations stop once the decrease of the loss, relative to the
        initial loss, is positive and below this threshold.
    verbose : bool, optional
        When true, print the loss and its relative decrease at each sweep.

    Returns
    -------
    W, H : numpy.ndarray
        The estimated factors.

    Raises
    ------
    ValueError
        If ``V`` is None.
    """
    # Lower bound applied to W and H after each sweep so that their product
    # cannot underflow double precision.
    floor = 10**-150

    def _beta_div(V, W, H, beta):
        # Zero entries of V are replaced by the smallest positive double so
        # that the ratios and logarithms below stay finite.
        X = np.where(V == 0, np.finfo(np.double).tiny, V)
        Y = np.dot(W, H)
        if beta == 1:
            # Generalized Kullback-Leibler divergence: x*log(x/y) - x + y
            terms = X * np.log(X / Y) - X + Y
        elif beta == 0:
            # Itakura-Saito divergence: x/y - log(x/y) - 1
            ratio = X / Y
            terms = ratio - np.log(ratio) - 1
        else:
            # Generic beta-divergence:
            # (x^b + (b-1)*y^b - b*x*y^(b-1)) / (b*(b-1)).
            # Dividing the float array (rather than multiplying by
            # 1/(b*(b-1))) avoids integer truncation of the scale factor.
            terms = (np.power(X, beta)
                     + (beta - 1) * np.power(Y, beta)
                     - beta * X * np.power(Y, beta - 1)) / (beta * (beta - 1))
        return terms.sum()

    if V is None:
        raise ValueError("Please provide initial Matrix V.")
    V = np.asarray(V, dtype=float)

    if W is None:
        W = np.random.rand(V.shape[0], k)
    else:
        W = np.asarray(W, dtype=float)

    if H is None:
        H = np.random.rand(k, V.shape[1])
    else:
        H = np.asarray(H, dtype=float)

    # Setup initial error
    init_error = _beta_div(V, W, H, beta)
    if verbose:
        print("Initial error: "+str(init_error))
    error = init_error

    for it in range(max_iter):

        # Update H with W fixed; the product W.H is computed once per update.
        WH = np.dot(W, H)
        numer = np.dot(W.T, np.power(WH, beta - 2) * V)
        denom = np.dot(W.T, np.power(WH, beta - 1))
        H = H * (numer / denom)

        # Update W with the refreshed H.
        WH = np.dot(W, H)
        numer = np.dot(np.power(WH, beta - 2) * V, H.T)
        denom = np.dot(np.power(WH, beta - 1), H.T)
        W = W * (numer / denom)

        # To avoid underflow, clip H and W from below.
        H = np.clip(H, floor, None)
        W = np.clip(W, floor, None)

        bdiv = _beta_div(V, W, H, beta)
        if verbose:
            print("Iteration "+str(it+1)+" - Error : " +str(bdiv))
        # Check if approximation error relative decrease is below the desired threshold
        rel_dec = (error - bdiv) / init_error
        if verbose:
            print("Iteration "+str(it+1)+" - Error Relative Decrease : "+str(rel_dec))
        if 0 < rel_dec < tol:
            break
        error = bdiv

    return W, H

In [27]:
features, names = featuresToVector()
W, H = custom_NMF(features.toarray(), k=2, beta = 1, tol = 0.0001, verbose = True)

Loading dataset...
done in 1.141s.
Vectorizing...
done in 0.271s.
Initial error: 993340.6257554608
Iteration 1 - Error : 28006.792799539184
Iteration 1 - Error Relative Decrease : 0.9718054491345914
Iteration 2 - Error : 27889.39777531558
Iteration 2 - Error Relative Decrease : 0.00011818204267475769
Iteration 3 - Error : 27804.75570314812
Iteration 3 - Error Relative Decrease : 8.520951421179389e-05


The algorithm implemented above seems to satisfy all the listed requirements. However, to avoid underflow, a clipping procedure was applied at each iteration, which consequently reduces the algorithm's precision. A more sophisticated rescaling technique should have been applied instead. Nonetheless, the algorithm produces satisfactory results and converges in a few iterations.