# TP4 - Non-negative Matrix Factorization
The goal is to study the use of nonnegative matrix factorisation (NMF) for topic extraction from a dataset of text documents. The rationale is to interpret each extracted NMF component as being associated with a specific topic. 

Study and test the following script (introduced  on [scikit](http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html))

1. Test and comment on the effect of varying the initialisation, especially using random
nonnegative values as initial guesses (for W and H coefficients, using the notations introduced
during the lecture).
2. Compare and comment on the difference between the results obtained with `2 cost compared
to the generalised Kullback-Liebler cost.
3. Test and comment on the results obtained using a simpler term-frequency representation
as input (as opposed to the TF-IDF representation considered in the code above) when
considering the Kullback-Liebler cost.

In [37]:
###### CUSTOM NMF IMPLEMENTATION ######
# Multiplicative Update Rules for NMF #
# estimation with beta divergences    #
import numpy

# TODO: translate slides 59 [beta-divergence] & 47 [error and special cases]

def custom_NMF(V, K, W=None, H=None, steps=50, beta=0, toll=0.1, show_div=False):
    
    def _beta_div(V,W,H,beta,F,N,K):
        div = 0
        # Update beta_divergence
        if beta == 1: # generalized Kullback-Leibler divergence. x log(x/y) - x + y
            # div = numpy.dot(V, numpy.log(V,numpy.dot(W,H))) - numpy.sum(V) + numpy.sum(numpy.dot(W,H))
            func = _kullback_leiber
        elif beta == 0: # Itakura-Saito divergence. (x/y) - log(x/y) -1
            # div = numpy.sum(V / numpy.dot(W,H)) - numpy.sum(numpy.log(V / numpy.dot(W,H))) - numpy.product(len(V))
            func = _itakura_saito
        else: # Euclidean distance. (1/beta(beta-1))(x^beta + (beta-1)y^beta - beta*x*y^beta-1)
            func = _euclidean_distance
        WH = numpy.dot(W, H)
        for i in range(F):
            for j in range(N):
                x = V[i][j]
                if x == 0:
                    x = numpy.finfo(numpy.double).tiny
                y = WH[i][j]
                div += func(x,y,beta)
        return div

    def _kullback_leiber(x,y,beta):
        return x*numpy.log(x/y) - x + y

    def _itakura_saito(x,y,beta):
        return x*numpy.log(x/y) - x + y

    def _euclidean_distance(x,y,beta):
        return (1/(beta*(beta-1)))*(pow(x,beta) + (beta-1)*pow(y,beta) - beta*x*pow(y,beta-1))

    F = len(V) #Number of V rows
    N = len(V[0]) #Number of V columns

    if W is None:
        W = numpy.random.rand(F,K)
        
    if H is None:
        H = numpy.random.rand(K,N)
        
    if N != len(H[0]):
        raise ValueError("Size for H[0] is different - found "+str(len(H[0]))+" in place of "+str(N))
    if F != len(W):
        raise ValueError("Size for F is different - found "+str(len(F))+" in place of "+str(N))
        
    #Setup n_iter
    n_iter = 1
    
    # Setup initial error
    init_error = _beta_div(V,W,H,beta,F,N,K)
    if show_div:
        print("Initial error: "+str(init_error))
    error = init_error
    
    for step in range(steps):
    
        # Tests with whole matrix : multiply = O | dot = *
        upd_UP = numpy.dot(W.T, numpy.multiply(numpy.power(numpy.dot(W,H),beta-2), V))
        upd_DOWN = numpy.dot(W.T, numpy.power(numpy.dot(W,H),beta-1))
        upd = numpy.divide(upd_UP, upd_DOWN)
        H = numpy.multiply(H, upd)
        
        upd_UP = numpy.dot(numpy.multiply(numpy.power(numpy.dot(W,H),beta-2), V),H.T)
        upd_DOWN = numpy.dot(numpy.power(numpy.dot(W,H),beta-1), H.T)
        upd = numpy.divide(upd_UP, upd_DOWN)
        W = numpy.multiply(W, upd)

        print('W ' + str(W))
        print('H ' + str(H))
        # Test element-wise products
#         for i in range(F):
#             for j in range(N):
#                 for k in range(K):
#                     x = V[i][j]
#                     w = W[i][k]
#                     h = H[k][j]
#                     y = w*h
# #                     print("x:"+str(x)+" | w:"+str(w)+" | h:"+str(h)+" | y:"+str(y))
#                     # Update h
#                     upd_up = w*(pow(y,beta-2)*x)
#                     upd_down = w*pow(y,beta-1)
#                     upd = upd_up/upd_down
#                     h = h*upd
#                     # Update w
#                     upd_up = (pow(y,beta-2)*x)*h
#                     upd_down = pow(y,beta-1)*h
#                     upd = upd_up/upd_down
#                     w = w*upd
        
        if toll > 0:
            new_error = _beta_div(V,W,H,beta,F,N,K)
            if show_div:
                print("Error on iteration "+str(n_iter)+": " +str(new_error))
            # Check if approximation error relative decrease is below the desired threshold
            rel_dec = ((error - new_error) / init_error)
            if show_div:
                print("Error relative decrease at iteration "+str(n_iter)+": "+str(rel_dec))
            if (rel_dec > 0) & (rel_dec < toll):
                break
            error = new_error
            
        n_iter += 1
            
    return W, H



#######

if __name__ == "__main__":
    V = [
         [5,3,0,1],
         [4,0,0,1],
         [1,1,0,5],
         [1,0,0,4],
         [0,1,5,4],
        ]

    V = numpy.array(V) # Data matrix F x N 
    K = 2
    numpy.random.seed(0)
    W, H = custom_NMF(V, K, beta = 0, toll = 0.0001, show_div = True)

Initial error: 34.67244110728304
W [[0.80920527 0.81040776]
 [0.31568274 0.2344681 ]
 [0.31694595 1.00383222]
 [0.12971685 0.72613382]
 [1.69440161 0.39664803]]
H [[3.64462264 1.46570247 1.05284474 2.52849236]
 [0.34174711 0.24792135 0.01219242 2.40930892]]
Error on iteration 1: 17.653374180858854
Error relative decrease at iteration 1: 0.4908528613189939
W [[0.88545256 0.59344692]
 [0.24460008 0.18201628]
 [0.22148414 1.32548495]
 [0.04821585 0.77079626]
 [2.75989377 0.30393194]]
H [[5.03697957e+00 1.18723933e+00 6.01297455e-01 2.03895225e+00]
 [4.46861299e-01 1.79300430e-01 7.74567312e-04 3.01634080e+00]]
Error on iteration 2: 24.328678697921223
Error relative decrease at iteration 2: -0.19252479213700943
W [[0.89576313 0.34955588]
 [0.20111472 0.19501216]
 [0.15232838 1.5862147 ]
 [0.01501661 0.77806214]
 [4.35058922 0.23735675]]
H [[6.31994700e+00 1.21649923e+00 3.64401087e-01 1.33923618e+00]
 [6.10544901e-01 1.53556443e-01 1.11099545e-05 3.31362202e+00]]
Error on iteration 3: 38.3



In [34]:
##### TEST RESULTS #####
W

array([[nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan]])