This notebook focuses on the text dataset (Reuters-21578) and compares the performance of the NMF algorithms on it.

In [None]:
# Stretch the notebook container to the full browser width so that the wide
# figures below are not clipped.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML('<style>.container {width:100% !important;}</style>'))

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt 
import numpy as np
import torch

import nmf.mult
import nmf.pgrad
import nmf.nesterov

import nmf_torch.mult
import nmf_torch.pgrad
import nmf_torch.nesterov
import nmf_torch.norms

import matplotlib
import pickle

from read_data.reading import read_reuters21578, HashTfidfVectoriser

from performance.performance_eval_func import get_random_lowrank_matrix, get_time_ratio,\
                              compare_performance, plot_performance_dict,\
                              torch_algo_wrapper, plot_errors_dict,\
                              plot_ratios_gpu_algo, plot_ratios_cpu_gpu, plot_ratios_cpu_algo,\
                              plot_differences_gpu_algo, plot_differences_cpu_gpu, plot_differences_cpu_algo

In [None]:
# Build the hashing TF-IDF vectoriser and use it to read the Reuters-21578
# corpus from the local data directory.
# NOTE(review): 12000 is presumably the number of hash buckets (columns of the
# resulting matrix) — confirm against HashTfidfVectoriser.
vectorizer = HashTfidfVectoriser(12000)
reuters_data = read_reuters21578(
    "data/reuters21578",
    vectorizer=vectorizer,
)

In [None]:
# Algorithms to benchmark, keyed by the label used in the result dictionary.
# The CPU implementations come first; the torch implementations follow, each
# wrapped by `torch_algo_wrapper` with device="cuda".
_cpu_algos = {
    "mult": nmf.mult.factorise_Fnorm,
    "pgrad": nmf.pgrad.factorise_Fnorm_subproblems,
    "nesterov": nmf.nesterov.factorise_Fnorm,
}
_torch_algos = {
    "mult_torch": nmf_torch.mult.factorise_Fnorm,
    "pgrad_torch": nmf_torch.pgrad.factorise_Fnorm_subproblems,
    "nesterov_torch": nmf_torch.nesterov.factorise_Fnorm,
}

# Insertion order (CPU entries, then torch entries) matches the key order of
# the original literal.
algo_dict_to_test = dict(_cpu_algos)
for _label, _torch_algo in _torch_algos.items():
    algo_dict_to_test[_label] = torch_algo_wrapper(_torch_algo, device="cuda")

# Performance evaluation

In [None]:
# Load benchmark results saved by an earlier run (alternative to re-running
# the benchmark below). The context manager closes the file handle as soon as
# the load finishes instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open("text_data_errors_dict.pkl", "rb") as errors_file:
    errors_dict = pickle.load(errors_file)

In [None]:
# Shape of the data matrix and the inner dimension of the factorisation
# (number of columns of W / rows of H).
shape, inner_dim = reuters_data.shape, 120

In [None]:
# Draw the initial factors from a dedicated, seeded generator so that the
# benchmark starts from the same point on every run (and the global NumPy
# random state is left untouched).
INIT_SEED = 0
init_rng = np.random.RandomState(INIT_SEED)

# Small uniform values in [0, 0.001) as the starting point of the factorisation.
W_init = init_rng.rand(shape[0], inner_dim) * 0.001
H_init = init_rng.rand(inner_dim, shape[1]) * 0.001

# The same W_init / H_init pair is handed to compare_performance together with
# the whole algorithm dictionary.
# NOTE(review): time_limit is presumably in seconds per algorithm — confirm
# against compare_performance.
errors_dict = compare_performance(reuters_data.toarray(),
                                  inner_dim, time_limit=1000,
                                  W_init=W_init, H_init=H_init,
                                  algo_dict_to_test=algo_dict_to_test)

In [None]:
# Persist the benchmark results so the plots can be rebuilt without re-running
# the benchmark. The context manager guarantees the file is flushed and closed
# once the dump completes.
with open("text_data_errors_dict.pkl", "wb") as errors_file:
    pickle.dump(errors_dict, errors_file)

## Graphs of time ratio

In [None]:
# Hard-coded description of the dataset, usable for the figure title without
# loading the data itself.
# 1975.52 equals 21578 * 12000 * 8 / 2**20 rounded to two decimals, i.e. the
# size in MiB of a dense matrix of this shape with 8-byte elements.
inner_dim = 120
shape = (21578, 12000)
nmbytes = 1975.52

In [None]:
# Description of the dataset derived from the loaded data.
shape = reuters_data.shape
inner_dim = 120
# Size in MiB of the dense representation, computed from the shape and the
# element size instead of materialising a full dense copy just to read
# `.nbytes` (a dense array's nbytes is rows * columns * itemsize).
# NOTE(review): assumes reuters_data exposes `.dtype` and that `.toarray()`
# keeps that dtype, as SciPy sparse matrices do — confirm.
nmbytes = shape[0] * shape[1] * reuters_data.dtype.itemsize / 2**20

In [None]:
# Spacing between the panels. `gridspec_kw` was referenced but never defined
# in this notebook, so the cell failed on a fresh kernel; the values here leave
# room for the two-line panel titles.
gridspec_kw = {"wspace": 0.3, "hspace": 0.3}

f, axes = plt.subplots(2, 2, figsize=(10, 10),
                       gridspec_kw=gridspec_kw)

# Figure title: matrix shape, dense size in MB and factorisation size.
f.suptitle("Text data represented by {0} matrix, {2:.2f} MB \n Factorization of size {1}".format(shape, inner_dim,
                                                                       nmbytes))

# Top-left: cost function over time; top-right: CPU vs GPU ratios;
# bottom row (both axes): ratios between the GPU algorithms.
plot_errors_dict(errors_dict, axes[0, 0], log=True, title="Cost function", x_lbl="time [s]")
plot_ratios_cpu_gpu(errors_dict, axes[0, 1])
plot_ratios_gpu_algo(errors_dict, axes[1, 0:], selected_algs=["mult_torch", "pgrad_torch", "nesterov_torch"])

axes[0, 1].set_title("CPU / GPU comparison")
axes[1, 0].set_title("multiplicative / gradient\nalgorithms comparison")
axes[1, 1].set_title("Nesterov / projected\n gradient algorithms comparison")

# Demonstration

In [None]:
# Short demonstration run of the CPU Nesterov algorithm on the dense data:
# factorisation of size 120, at most 20 steps, epsilon set to 0, with the
# error history returned alongside the factors.
W, H, errors = nmf.nesterov.factorise_Fnorm(
    reuters_data.toarray(),
    120,
    max_steps=20,
    epsilon=0,
    record_errors=True,
)

In [None]:
# `cols` was used below but never defined in this notebook, so the cell failed
# on a fresh kernel. For every topic (row of H) it holds the column indices
# ordered by decreasing weight, so the first entries are the strongest terms.
# NOTE(review): assumes a column index of H is the hash expected by
# `vectorizer.words_by_hash` — confirm against HashTfidfVectoriser.
cols = np.argsort(H, axis=1)[:, ::-1]

# Print the three strongest hash buckets of every topic; each bucket is shown
# as "(word1|word2|...)" because several words may share one hash.
for topic_id in range(H.shape[0]):
    hashes = cols[topic_id, :3]
    words = ["({})".format("|".join(vectorizer.words_by_hash(h))) for h in hashes]
    print(topic_id, *words)

In [None]:
# Pick one document (row index into W) and take its row of W, i.e. its
# coordinates in the factorised space.
text_id_of_interest = 160
text_vector_of_interest = W[text_id_of_interest]

In [None]:
# Euclidean distance from every row of W to the selected row, then the row
# indices ordered from the closest to the farthest.
distances = np.linalg.norm(W - text_vector_of_interest, axis=1)
idxs = distances.argsort()

In [None]:
# Show the texts behind the three closest rows: each caption is followed by
# the text stored by the vectoriser and a blank separator line.
captions = (
    "Text corresponding to 160th row:",
    "Text corresponding to the closest to the 160th row in the space of latent topic",
    "Text corresponding to the second closest to the 160th row in the space of latent topic",
)

for rank, caption in enumerate(captions):
    print(caption)
    print(vectorizer.last_data[idxs[rank]])
    print()