In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import codecs

from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import normalized_mutual_info_score 

In [2]:
# Raise the interpreter's recursion limit to 1e9 for the rest of the session.
# NOTE(review): none of the functions in the cells shown here is recursive --
# confirm a later cell really needs this. With a limit this high, runaway
# recursion can exhaust the C stack and kill the kernel instead of raising.
sys.setrecursionlimit(1000000000)

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "deep" palette with desat=.6 for all subsequent plots.
sns.set_palette("deep", desat=.6)
# Ask for a 16x7 default figure size.
# NOTE(review): recent seaborn releases appear to drop non-context keys such as
# "figure.figsize" passed through set_context(rc=...) -- confirm the size is
# actually applied with the installed version.
sns.set_context(rc={"figure.figsize": (16, 7)})

In [4]:
# Load six pickled objects (presumably pandas DataFrames, one per news source,
# each in a "stem" and a plain variant -- TODO confirm) from the working directory.
# Only arena_news_df and sport_news_df are used by the cells shown here.
# NOTE(review): unpickling executes arbitrary code; only load files you produced
# yourself or otherwise trust.
arena_news_stem_df = pd.read_pickle('arena_news_stem_df.pkl')
sport_news_stem_df = pd.read_pickle('sport_news_stem_df.pkl')
jovem_news_stem_df = pd.read_pickle('jovem_news_stem_df.pkl')
arena_news_df = pd.read_pickle('arena_news_df.pkl')
sport_news_df = pd.read_pickle('sport_news_df.pkl')
jovem_news_df = pd.read_pickle('jovem_news_df.pkl')

In [5]:
# Ground-truth source labels: one 1 per arena_news_df row, followed by one 0
# per sport_news_df row.
labels = np.array([1] * len(arena_news_df) + [0] * len(sport_news_df))

In [12]:
# Bag-of-words counts over the 'all' column of the arena documents followed by
# the sport documents. Case is preserved (lowercase=False) and terms found in
# fewer than two documents are dropped (min_df=2).
count_vect = CountVectorizer(min_df=2, lowercase=False, encoding='UTF-8')
X = count_vect.fit_transform(
    arena_news_df['all'].tolist() + sport_news_df['all'].tolist()
)

# Two L2-normalised views of the same counts: with IDF weighting and without.
X_train_norm_tfidf = TfidfTransformer(use_idf=True, norm=u'l2').fit_transform(X)
X_train_norm = TfidfTransformer(use_idf=False, norm=u'l2').fit_transform(X)

In [13]:
def to_csv(some_list, file_name, header):
    """Write items to a UTF-8 text file, one comma-separated line per item.

    Parameters
    ----------
    some_list : iterable
        Items to write. A tuple or list becomes one line whose elements are
        converted to text and joined with commas; any other item is written
        as a single-field line.
    file_name : str
        Path of the file to create or overwrite.
    header : sequence of str
        Column names written as the first line; nothing is written for an
        empty header.

    Notes
    -----
    Fields are joined verbatim: values that themselves contain commas, quotes
    or newlines are not escaped.
    """
    try:
        text_types = (str, unicode)  # Python 2: byte strings and unicode
    except NameError:
        text_types = (str,)  # Python 3: the `unicode` builtin is gone

    def to_str(st):
        # Text is passed through untouched (str() on non-ASCII unicode raises
        # on Python 2); everything else is stringified.
        return st if isinstance(st, text_types) else str(st)

    # The context manager closes the stream; no explicit close() is needed.
    with codecs.open(file_name, 'w', 'utf-8') as file_stream:
        if len(header) != 0:
            file_stream.write(u','.join(header) + '\n')
        for item in some_list:
            if isinstance(item, (tuple, list)):
                line = u','.join(map(to_str, item))
            else:
                # Scalars go through to_str as well, so a bare number no
                # longer raises TypeError when the newline is appended.
                line = to_str(item)
            file_stream.write(line + '\n')

In [14]:
def _big_s(x, center):
    len_x = len(x)
    total = 0

    for i in range(len_x):
        total += np.linalg.norm(x[i]-center)

    return total / len_x

def davies_bouldin_score(X, labels_pred, k_centers):
    """Compute the Davies-Bouldin index of a clustering (lower is better).

    Parameters
    ----------
    X : array, indexable as ``X[rows, :]``
        One sample per row.
    labels_pred : numpy.ndarray
        Cluster id of every sample; ids are expected to be ``0..K-1``.
    k_centers : numpy.ndarray, shape (K, n_features)
        One centre per cluster, row ``c`` belonging to cluster ``c``.

    Returns
    -------
    float
        Average, over clusters, of the largest
        ``(scatter[a] + scatter[b]) / distance(centre a, centre b)`` ratio
        against any other cluster ``b``.
    """
    num_clusters, _ = k_centers.shape
    cluster_ids = range(num_clusters)

    # Scatter of each cluster: mean distance of its samples to its centre.
    scatter = np.zeros([num_clusters], dtype=np.float64)
    for c in cluster_ids:
        members = np.where(labels_pred == c)[0]
        scatter[c] = _big_s(X[members, :], k_centers[c])

    # Pairwise distances between the cluster centres.
    separation = np.zeros([num_clusters, num_clusters], dtype=np.float64)
    for a in cluster_ids:
        for b in cluster_ids:
            separation[a, b] = np.linalg.norm(k_centers[a] - k_centers[b])

    # For every cluster keep its worst (largest) ratio against the others.
    worst_ratio_sum = 0
    for a in cluster_ids:
        ratios = np.zeros([num_clusters - 1], dtype=np.float64)
        rivals = [b for b in cluster_ids if b != a]
        for slot, b in enumerate(rivals):
            ratios[slot] = (scatter[a] + scatter[b]) / separation[a, b]
        worst_ratio_sum += np.max(ratios)

    return worst_ratio_sum / num_clusters

def calculate_centroids_doc_mean(X, labels_pred, k):
    """Return the per-cluster mean row of ``X`` as a ``(k, n_features)`` array.

    Row ``c`` of the result is the column-wise mean of the rows of ``X`` whose
    entry in ``labels_pred`` equals ``c``, for ``c`` in ``0..k-1``.
    """
    _, n_features = X.shape

    centroids = np.zeros((k, n_features))
    for cluster in range(k):
        members = np.where(labels_pred == cluster)[0]
        centroids[cluster, :] = X[members, :].mean(axis=0)

    return centroids

In [28]:
def matrix_factorization_overlapping_bin(X, k, l, num_iters=50, random_state=None):
    """Binary block co-clustering with one column partition per row cluster.

    ``X`` is approximated by ``F.dot(G_t)``: every row is assigned to one of
    ``k`` row clusters (one-hot ``F``), and inside each row cluster ``i`` every
    column is assigned to one of ``l`` column clusters (one-hot ``G[i]``).
    ``S[i, j]`` is the mean of the block (row cluster ``i``, column cluster
    ``j``) and ``G_t[i] = S[i].dot(G[i].T)`` is the resulting profile of row
    cluster ``i``. Each sweep re-estimates ``S``, then every ``G[i]``, then
    ``F``, always picking the assignment with the smallest squared error and
    breaking ties at random.

    Parameters
    ----------
    X : numpy.ndarray, shape (n, m)
        Dense data matrix. It is only read, never modified.
    k : int
        Number of row clusters.
    l : int
        Number of column clusters inside each row cluster.
    num_iters : int, optional
        Number of alternating sweeps. With 0 the random initialisation is
        returned as is.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) draws from NumPy's global generator exactly as
        before; an int or a ``RandomState`` makes the run reproducible.

    Returns
    -------
    F : numpy.ndarray, shape (n, k)
    S : numpy.ndarray, shape (k, l)
    G : list of k numpy.ndarray, each of shape (m, l)
    G_t : numpy.ndarray, shape (k, m)
    rows_ind : numpy.ndarray, shape (n,)
        Row-cluster index of every row (``argmax`` of ``F``).
    error : float
        ``sum((X - F.dot(G_t)) ** 2)``; it is also printed.
    """
    if random_state is None:
        rng = np.random  # module-level functions use the global generator
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    def random_one_hot(num_items, num_groups):
        # Shuffle the items and cut the permutation into num_groups
        # consecutive, near-equal slices: one slice per group.
        indicator = np.zeros((num_items, num_groups))
        shuffled = rng.permutation(num_items)
        stop = 0
        for group in range(num_groups):
            start = stop
            # Floor division reproduces the Python 2 arithmetic of the
            # original and keeps the bound an int; round() returned a float
            # there, which NumPy refuses as a slice bound.
            stop = (group + 1) * num_items // num_groups
            indicator[shuffled[start:stop], group] = 1
        return indicator

    def block_means(X, F, G, S):
        # S[i, j] = mean of X over (rows of cluster i) x (columns that G[i]
        # puts in column cluster j); empty or NaN blocks are set to 0.
        for i in range(k):
            rows_in_i = X[F[:, i] == 1]
            for j in range(l):
                block = rows_in_i[:, G[i][:, j] == 1]
                S[i, j] = np.mean(block) if block.size else 0.0
        S[np.isnan(S)] = 0
        return S

    def cluster_profiles(S, G, m):
        # Expand the block means of each row cluster to one value per column.
        profiles = np.zeros((k, m))
        for i in range(k):
            profiles[i, :] = S[i, :].dot(G[i].T)
        return profiles

    def random_argmin(errors):
        # Index of the smallest error; ties are broken uniformly at random.
        candidates = np.where(errors <= np.min(errors))[0]
        return candidates[rng.randint(len(candidates))]

    n, m = X.shape

    F = random_one_hot(n, k)
    G = [random_one_hot(m, l) for _ in range(k)]
    S = rng.rand(k, l)
    # Defined up front so the final error can be computed when num_iters == 0.
    G_t = cluster_profiles(S, G, m)

    for _ in range(num_iters):
        S = block_means(X, F, G, S)

        # Re-assign every column, separately inside each row cluster.
        for i in range(k):
            in_cluster = F[:, i] == 1
            F_t = F[in_cluster, :].dot(S)
            X_t = X[in_cluster, :]
            clust_len = X_t.shape[0]
            G[i] = np.zeros((m, l))
            for j in range(m):
                # Column j of the cluster, repeated once per candidate column cluster.
                column = X_t[:, j].reshape(clust_len, 1).dot(np.ones(l).reshape(1, l))
                diff = F_t - column
                errors = np.diag(diff.T.dot(diff))
                G[i][j, random_argmin(errors)] = 1

        G_t = cluster_profiles(S, G, m)

        # Re-assign every row to the row cluster whose profile is closest.
        F = np.zeros((n, k))
        for j in range(n):
            diff = G_t - np.ones(k).reshape(k, 1).dot(X[j, :].reshape(1, m))
            errors = np.diag(diff.dot(diff.T))
            F[j, random_argmin(errors)] = 1

    error = np.sum((X - F.dot(G_t)) ** 2)
    print(error)

    rows_ind = np.argmax(F, axis=1)

    return F, S, G, G_t, rows_ind, error

In [29]:
def rand_score(labels_true, labels_pred):
    """Return a report line with the adjusted Rand score of ``labels_pred``."""
    ari = adjusted_rand_score(labels_true, labels_pred)
    return 'Rand score: %s' % ari

def sil_score(X, labels_pred):
    """Return a report line with the silhouette score of ``labels_pred`` on ``X``."""
    return 'Silhouette score: %s' % silhouette_score(X, labels_pred)

def db_score(X, labels_pred, k_centers):
    """Return a report line with the Davies-Bouldin index of a clustering."""
    index_value = davies_bouldin_score(X, labels_pred, k_centers)
    return 'Davies-Bouldin index: %s' % index_value

In [33]:
def top_k(arr, k, axis=0):
    """Return the indices and values of the ``k`` largest entries along ``axis``.

    Parameters
    ----------
    arr : array_like
        Values to rank.
    k : int
        How many entries to keep. Values above the axis length keep
        everything; ``k <= 0`` keeps nothing.
    axis : int or None, optional
        Axis to rank along. ``None`` ranks the flattened array.

    Returns
    -------
    top_inds, top_vals : numpy.ndarray
        Positions and values of the selected entries, in ascending order of
        value (the largest entry comes last).
    """
    order = np.argsort(arr, axis=axis)
    ranked = np.sort(arr, axis=axis)

    # With axis=None NumPy sorts the flattened input, so the result is 1-D.
    sort_axis = 0 if axis is None else axis
    size = order.shape[sort_axis]
    keep = min(max(k, 0), size)

    # Slice along the sorted axis rather than always the first one, and use an
    # explicit start: a plain ``[-k:]`` returns the whole array when k == 0.
    window = [slice(None)] * order.ndim
    window[sort_axis] = slice(size - keep, size)
    window = tuple(window)

    return order[window], ranked[window]

def reverse(arr):
    """Return ``arr`` in reverse order, using a full slice with step -1."""
    backwards = slice(None, None, -1)
    return arr[backwards]

def top_k_words_term_cluster(vec, X, count_vect, k):
    """List the ``k`` highest-scoring vocabulary words of a term-cluster vector.

    Parameters
    ----------
    vec : numpy.ndarray, shape (n_words,)
        Score of every vocabulary word.
    X : numpy.ndarray, shape (n_docs, n_words)
        Dense document-term matrix; its column sums are reported next to
        each word.
    count_vect : fitted vectoriser
        Supplies the vocabulary through ``get_feature_names_out()`` or, on
        older scikit-learn releases, ``get_feature_names()``.
    k : int
        Number of words to return.

    Returns
    -------
    list of (word, score, column_sum) tuples, highest score first.
    """
    sum_per_word = np.sum(X, axis=0)
    top_inds, top_vals = top_k(vec, k)

    # get_feature_names() no longer exists in recent scikit-learn; prefer its
    # replacement when the vectoriser provides it.
    get_names = getattr(count_vect, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vect.get_feature_names
    words = get_names()

    top_words = [words[i] for i in top_inds]
    correspondence_vals = [sum_per_word[i] for i in top_inds]

    # zip() is lazy on Python 3 and cannot be sliced, so build the list first.
    top_pairs = reverse(list(zip(top_words, top_vals, correspondence_vals)))

    return top_pairs

def top_k_words(vec, count_vect, k):
    """List the ``k`` highest-scoring vocabulary words of ``vec``.

    Parameters
    ----------
    vec : numpy.ndarray, shape (n_words,)
        Score of every vocabulary word.
    count_vect : fitted vectoriser
        Supplies the vocabulary through ``get_feature_names_out()`` or, on
        older scikit-learn releases, ``get_feature_names()``.
    k : int
        Number of words to return.

    Returns
    -------
    list of (word, score) tuples, highest score first.
    """
    top_inds, top_vals = top_k(vec, k)

    # get_feature_names() no longer exists in recent scikit-learn; prefer its
    # replacement when the vectoriser provides it.
    get_names = getattr(count_vect, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vect.get_feature_names
    words = get_names()

    top_words = [words[i] for i in top_inds]
    # zip() is lazy on Python 3 and cannot be sliced, so build the list first.
    top_pairs = reverse(list(zip(top_words, top_vals)))
    return top_pairs

In [34]:
# Densify the TF matrix once. It is only read below, so one copy can be shared
# by every restart and every report instead of calling .toarray() in each loop.
X_dense = X_train_norm.toarray()

for k in [2, 3]:
    for l in [2, 3]:
        # Keep the factorisation with the lowest reconstruction error out of
        # five random restarts. Starting from +inf means the first run is
        # always accepted, so results from the previous (k, l) never leak in.
        best = float('inf')
        for restart in range(5):
            U_t, S_t, V_t, V_t_t, rows_ind_t, error = matrix_factorization_overlapping_bin(X_dense, k, l, num_iters=100)
            if error < best:
                best = error
                U = U_t
                S = S_t
                V = V_t
                rows_ind = rows_ind_t
            print('')

        np.savetxt('nmtf_overlap_bin_%sx%s_S.csv' % (k, l), S, delimiter=",")

        # Words of every term cluster, separately for each document cluster.
        # NOTE(review): V[kk][:, ll] is a 0/1 membership column, so all member
        # words tie at 1.0 and the "top 30" printed here are in arbitrary sort
        # order -- consider ranking members by their tf value instead.
        for kk in range(k):
            for ll in range(l):
                print('Top words for term cluster %s and doc cluster %s:' % (ll, kk))
                top_pairs = top_k_words_term_cluster(V[kk][:, ll], X_dense, count_vect, 999999999)
                print('')

                for w, v_value, tf_value in top_pairs[0:30]:
                    print('%s %s %s' % (w, v_value, tf_value))
                to_csv(top_pairs, 'nmtf_overlap_bin_%sx%s_V%s_words_doc_clust_%s_top.csv' % (k, l, ll, kk), ['word', 'V cluster value', 'tf norm value'])
                print('')

        # Words with the largest summed term frequency inside each document cluster.
        for i in range(k):
            clust_inds = np.where(rows_ind == i)[0]
            sum_per_word = np.sum(X_dense[clust_inds, :], axis=0)
            print('Top words for document cluster %s' % i)
            top_pairs = top_k_words(sum_per_word, count_vect, 99999999)
            for word, tf_val in top_pairs[0:30]:
                print('%s %s' % (word, tf_val))
            print('')
            to_csv(top_pairs, 'nmtf_overlap_bin_%sx%s_doc_cluster_%s_words_top.csv' % (k, l, i),
                   ['word', 'tf norm value'])

187.209507753

187.166699818

190.324010313

187.15256547

189.584324074

Top words for term cluster 0 and doc cluster 0:

útil 1.0 0.0462847202446
eliminação 1.0 0.668606385737
eletrônicos 1.0 0.139512966878
elevado 1.0 0.0399220193959
elimina 1.0 0.0969023819587
eliminada 1.0 0.0710717559958
eliminar 1.0 0.304578578942
eliminatórias 1.0 1.15430289699
elogiado 1.0 0.109549597526
elenco 1.0 0.649160395082
elogiar 1.0 0.0908041178936
elogios 1.0 0.309298226098
elogiou 1.0 0.0972433276653
embalo 1.0 0.0554224170341
embate 1.0 0.141364280703
embora 1.0 0.839051525207
eletrônico 1.0 0.235120580411
elementos 1.0 0.169376211224
edições 1.0 0.112884782045
effect 1.0 0.0996596197502
eduardo 1.0 0.287946333421
educação 1.0 0.0549934351556
efe 1.0 0.302387314116
efeito 1.0 0.40585045197
efeitos 1.0 0.155631616006
efetiva 1.0 0.0559866268251
eficiente 1.0 0.0901837448111
elemento 1.0 0.0809966417187
eficientes 1.0 0.0499447788355
eficiência 1.0 0.076544548035

Top words for term cluster 1 and doc