In [56]:
import collections
import numpy as np
import pandas as pd
import pickle
import pyLDAvis
import random
import scipy.sparse
import sys
import time

from gensim.models.coherencemodel import CoherenceModel
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

In [65]:
class FakedGensimDict:
    """
    Locally made class for `~gensim.corpora.dictionary.Dictionary`
    """
    def __init__(self, data):
        if not isinstance(data, dict):
            raise ValueError('`data` must be an instance of `dict`')

        self.id2token = data
        self.token2id = {v: k for k, v in data.items()}

    @staticmethod
    def from_vocab(vocab):
        return FakedGensimDict(dict(zip(range(len(vocab)), vocab)))

In [2]:
conf = SparkConf().setMaster("local").setAll([
     ('spark.executor.memory', '8g'),
     ('spark.driver.memory','8g'),
     ('spark.driver.maxResultSize', '0')
    ])

# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# create the context
sc = spark.sparkContext



In [3]:
def remove_zero_rows(M):
    '''Function that removes all rows from sparse matrix M that contains only zero.'''
    num_nonzeros = np.diff(M.indptr)
    return M[num_nonzeros != 0]

In [4]:
# Load data
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/view10000_sub100000/csr_matrices_100000sub/S_final_tok100vid.npz')

# Load set of videos to consider
with open('/dlabdata1/youtube_large/olam/data/view10000_sub100000/sorted_idx_relevant_vid_top20_sub100000.pickle', 'rb') as f:
    sorted_idx_relevant_vid_top20 = pickle.load(f)
f.close()

S_20 = S[sorted_idx_relevant_vid_top20,:]
S_20 = remove_zero_rows(S_20)

In [5]:
# Load dictionnary of words
with open('/dlabdata1/youtube_large/olam/data/view10000_sub100000/id2word_tok100vid_sub100000.pickle', 'rb') as f:
    id2word = pickle.load(f)
f.close()

describe_topics = spark.read.json('/dlabdata1/youtube_large/olam/data/view10000_sub100000/LDA_models/top20/describe_topics_75_iter1000_tok100vid.json')

# Print topics
for row in describe_topics.sort('topic').rdd.collect():
    print('Topic ' + str(row.topic) + ': ')
    term_weights = row.termWeights
    for i, token_id in enumerate(row.termIndices):
        print('   With weight of ' + str(term_weights[i]) + ' : ' + id2word[token_id] )
        if i >= 10:
            break

Topic 0: 
   With weight of 0.05188643758626289 : rap
   With weight of 0.04218775942871004 : hip
   With weight of 0.041506975597373624 : hop
   With weight of 0.025811694112528445 : music
   With weight of 0.018647649413352445 : lil
   With weight of 0.015178516827889454 : video
   With weight of 0.012954706626470603 : club
   With weight of 0.009729815432596317 : katie
   With weight of 0.009664152615715065 : new
   With weight of 0.00881766549352615 : freestyle
   With weight of 0.008682127725336936 : brown
Topic 1: 
   With weight of 0.07397074148582329 : art
   With weight of 0.05170701877232562 : water
   With weight of 0.04899947952840779 : watch
   With weight of 0.03445824688733867 : draw
   With weight of 0.03377785481395042 : drawing
   With weight of 0.031886801412986845 : tutorial
   With weight of 0.026485035416909813 : two
   With weight of 0.02434698981684095 : design
   With weight of 0.02369350731835734 : painting
   With weight of 0.021882898107176287 : paint
   Wit

## Computing c_v coherence of a topic model

In [76]:
topics = []

for row in describe_topics.sort('topic').rdd.collect():
    tokenized_topic = []
    for i, token_id in enumerate(row.termIndices):
        tokenized_topic.append(id2word[token_id])
        if i > 5:
            break
    topics.append(tokenized_topic)

In [6]:
describe_topics = describe_topics.sort('topic')

In [7]:
vocabSize = S_20.shape[1]

In [9]:
topic_word_dist = []
for row in describe_topics.collect():
    termIndices = row['termIndices']
    termWeights = row['termWeights']
    
    word_dist = {}
    for i in range(vocabSize):
        word_dist[termIndices[i]] = termWeights[i]
        
    word_dist_ordered = collections.OrderedDict(sorted(word_dist.items()))
    topic_word_dist.append(list(word_dist_ordered.values()))

In [17]:
topic_word_dist = np.matrix(topic_word_dist)

In [46]:
vocab = list(id2word.values())

In [54]:
vocab = np.array(vocab)

In [11]:
texts = []

In [12]:
for i in range(S_20.shape[0]):
    token_indices = list(S_20.getrow(i).nonzero()[1])
    tokens = []
    
    for token_indice in token_indices:
        tokens.append(id2word[token_indice])
    texts.append(tokens)

In [45]:
topic_word_dist.shape

(75, 42757)

In [55]:
coherence_score = metric_coherence_gensim(measure='c_v',
                                          topic_word_distrib=topic_word_dist,
                                          vocab=vocab,
                                          texts=texts)

TypeError: unhashable type: 'numpy.ndarray'

In [78]:
coherence_model = CoherenceModel(topics=topics,
                                 corpus=S_20,
                                 dictionary=FakedGensimDict.from_vocab(vocab),
                                 texts=texts,
                                 coherence='c_v')

In [79]:
coherence_model.get_coherence() ## with 5 words to describe topics

0.630466270527225

In [71]:
coherence_model.get_coherence() ## with 10 words to describe topics

0.5646333509919723

In [75]:
coherence_model.get_coherence() ## with 20 words to describe topics

0.5150608851801445

## Get coherence scores from multiple models to choose optimal number of topics

In [None]:
coherence_scores = []
n_topics_list = [40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120]

for i, n_topics in enumerate(n_topics_list):
    
    # Get describe_topics dataframe
    describe_topics = spark.read.json('/dlabdata1/youtube_large/olam/data/view10000_sub100000/LDA_models/top20/tune/describe_topics_' + str(n_topics) + '_iter1000_tok100vid.json')
    
    
    # Characterize the topics with tokens
    topics = []

    for row in describe_topics.sort('topic').rdd.collect():
        tokenized_topic = []
        for j, token_id in enumerate(row.termIndices):
            tokenized_topic.append(id2word[token_id])
            if j > 10:
                break
        topics.append(tokenized_topic)
        
    # Compute coherence score and append to coherence scores
    coherence_model = CoherenceModel(topics=topics,
                                 corpus=S_20,
                                 dictionary=FakedGensimDict.from_vocab(vocab),
                                 texts=texts,
                                 coherence='c_v')
    
    coherence_scores.append(coherence_model.get_coherencet())