In [1]:
import collections
import fasttext
import gzip
import json
import math
import matplotlib
import nltk
import os
import pickle
import pyLDAvis
import random
import scipy.sparse
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import zstandard as zstd

from collections import Counter
from gensim.models.coherencemodel import CoherenceModel
from joblib import dump, load
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import LongType, StructField, StructType
from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector
from scipy.sparse import dok_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier

# Make sure the NLTK stopwords corpus is available locally; when it is already
# installed the call only reports that the package is up-to-date.
nltk.download('stopwords')

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
[nltk_data] Downloading package stopwords to /home/olam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Spark configuration: run locally ("local[4]" master) with explicit memory settings.
conf = SparkConf()
conf.setMaster("local[4]")
conf.setAll([
    ('spark.executor.memory', '8g'),
    ('spark.driver.memory', '32g'),
    # '0' removes the cap on results collected to the driver
    # (see the Spark configuration reference)
    ('spark.driver.maxResultSize', '0')
])

# Obtain the session: an already-running one is reused, otherwise a new one is
# started with the configuration above
spark = SparkSession.builder.config(conf=conf).getOrCreate()



In [3]:
# Load data
# S: sparse matrix whose rows are the bag-of-words vectors of the documents
# (shape `n_docs` x `n_terms`, as consumed by the helper functions below).
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/data/final_res/matrices/S_final.npz')

# id2word: mapping whose values are the vocabulary terms.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/data/final_res/id2word_top20.pickle', 'rb') as f:
    id2word = pickle.load(f)

In [4]:
def sort_from_indices(indices, list_):
    '''
    Reorder `list_` so that its elements follow the ascending order of `indices`.

    Parameters
    ----------
    indices : (list)
        Unsorted indices; `indices[k]` is the index attached to `list_[k]`.
        Every integer in `0 .. len(indices) - 1` must be present, otherwise
        a KeyError is raised.
    list_ : (list)
        Elements paired position-by-position with `indices`

    Returns
    ----------
    sorted_list : (list)
        Elements of `list_` arranged so that position `i` holds the element
        whose attached index is `i`
    '''

    # Record at which position of `list_` each index currently sits
    position_of = {}
    for position, index in enumerate(indices):
        position_of[index] = position

    # Walk the indices in ascending order and pick the matching element
    return [list_[position_of[target]] for target in range(len(indices))]

In [5]:
def get_topic_term_dists(describe_topics):
    '''
    Build the topic-term matrix from the output of describeTopics().

    Parameters
    ----------
    describe_topics : (pyspark.sql.dataframe.DataFrame)
        Dataframe generated by the describeTopics() method from pyspark.ml.clustering.LDA().
        Each row must expose `termIndices` and `termWeights`.

    Returns
    ----------
    topic_term_dists : array-like, shape (`n_topics`, `n_terms`)
        Matrix of topic-term probabilities, one row per collected row, with the
        weights of each row reordered by ascending term index.
        `n_terms` equals `len(vocab)` only if every row lists the whole
        vocabulary -- TODO confirm against how describeTopics() was called.
    '''

    # Bring all topic rows to the driver, then align each row's weights on term index
    rows = describe_topics.collect()

    return np.array([
        sort_from_indices(row.termIndices, row.termWeights)
        for row in rows
    ])

In [6]:
def get_doc_topic_dists(topic_docs):
    '''
    Build the document-topic matrix from the output of transform().

    Parameters
    ----------
    topic_docs : (pyspark.sql.dataframe.DataFrame)
        Dataframe generated by the transform() method from pyspark.ml.clustering.LDA().
        Each row must expose a `topicDistribution` field.

    Returns
    ----------
    doc_topic_dists : array-like, shape(`n_docs`, `n_topics`)
        Matrix of document-topic probabilities, one row per collected row
    '''

    collected = topic_docs.collect()

    # NOTE(review): element 1 of `topicDistribution` is taken as the vector of
    # probabilities; this presumably relies on the field layout obtained when the
    # data is read back from JSON -- confirm against the stored schema.
    return np.array([row.topicDistribution[1] for row in collected])

In [7]:
def get_doc_lengths(S):
    '''
    Compute the number of words in each document.

    Parameters
    ----------
    S : sparse_matrix, shape(`n_docs`, `n_terms`)
        Matrix for which each row is a bag-of-word vectors from the data

    Returns
    ----------
    doc_lengths : array-like, shape(`n_docs`)
        The length of each document, i.e. the number of words in each document.
        Always one-dimensional, including when `S` holds a single document.

    '''

    # Row sums: total word count of each document
    doc_lengths = S.sum(axis=1)

    # Flatten with ravel() instead of squeeze(): squeeze() would collapse a
    # one-document result to a 0-d array and break the documented shape.
    return np.asarray(doc_lengths).ravel()

In [8]:
def get_term_frequency(S):
    '''
    Compute how often each term occurs over the whole corpus.

    Parameters
    ----------
    S : sparse_matrix, shape(`n_docs`, `n_terms`)
        Matrix for which each row is a bag-of-word vectors from the data

    Returns
    ----------
    term_frequency : array-like, shape(`n_terms`)
        The count of each particular term over the entire corpus.
        Always one-dimensional, including when `S` has a single term column.

    '''

    # Column sums: total occurrences of each term across all documents
    term_frequency = S.sum(axis=0)

    # Flatten with ravel() instead of squeeze(): squeeze() would collapse a
    # one-term result to a 0-d array and break the documented shape.
    return np.asarray(term_frequency).ravel()

In [9]:
def get_visualisation(n_topic):
    '''
    Assemble the pyLDAvis data for the LDA model trained with `n_topic` topics.

    Reads the saved describeTopics() and transform() outputs for that model
    through the module-level `spark` session, and combines them with the
    module-level bag-of-words matrix `S` and vocabulary mapping `id2word`.

    Parameter
    ----------
    n_topic : int
        Number of topics of the model; used to build the input file names

    Output
    ----------
    vis : preparedData of pyLDAvis
        A named tuple containing all the data structures required to create
        the visualization. To be passed on to functions like :func:`display`.

    '''

    topic_suffix = str(n_topic)

    describe_topics_path = '/dlabdata1/youtube_large/olam/data/final_res/describe_topics/describe_topics_' + topic_suffix + '.json'
    # NOTE(review): this name ends in 'json' without a leading dot, unlike the
    # path above -- confirm it matches the name actually used on disk.
    topic_docs_path = '/dlabdata1/youtube_large/olam/data/final_res/topics_doc_matrix_' + topic_suffix + 'json'

    # Read both saved LDA outputs
    describe_topics_df = spark.read.json(describe_topics_path)
    topic_docs_df = spark.read.json(topic_docs_path)

    # Rows are sorted by topic / document id before being turned into matrices
    prepare_inputs = {
        'topic_term_dists': get_topic_term_dists(describe_topics_df.sort('topic')),
        'doc_topic_dists': get_doc_topic_dists(topic_docs_df.sort('id')),
        'doc_lengths': get_doc_lengths(S),
        # assumes the iteration order of `id2word` follows the term columns of
        # `S` -- TODO confirm
        'vocab': list(id2word.values()),
        'term_frequency': get_term_frequency(S),
    }

    return pyLDAvis.prepare(**prepare_inputs)
    

In [10]:
# Build the pyLDAvis prepared data for the model with 55 topics
vis55 = get_visualisation(n_topic=55)

# Render the visualisation; being the last expression, it becomes the cell output
pyLDAvis.display(vis55)

In [10]:
# Build the pyLDAvis prepared data for the model with 110 topics
vis110 = get_visualisation(n_topic=110)

# Render the visualisation; being the last expression, it becomes the cell output
pyLDAvis.display(vis110)