In [1]:
import numpy as np
import pickle
import pyLDAvis
import random
import scipy.sparse
import sys
import time

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.clustering import LDA, LDAModel, LocalLDAModel
from pyspark.ml.linalg import Vectors, SparseVector

  from collections import (
  from collections import (
  from collections import (
  from collections import (
  from collections import (
  class ResultIterable(collections.Iterable):


In [2]:
# Single-machine Spark configuration; tune the memory limits to your own setup.
conf = (
    SparkConf()
    .setMaster("local[12]")
    .set('spark.executor.memory', '16g')
    .set('spark.driver.memory', '64g')
    .set('spark.driver.maxResultSize', '16G')
)

# Create (or reuse) the session, then expose its context.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [16]:
def remove_col(S):
    '''Drop the columns (tokens) of S that contain no non-zero entry.

    Parameters
    ----------
    S : scipy sparse matrix (any format)

    Returns
    -------
    The sub-matrix of S restricted to the columns having at least one non-zero
    value, in CSC format. Column order is preserved.
    '''
    S = S.tocsc()

    # Count the non-zero values of every column in one vectorized pass instead of
    # slicing the matrix once per column. Comparing with 0 ignores explicitly
    # stored zeros, like count_nonzero() does.
    nonzero_per_col = np.asarray((S != 0).sum(axis=0)).ravel()
    S_keep_idx = np.flatnonzero(nonzero_per_col)

    return S[:, S_keep_idx]

In [20]:
def get_dict_for_row(row, S):
    '''Build the bag-of-words SparseVector of one document.

    row : iterable of (key, value) pairs where key[1] is the column (token) index,
          e.g. the items of a one-row DOK matrix.
    S   : matrix whose number of columns gives the size of the vector.
    '''
    counts_by_token = {position[1]: count for position, count in row}
    return SparseVector(S.shape[1], counts_by_token)

### Process train_data and test_data

We tune the number of topics by minimizing the perplexity on a subsample of videos drawn at random from all the videos.

In [8]:
# Load the full sparse count matrix (one row per video, one column per token).
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/matrices/S_final3.npz')

In [12]:
# We want to tune the optimal number of topics => find it on a random subset of videos.
# A dedicated seeded generator makes the subsample reproducible across kernel restarts,
# and the sample size is capped so that a smaller matrix does not raise ValueError.
id_vid2train = random.Random(1).sample(range(0, S.shape[0]), min(5000000, S.shape[0]))

In [18]:
# Keep only the sampled rows (videos); every token column is still present at this point.
S_sub = S[id_vid2train,:]
S_sub

<5000000x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 91261253 stored elements in Compressed Sparse Row format>

In [19]:
# Drop the token columns that never occur in the subsample (result is in CSC format).
S_sub = remove_col(S_sub)
S_sub

<5000000x630997 sparse matrix of type '<class 'numpy.uint8'>'
	with 91261253 stored elements in Compressed Sparse Column format>

In [21]:
# 80% of the subsampled rows go to the training split; the rest is the test split.
# A dedicated seeded generator keeps the split reproducible (it is pickled below).
train_data_idx = set(random.Random(1).sample(range(0, S_sub.shape[0]), int(0.8 * S_sub.shape[0])))
len(train_data_idx)

80000

In [60]:
# Persist the column-filtered subsample so the visualisation section can reload it.
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/LDA_Model/testModel/S_sub.npz', S_sub)

In [61]:
# Persist the row indices of the training split.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/LDA_Model/testModel/train_data_idx.pickle', 'wb') as f:
    pickle.dump(train_data_idx, f)

In [22]:
# Containers for the [doc index, bag-of-words SparseVector] rows of each split.
train_data = []
test_data = []

In [23]:
# Split the subsample into train/test rows.
# Data is a list of list of the following elems : index of doc and a bag-of-word sparse Vector

# Start from empty lists and counters so re-running this cell does not append duplicates.
train_data = []
test_data = []
train_data_idx_sorted = 0
test_data_idx_sorted = 0

# S_sub is in CSC format after remove_col, and extracting single rows from a CSC
# matrix is slow; convert once to CSR before the row-wise loop.
S_sub_rows = S_sub.tocsr()

for i in range(S_sub_rows.shape[0]):

    if i % 1000000 == 0:
        print(str(i) + ' videos processed')

    features = get_dict_for_row(S_sub_rows.getrow(i).todok().items(), S_sub_rows)

    # Documents are re-indexed from 0 inside each split, in increasing order of i.
    if i in train_data_idx:
        train_data.append([train_data_idx_sorted, features])
        train_data_idx_sorted += 1
    else:
        test_data.append([test_data_idx_sorted, features])
        test_data_idx_sorted += 1

0 videos processed


### Tuning the best number of topics

Since we only select the best number of topics from a subsample of videos, there is no need to keep the original indices of the videos.

- train the model on `train_data`
- compute the log-perplexity on `test_data`
- choose the number of topics such that the log-perplexity is minimized

In [24]:
# Candidate numbers of topics, with the per-candidate scores and fitted models.
numbers_topics = [5, 7, 9, 11]
perplex_scores = []
models = []

In [25]:
# Spark DataFrames with an "id" column and a "features" (bag-of-words vector) column.
train_df = spark.createDataFrame(train_data, ["id", "features"])
test_df = spark.createDataFrame(test_data, ["id", "features"])

In [26]:
# Fit one LDA model per candidate number of topics on the training split and score it
# with the log-perplexity of the test split (the lowest score wins).

# Start from empty result lists so that re-running this cell keeps the scores and models
# aligned with numbers_topics instead of appending to the results of a previous run.
perplex_scores = []
models = []

for n_topic in numbers_topics:
    print('Computing with ' + str(n_topic) + ' topics...')
    lda = LDA(k=n_topic, seed=1)
    model = lda.fit(train_df)
    logperplexity = model.logPerplexity(test_df)

    models.append(model)
    perplex_scores.append(logperplexity)

# The position of the lowest log-perplexity selects the retained number of topics.
n_topics_opt = numbers_topics[np.argmin(perplex_scores)]
print('The optimal choice for the number of topics : ' + str(n_topics_opt))

Computing with 5 topics...
Computing with 7 topics...
Computing with 9 topics...
Computing with 11 topics...
The optimal choice for the number of topics : 5


In [27]:
# Log-perplexity per candidate, in the same order as numbers_topics.
# NOTE(review): in the recorded run the score grows with the number of topics, so the
# smallest candidate wins — consider also trying values below 5.
perplex_scores

[11.914872457236118, 13.974133856796868, 16.57647259033676, 19.726110597349958]

In [28]:
# Persist the tuning scores and the selected number of topics.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/LDA_Model/testModel/perplex_scores_n_topic_optimal.pickle', 'wb') as f:
    pickle.dump({'perplex_scores':perplex_scores, 'n_topics_opt':n_topics_opt}, f)

# Save the model that achieved the lowest log-perplexity.
models[np.argmin(perplex_scores)].save('/dlabdata1/youtube_large/olam/LDA_Model/testModel/pyspark_ldamodel_subopt')

In [44]:
# Continue with the model that achieved the lowest log-perplexity, instead of a
# hard-coded position that is only right when the first candidate happens to win.
model = models[np.argmin(perplex_scores)]

### Get the model on the whole dataset

In [32]:
# Column names 'Topic0', 'Topic1', ... (one per topic) for the topic-term DataFrame.
topic_columns = ['Topic' + str(i) for i in range(n_topics_opt)]

In [None]:
# Build the rows for every video of the full matrix S.
# Data is a list of list of the following elems : index of doc and a bag-of-word sparse Vector
data = []
for i in range(S.shape[0]):

    if i % 10000000 == 0:
        print(str(i) + ' videos processed')

    # Rows are read from S (the matrix being iterated, not the subsample S_sub), and
    # get_dict_for_row needs the matrix as second argument to size the vector.
    data.append([i, get_dict_for_row(S.getrow(i).todok().items(), S)])

In [None]:
# Spark DataFrame of the full dataset: "id" column and "features" (bag-of-words) column.
df = spark.createDataFrame(data, ["id", "features"])

In [None]:
# LDA estimator for the final model; the per-document topic mixture is written to the
# 'topicDistributionCol' column.
lda = LDA(k=n_topics_opt, seed=1, topicDistributionCol='topicDistributionCol')

In [None]:
# Fit the final model on the full dataset (this rebinds `model`).
model = lda.fit(df)

#### Save the model with its attributes for visualization

In [None]:
# Persist the fitted model to disk.
model.save('/dlabdata1/youtube_large/olam/LDA_Model/pyspark_ldamodel')

In [35]:
# Summary attributes of the model, pickled further down.
model_attributes = {}

In [None]:
# NOTE(review): the perplexity entry is commented out here, yet the pickle reloaded in the
# "Test loading the files" section contains a 'perplexity' key — that file comes from
# an earlier run; confirm which version is intended.
# model_attributes['perplexity'] = model.logPerplexity(df)
model_attributes['vocabSize'] = model.vocabSize()
model_attributes['n_topic'] = model.getK()

In [None]:
# Display the collected attributes.
model_attributes

In [45]:
# Write the 10 highest-weighted terms of every topic as gzip-compressed JSON.
(
    model.describeTopics(maxTermsPerTopic=10)
    .write
    .option('compression', 'gzip')
    .json('/dlabdata1/youtube_large/olam/LDA_Model/testModel/describe_topics.json')
)

In [46]:
# Write the topic-term matrix (one row per term, one column per topic) as gzip-compressed JSON.
(
    spark.createDataFrame(model.topicsMatrix().toArray().tolist(), topic_columns)
    .write
    .option('compression', 'gzip')
    .json('/dlabdata1/youtube_large/olam/LDA_Model/testModel/topics_term_matrix.json')
)

In [47]:
# Write the per-document topic distributions as gzip-compressed JSON.
# Only the documents of train_df are transformed, so the file has one row per
# training document, not one per row of S_sub.
model.transform(train_df).write\
                    .option('compression', 'gzip')\
                    .json('/dlabdata1/youtube_large/olam/LDA_Model/testModel/topics_doc_matrix.json')

In [None]:
# Persist the model attributes.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/LDA_Model/testModel/lda_model_attributes.pickle', 'wb') as f:
    pickle.dump(model_attributes, f)

#### Test loading the files

In [3]:
# Reload a saved model from disk.
# NOTE(review): no cell of this notebook saves to this exact path (the save calls use
# '.../testModel/pyspark_ldamodel_subopt' and '.../LDA_Model/pyspark_ldamodel') —
# confirm that this directory exists and holds the intended model.
loaded_model = LocalLDAModel.load('/dlabdata1/youtube_large/olam/LDA_Model/testModel/pyspark_ldamodel')

In [109]:
# Check whether the reloaded model is a distributed one.
loaded_model.isDistributed()

False

In [5]:
# Load the pickled model attributes.
# pickle.load executes arbitrary code from the file: only load files you produced yourself.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/LDA_Model/testModel/lda_model_attributes.pickle', 'rb') as f:
    loaded_model_attributes = pickle.load(f)

In [6]:
# Display the reloaded attributes.
loaded_model_attributes

{'perplexity': 10.335341077216416, 'vocabSize': 663127, 'n_topic': 6}

In [25]:
# Reload the three JSON exports as Spark DataFrames.
# In the schemas displayed below, the columns come back in alphabetical order
# (e.g. Topic0, Topic1, Topic10, ...), not in the order they were written.
describe_topics = spark.read.json('/dlabdata1/youtube_large/olam/LDA_Model/testModel/describe_topics.json')
topics_term_matrix = spark.read.json('/dlabdata1/youtube_large/olam/LDA_Model/testModel/topics_term_matrix.json')
topics_doc_matrix = spark.read.json('/dlabdata1/youtube_large/olam/LDA_Model/testModel/topics_doc_matrix.json')

In [4]:
# Show the schema of the reloaded topic descriptions.
describe_topics

DataFrame[termIndices: array<bigint>, termWeights: array<double>, topic: bigint]

In [5]:
# Show the schema of the reloaded topic-term matrix.
topics_term_matrix

DataFrame[Topic0: double, Topic1: double, Topic10: double, Topic11: double, Topic12: double, Topic13: double, Topic14: double, Topic2: double, Topic3: double, Topic4: double, Topic5: double, Topic6: double, Topic7: double, Topic8: double, Topic9: double]

In [6]:
# Show the schema of the reloaded document-topic matrix.
topics_doc_matrix

DataFrame[features: struct<indices:array<bigint>,size:bigint,type:bigint,values:array<double>>, id: bigint, topicDistributionCol: struct<type:bigint,values:array<double>>]

## Understand results from LDA

### A) Get tokens of topics

In [7]:
# Load the dictionary mapping token ids to tokens.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/id2word_2.pickle', 'rb') as f:
    id2word = pickle.load(f)

In [10]:
# Load the token ids used when tuning the hyperparameters.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/LDA_Model/testModel/S_sub_tokens_id.pickle', 'rb') as f:
    S_sub_tokens_id = pickle.load(f)

In [40]:
# Vocabulary for the visualisation: position i -> token whose id is S_sub_tokens_id[i].
id2word_sub = {i: id2word[S_sub_tokens_id[i]] for i in range(len(S_sub_tokens_id))}
vocab = [id2word_sub[i] for i in range(len(id2word_sub))]

In [8]:
# Print the first rows of the topic descriptions.
describe_topics.show()

+--------------------+--------------------+-----+
|         termIndices|         termWeights|topic|
+--------------------+--------------------+-----+
|[205918, 198819, ...|[0.00391853497944...|   14|
|[58390, 287827, 2...|[0.01509261744494...|    4|
|[71950, 12409, 17...|[0.02916694486537...|    5|
|[311477, 116761, ...|[0.01234762794046...|    6|
|[289530, 241285, ...|[0.01632303182354...|   11|
|[298781, 139260, ...|[0.01572532080782...|    0|
|[265466, 215696, ...|[0.00551614594822...|   13|
|[149414, 214137, ...|[0.01132450816698...|    1|
|[322850, 171010, ...|[0.01156510359189...|    8|
|[267693, 220355, ...|[0.01635558984405...|    7|
|[298223, 235156, ...|[0.00390041948509...|   12|
|[29986, 32347, 34...|[0.04029124603980...|    3|
|[174471, 248230, ...|[0.01172650364764...|   10|
|[175747, 163335, ...|[0.00492471608770...|    2|
|[30068, 58390, 24...|[0.03728542261304...|    9|
+--------------------+--------------------+-----+



In [13]:
# Print the top terms of every topic with their weights.
for row in describe_topics.collect():
    print('Topic ' + str(row.topic) + ': ')
    for token_id, weight in zip(row.termIndices, row.termWeights):
        print('   With weight of ' + str(weight) + ' : ' + id2word_sub[token_id])

Topic 14: 
   With weight of 0.0039185349794417235 : forza
   With weight of 0.002804167293828836 : horizon
   With weight of 0.0021044264946760843 : boat
   With weight of 0.0016250609288909529 : dinosaur
   With weight of 0.0015809167593738996 : lawn
   With weight of 0.0012363047414706658 : yacht
   With weight of 0.0008979114360114874 : picsart
   With weight of 0.0008933895160013604 : eicma
   With weight of 0.0008074448443757646 : fiora
   With weight of 0.0006858978234397648 : rex
Topic 4: 
   With weight of 0.015092617444949842 : pokemon
   With weight of 0.005510637748795686 : smash
   With weight of 0.003233585129266435 : redmi
   With weight of 0.002823666395235558 : bros
   With weight of 0.0021614336500929148 : redempt
   With weight of 0.001978960650089408 : natok
   With weight of 0.0012775443562956879 : note
   With weight of 0.0012767256136672115 : mele
   With weight of 0.0011539613763994964 : coaster
   With weight of 0.0011142476548980063 : rdr2
Topic 5: 
   With we

### B) Get visualisation

From the documentation of pyLDAvis, we need :

- topic_term_dists: array-like, shape (n_topics, n_terms)
- doc_topic_dists :array-like, shape (n_docs, n_topics)
- doc_lengths :array-like, shape n_docs
- vocab :array-like, shape n_terms
- term_frequency :array-like, shape n_terms


In [14]:
# Reload the column-filtered subsample matrix saved earlier.
S_sub = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/LDA_Model/testModel/S_sub.npz')

In [15]:
# Total count of every token over all rows of S_sub, flattened to a 1-D array.
termFrequency = S_sub.sum(axis=0)
termFrequency = np.squeeze(np.asarray(termFrequency))

In [16]:
# The columns read back from JSON are in alphabetical order ('Topic0', 'Topic1',
# 'Topic10', ...). Restore the numeric topic order before stacking, otherwise row k of
# the array would not be topic k of the document-topic distributions.
# NOTE(review): the order of the rows (terms) relies on Spark returning the JSON records
# in the order they were written — confirm, or export an explicit term index.
ordered_topic_cols = sorted(topics_term_matrix.columns, key=lambda name: int(name[len('Topic'):]))
topics_term_matrix_array = np.array(topics_term_matrix.select(ordered_topic_cols).collect()).T

In [32]:
# Order the documents by id, then collect their topic distributions into one array.
topics_doc_matrix = topics_doc_matrix.sort('id')
topics_doc_matrix_array = np.squeeze(np.asarray(topics_doc_matrix.select('topicDistributionCol.values').collect()))

In [33]:
# Number of tokens of every row of S_sub, flattened to a 1-D array.
# NOTE(review): this covers every row of S_sub, whereas topics_doc_matrix.json was written
# from model.transform(train_df) (training rows only). The two lengths will differ, which
# may explain the pyLDAvis dimension mismatch — confirm and restrict to the training rows.
doc_lengths = S_sub.sum(axis=1)
doc_lengths = np.squeeze(np.array(doc_lengths))

In [36]:
# Assemble the pyLDAvis data from the five inputs listed in the markdown above.
vis = pyLDAvis.prepare(topic_term_dists=topics_term_matrix_array,
                       doc_topic_dists=topics_doc_matrix_array,
                       doc_lengths=doc_lengths,
                       vocab=vocab,
                       term_frequency=termFrequency)

In [37]:
# Render the interactive visualisation inline.
pyLDAvis.display(vis)

### Close spark session

In [None]:
# SparkSession exposes stop(), not close(); stopping it also stops the underlying SparkContext.
spark.stop()

## Issue: need to remove tokens of length 1 or 2, tokens containing non-ASCII characters (other alphabets), and numerical tokens

Hence, we save the filtered sparse matrix as `S_final2` and build the `id2word_2` dict that maps each token id of the new matrix to its token.

In [129]:
# Ids of the tokens to drop, and the tokens (strings) to keep.
id_token_to_remove = []
token_to_keep = []

In [130]:
def nonEnglishAlpha(s):
    '''Return True when the token `s` contains at least one non-ASCII character.

    Returns False for pure-ASCII strings, including the empty string.
    '''
    try:
        # Encoding straight to ASCII fails on any character outside the ASCII range.
        s.encode('ascii')
    except UnicodeEncodeError:
        return True
    return False


# Backward-compatible alias: the function was originally defined under this name, although
# it returns True for NON-English (non-ASCII) tokens and the filtering loop calls
# nonEnglishAlpha.
englishAlpha = nonEnglishAlpha

In [131]:
# A token is kept when it has at least 3 characters, is pure ASCII and is not numeric;
# otherwise its id is recorded for removal.
for token_id, token in id2word.items():
    keep_token = len(token) >= 3 and not nonEnglishAlpha(token) and not token.isnumeric()
    if keep_token:
        token_to_keep.append(token)
    else:
        id_token_to_remove.append(token_id)

In [132]:
# Number of token ids flagged for removal.
len(id_token_to_remove)

81014

In [133]:
# Number of tokens kept.
len(token_to_keep)

663127

In [134]:
# Sanity check: every token is either kept or removed.
len(token_to_keep) + len(id_token_to_remove) == len(id2word)

True

In [135]:
# Matrix before the token filtering.
S

<68638982x744141 sparse matrix of type '<class 'numpy.uint8'>'
	with 1393937498 stored elements in Compressed Sparse Row format>

In [136]:
# Positions 0..len(id2word)-1 minus the removed ids, in increasing order.
# This treats token ids as column positions — assumes ids are exactly 0..n-1, TODO confirm.
id_token_to_keep = (np.delete(np.arange(len(id2word)), id_token_to_remove))

In [137]:
# Keep only the retained token columns.
# S is overwritten in place: re-running this cell would filter the already-filtered matrix.
S = S[:, id_token_to_keep]

In [138]:
# Matrix after the token filtering.
S

<68638982x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 1251231562 stored elements in Compressed Sparse Row format>

In [139]:
# Convert to a set for constant-time membership tests (this changes the type of the name).
token_to_keep = set(token_to_keep)

In [142]:
# Re-number the kept tokens so that new id k is the token of column k of the filtered S.
# Mapping through id_token_to_keep (the exact column selection applied to S) avoids relying
# on the iteration order of id2word or on every token string being unique.
id2word_new = {new_id: id2word[old_id] for new_id, old_id in enumerate(id_token_to_keep)}

In [145]:
# Persist the filtered matrix.
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/matrices/S_final2.npz', S)

In [146]:
# Persist the re-numbered token dictionary.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/dlabdata1/youtube_large/olam/id2word_2.pickle', 'wb') as f:
    pickle.dump(id2word_new, f)

## Issue: pyLDAvis reports a dimension mismatch

Check whether this happens because we select a subset of videos: some tokens are then never used, so the vocabulary no longer matches the model and the distributions do not sum exactly to 1.

In [7]:
# Display S in CSR format; the result is not assigned, so S itself is left as it is.
S.tocsr()

<68638982x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 1251231562 stored elements in Compressed Sparse Row format>

In [238]:
# Switch to the column-oriented format before inspecting columns.
S_sub = S_sub.tocsc()

In [239]:
# Indices of the columns (tokens) that occur at least once in S_sub.
# One vectorized non-zero count per column replaces one sparse column slice per token.
S_sub_keep_idx = np.flatnonzero(np.asarray((S_sub != 0).sum(axis=0)).ravel()).tolist()

In [240]:
# Subsample before dropping the unused token columns.
S_sub

<1000000x663127 sparse matrix of type '<class 'numpy.uint8'>'
	with 18239875 stored elements in Compressed Sparse Column format>

In [241]:
# Keep only the token columns that occur in the subsample.
S_sub = S_sub[:,S_sub_keep_idx]
S_sub

<1000000x428785 sparse matrix of type '<class 'numpy.uint8'>'
	with 18239875 stored elements in Compressed Sparse Column format>

In [245]:
# Rows of [doc index, bag-of-words SparseVector] for every document of the subsample.
all_data = []

# S_sub is in CSC format here, and extracting single rows from a CSC matrix is slow;
# convert once to CSR before the row-wise loop.
S_sub_rows = S_sub.tocsr()

for i in range(S_sub_rows.shape[0]):
    all_data.append([i, get_dict_for_row(S_sub_rows.getrow(i).todok().items(), S_sub_rows)])

In [246]:
# Convert the rows to a Spark DataFrame (the name all_data now refers to the DataFrame, not the list).
all_data = spark.createDataFrame(all_data, ["id", "features"])

In [256]:
# Fit a 20-topic model on the whole subsample (this rebinds `lda` and `model`).
lda = LDA(k=20, seed=1)
model = lda.fit(all_data)
print('training complete')

training complete


In [257]:
# Total count of every token over all rows of S_sub, flattened to a 1-D array.
termFrequency = S_sub.sum(axis=0)
termFrequency = np.squeeze(np.asarray(termFrequency))

In [259]:
# Name the columns after the number of topics of THIS model: topic_columns was built from
# n_topics_opt and does not have one entry per topic of the 20-topic model fitted above.
topic_names = ['Topic' + str(i) for i in range(model.getK())]
topics_term_matrix = spark.createDataFrame(model.topicsMatrix().toArray().tolist(), topic_names)
topics_term_matrix_array = np.array(topics_term_matrix.collect()).T

In [260]:
# Topic distribution of every document, ordered by document id and collected into one array.
topics_doc_matrix = model.transform(all_data)
topics_doc_matrix = topics_doc_matrix.sort('id')
topics_doc_matrix_array = np.squeeze(np.array(topics_doc_matrix.select('topicDistribution').collect()))

In [261]:
# Number of tokens of every row of S_sub, flattened to a 1-D array.
doc_lengths = S_sub.sum(axis=1)
doc_lengths = np.squeeze(np.array(doc_lengths))

In [268]:
# Vocabulary restricted to the kept columns: position i -> token whose id is S_sub_keep_idx[i].
id2word_sub = {i: id2word[S_sub_keep_idx[i]] for i in range(len(S_sub_keep_idx))}
vocab = [id2word_sub[i] for i in range(len(id2word_sub))]

In [263]:
# Assemble the pyLDAvis data for the model fitted on the whole subsample.
vis = pyLDAvis.prepare(topic_term_dists=topics_term_matrix_array,
                       doc_topic_dists=topics_doc_matrix_array,
                       doc_lengths=doc_lengths,
                       vocab=vocab,
                       term_frequency=termFrequency)

In [264]:
# Render the interactive visualisation inline.
pyLDAvis.display(vis)

In [269]:
# Print the 5 highest-weighted terms of every topic.
for row in model.describeTopics(maxTermsPerTopic=5).collect():
    print('Topic ' + str(row.topic) + ': ')
    for token_id, weight in zip(row.termIndices, row.termWeights):
        print('   With weight of ' + str(weight) + ' : ' + id2word_sub[token_id])

Topic 0: 
   With weight of 0.011371145842075483 : war
   With weight of 0.011310337922244503 : beat
   With weight of 0.008066346686018059 : new
   With weight of 0.007861892048312833 : type
   With weight of 0.007758836521532167 : black
Topic 1: 
   With weight of 0.005425557364871488 : ufc
   With weight of 0.003931681556057344 : fight
   With weight of 0.0037950811772294374 : cri
   With weight of 0.0036212938544180977 : mma
   With weight of 0.003602965948907927 : intro
Topic 2: 
   With weight of 0.020218403786891638 : dota
   With weight of 0.010501690907336584 : tank
   With weight of 0.006508102507010567 : bird
   With weight of 0.0059253395537056715 : estat
   With weight of 0.004911100659932576 : angri
Topic 3: 
   With weight of 0.014983418874965092 : fish
   With weight of 0.006894717694286166 : halo
   With weight of 0.006340046616137076 : simul
   With weight of 0.006320611847609999 : train
   With weight of 0.00619561747158423 : chess
Topic 4: 
   With weight of 0.01111