In [2]:
import gensim
import scipy.sparse
import pickle

import numpy as np

In [3]:
def remove_zero_rows(M):
    '''Function that removes all rows from sparse matrix M that contains only zero.'''
    num_nonzeros = np.diff(M.indptr)
    return M[num_nonzeros != 0]

In [4]:
# Load the full sparse matrix
S = scipy.sparse.load_npz('/dlabdata1/youtube_large/olam/matrices/S_full.npz')

In [5]:
S = remove_zero_rows(S)

In [6]:
# Load dictionnary of tokens
with open('/dlabdata1/youtube_large/olam/list_stemmed_tokens.pickle', 'rb') as f:
    list_stemmed_tokens = pickle.load(f)
f.close()

In [7]:
# Load the set of videos to be considered
with open('/dlabdata1/youtube_large/olam/idx_vid_to_consider.pickle', 'rb') as f:
    idx_vid_to_consider = pickle.load(f)
f.close()

In [8]:
oldidx2newidx = {}

In [9]:
for i, idx_vid in enumerate(idx_vid_to_consider):
    oldidx2newidx[idx_vid] = i

## Prep-process 

### A) Remove non-frequent tokens

In [10]:
# Convert to csc in order to fast check the number of non zero element in each column 
S = S.tocsc()

In [16]:
id_tokens_to_consider = []

In [17]:
# Iterate on the columns
for i in range(S.shape[1]):
    
    if i%1000000 == 0:
        print('Processed : ' + str(i) + ' videos')
        
    # Check column has more than 20 non zero entries
    if S[:,i].count_nonzero() >= 20:
        id_tokens_to_consider.append(i)

Processed : 0 videos
Processed : 1000000 videos
Processed : 2000000 videos
Processed : 3000000 videos
Processed : 4000000 videos
Processed : 5000000 videos
Processed : 6000000 videos
Processed : 7000000 videos


In [18]:
len(id_tokens_to_consider)

744141

In [19]:
S = S[:,id_tokens_to_consider]

In [20]:
oldtokenid2newtokenid = {}

In [21]:
# Get old token id from the new token id
for i, old_id in enumerate(id_tokens_to_consider):
    oldtokenid2newtokenid[old_id] = i

In [22]:
set_oldid_token_to_consider = set(oldtokenid2newtokenid.keys())

In [23]:
# Get token for each column index
id2word = {}

for i, token in enumerate(list_stemmed_tokens):
    
    if i in set_oldid_token_to_consider:
        id2word[oldtokenid2newtokenid[i]] = token

In [26]:
# Convert back to csr since there is 100 times more rows than columns! (Memory efficient)
S = S.tocsr()

In [27]:
S

<68638982x744141 sparse matrix of type '<class 'numpy.uint8'>'
	with 1393937498 stored elements in Compressed Sparse Row format>

In [28]:
scipy.sparse.save_npz('/dlabdata1/youtube_large/olam/matrices/S_final.npz', S)

In [27]:
S = S.transpose()

In [28]:
corpus = gensim.matutils.Sparse2Corpus(S)

In [21]:
corpus

<gensim.matutils.Sparse2Corpus at 0x7f8732c8f890>

In [22]:
# Set up log to terminal
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [24]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       num_topics=20,
                                       id2word=id2word, 
                                       workers=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

2020-10-02 17:59:34,021 : INFO : using symmetric alpha at 0.05
2020-10-02 17:59:34,023 : INFO : using symmetric eta at 0.05
2020-10-02 17:59:34,133 : INFO : using serial LDA version on this node
2020-10-02 17:59:35,852 : INFO : running online LDA training, 20 topics, 10 passes over the supplied corpus of 68638982 documents, updating every 300 documents, evaluating every ~3000 documents, iterating 50x with a convergence threshold of 0.001000
2020-10-02 17:59:35,856 : INFO : training LDA model using 3 processes
2020-10-02 17:59:36,465 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #100/68638982, outstanding queue size 1
2020-10-02 17:59:37,790 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #200/68638982, outstanding queue size 2
2020-10-02 17:59:37,793 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #300/68638982, outstanding queue size 3
2020-10-02 17:59:37,797 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #400/6863

  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 337, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
2020-10-02 18:00:06,128 : INFO : PROGRESS: pass 0, dispatched chunk #19 = documents up to #2000/68638982, outstanding queue size 20
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 742, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 680, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 753457 is out of bounds for axis 1 with size 744141
Process ForkPoolWorker-25:
2020-10-02 18:00:06,376 : INFO : PROGRESS: pass 0, dispatched chunk #20 = documents up to #2100/68638982, outstanding queue size 21
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
2020-10-02 18

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
2020-10-02 18:00:38,541 : INFO : PROGRESS: pass 0, dispatched chunk #33 = documents up to #3400/68638982, outstanding queue size 34
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 337, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 742, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
2020-10-02 18:00:40,706 : INFO : PROGRESS: pass 0, dispatched chunk #34 = documents up to #3500/68638982, outstanding queue size 35
  File "/home/olam/.l

2020-10-02 18:01:12,719 : INFO : PROGRESS: pass 0, dispatched chunk #47 = documents up to #4800/68638982, outstanding queue size 48
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 680, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 963665 is out of bounds for axis 1 with size 744141
2020-10-02 18:01:14,077 : INFO : PROGRESS: pass 0, dispatched chunk #48 = documents up to #4900/68638982, outstanding queue size 49
Process ForkPoolWorker-38:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
2020-10-02 18:01:15,251 : INFO : PROGRESS: pass 0, dispatched chunk #49 = documents up to #5000/68638982, outstanding queue size 50
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    ini

KeyboardInterrupt: 

  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 337, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 742, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 680, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 967280 is out of bounds for axis 1 with size 744141
Process ForkPoolWorker-45:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/mu

  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 742, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 680, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 773997 is out of bounds for axis 1 with size 744141
Process ForkPoolWorker-53:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 337, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/home/olam/.local/lib/python3.7

  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 337, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 742, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 680, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 1115756 is out of bounds for axis 1 with size 744141
Process ForkPoolWorker-62:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/m

  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 680, in inference
    expElogbetad = self.expElogbeta[:, ids]
IndexError: index 832224 is out of bounds for axis 1 with size 744141
Process ForkPoolWorker-70:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 337, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamodel.py", line 742, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/home/olam/.local/lib/python3.7

---

# Spark Implementation

In [31]:
from pyspark import SparkContext

In [None]:
sc = 