In [1]:
import gensim
import scipy.sparse
import pickle

import numpy as np

In [2]:
def remove_zero_rows(M):
    num_nonzeros = np.diff(M.indptr)
    return M[num_nonzeros != 0]

In [3]:
# Load the full sparse matrix
S = scipy.sparse.load_npz('/home/olam/csr_matrices/S_full.npz')

In [4]:
S

<86000000x7987546 sparse matrix of type '<class 'numpy.uint8'>'
	with 1412203435 stored elements in Compressed Sparse Row format>

In [5]:
S = remove_zero_rows(S).transpose().tocsc()

In [6]:
corpus = gensim.matutils.Sparse2Corpus(S)

In [7]:
corpus

<gensim.matutils.Sparse2Corpus at 0x7f9dcb3a0290>

In [8]:
# Load dictionnary of tokens
with open('/home/olam/list_stemmed_tokens.pickle', 'rb') as f:
    list_stemmed_tokens = pickle.load(f)
f.close()

In [9]:
id2word = {}

for i, token in enumerate(list_stemmed_tokens):
    id2word[i] = token

In [11]:
# Set up log to terminal
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [12]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       num_topics=20,
                                       id2word=id2word, 
                                       workers=3,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

2020-09-30 18:42:04,630 : INFO : using symmetric alpha at 0.05
2020-09-30 18:42:04,633 : INFO : using symmetric eta at 0.05
2020-09-30 18:42:05,804 : INFO : using serial LDA version on this node
2020-09-30 18:42:24,312 : INFO : running online LDA training, 20 topics, 10 passes over the supplied corpus of 68638982 documents, updating every 300 documents, evaluating every ~3000 documents, iterating 50x with a convergence threshold of 0.001000
2020-09-30 18:42:24,433 : INFO : training LDA model using 3 processes
2020-09-30 18:42:24,597 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #100/68638982, outstanding queue size 1
2020-09-30 18:42:31,825 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #200/68638982, outstanding queue size 2
2020-09-30 18:42:31,828 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #300/68638982, outstanding queue size 3
2020-09-30 18:42:31,831 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #400/6863

2020-09-30 18:46:53,481 : INFO : topic #10 (0.050): 0.462*"futur" + 0.041*"energi" + 0.030*"island" + 0.017*"bolognes" + 0.016*"remix" + 0.014*"music" + 0.013*"focus" + 0.012*"capricorn" + 0.011*"leav" + 0.011*"kygo"
2020-09-30 18:46:53,818 : INFO : topic diff=0.049468, rho=0.242536
2020-09-30 18:46:54,106 : INFO : PROGRESS: pass 0, dispatched chunk #25 = documents up to #2600/68638982, outstanding queue size 5
2020-09-30 18:47:16,430 : INFO : merging changes from 400 documents into a model of 68638982 documents
2020-09-30 18:47:26,670 : INFO : topic #8 (0.050): 0.208*"walk" + 0.085*"manila" + 0.068*"5" + 0.020*"differ" + 0.017*"love" + 0.015*"world" + 0.013*"best" + 0.013*"food" + 0.012*"tri" + 0.012*"way"
2020-09-30 18:47:26,740 : INFO : topic #13 (0.050): 0.078*"vs" + 0.033*"cri" + 0.023*"game" + 0.023*"fight" + 0.021*"us" + 0.021*"lyric" + 0.020*"futur" + 0.019*"remix" + 0.018*"new" + 0.017*"best"
2020-09-30 18:47:26,808 : INFO : topic #18 (0.050): 0.151*"boy" + 0.045*"31st" + 0.04

2020-09-30 18:51:37,915 : INFO : topic #10 (0.050): 0.338*"futur" + 0.072*"focus" + 0.061*"island" + 0.031*"energi" + 0.029*"spaghetti" + 0.028*"fall" + 0.022*"still" + 0.019*"famous" + 0.019*"make" + 0.017*"instagram"
2020-09-30 18:51:37,986 : INFO : topic #0 (0.050): 0.286*"2017" + 0.109*"2018" + 0.074*"full" + 0.052*"new" + 0.049*"time" + 0.034*"first" + 0.028*"love" + 0.026*"life" + 0.019*"thing" + 0.018*"christma"
2020-09-30 18:51:38,054 : INFO : topic #9 (0.050): 0.081*"live" + 0.044*"show" + 0.040*"15th" + 0.035*"tarot" + 0.032*"push" + 0.029*"star" + 0.026*"love" + 0.026*"wish" + 0.026*"novemb" + 0.026*"virgo"
2020-09-30 18:51:38,115 : INFO : topic #4 (0.050): 0.141*"life" + 0.052*"news" + 0.033*"2017" + 0.030*"1st" + 0.029*"tarot" + 0.025*"love" + 0.025*"15th" + 0.024*"capricorn" + 0.023*"go" + 0.021*"chang"
2020-09-30 18:51:38,176 : INFO : topic #15 (0.050): 0.077*"song" + 0.062*"kid" + 0.059*"friend" + 0.054*"full" + 0.037*"max" + 0.033*"music" + 0.032*"best" + 0.030*"sound"

2020-09-30 18:54:45,545 : INFO : PROGRESS: pass 0, dispatched chunk #66 = documents up to #6700/68638982, outstanding queue size 7
2020-09-30 18:55:35,188 : INFO : merging changes from 700 documents into a model of 68638982 documents
2020-09-30 18:55:45,664 : INFO : topic #7 (0.050): 0.049*"love" + 0.049*"subscrib" + 0.045*"pisc" + 0.043*"15th" + 0.043*"tarot" + 0.038*"season" + 0.030*"1st" + 0.028*"singl" + 0.026*"relationship" + 0.025*"septemb"
2020-09-30 18:55:45,725 : INFO : topic #15 (0.050): 0.065*"kid" + 0.064*"song" + 0.056*"old" + 0.055*"friend" + 0.045*"full" + 0.038*"max" + 0.032*"joe" + 0.030*"best" + 0.027*"music" + 0.027*"tire"
2020-09-30 18:55:45,785 : INFO : topic #10 (0.050): 0.271*"futur" + 0.079*"make" + 0.061*"focus" + 0.051*"still" + 0.048*"island" + 0.047*"rob" + 0.047*"energi" + 0.027*"fall" + 0.024*"instagram" + 0.021*"spaghetti"
2020-09-30 18:55:45,846 : INFO : topic #3 (0.050): 0.333*"reaction" + 0.072*"arm" + 0.044*"best" + 0.031*"real" + 0.021*"tv" + 0.020*"

2020-09-30 18:59:27,388 : INFO : topic diff=0.025628, rho=0.111111
2020-09-30 18:59:27,666 : INFO : PROGRESS: pass 0, dispatched chunk #85 = documents up to #8600/68638982, outstanding queue size 2
2020-09-30 18:59:35,154 : INFO : PROGRESS: pass 0, dispatched chunk #86 = documents up to #8700/68638982, outstanding queue size 2
2020-09-30 18:59:42,531 : INFO : PROGRESS: pass 0, dispatched chunk #87 = documents up to #8800/68638982, outstanding queue size 2
2020-09-30 18:59:59,580 : INFO : merging changes from 300 documents into a model of 68638982 documents
2020-09-30 19:00:10,125 : INFO : topic #17 (0.050): 0.139*"2018" + 0.075*"year" + 0.071*"2017" + 0.045*"world" + 0.039*"2020" + 0.035*"run" + 0.027*"2019" + 0.025*"oil" + 0.023*"popular" + 0.022*"b"
2020-09-30 19:00:10,190 : INFO : topic #4 (0.050): 0.121*"life" + 0.056*"news" + 0.041*"bad" + 0.028*"look" + 0.027*"go" + 0.027*"2017" + 0.026*"chang" + 0.024*"1st" + 0.023*"tarot" + 0.021*"let"
2020-09-30 19:00:10,253 : INFO : topic #0 

2020-09-30 19:03:45,455 : INFO : topic diff=0.166805, rho=0.100504
2020-09-30 19:03:45,590 : INFO : PROGRESS: pass 0, dispatched chunk #103 = documents up to #10400/68638982, outstanding queue size 2
2020-09-30 19:03:53,096 : INFO : PROGRESS: pass 0, dispatched chunk #104 = documents up to #10500/68638982, outstanding queue size 2
2020-09-30 19:03:58,529 : INFO : PROGRESS: pass 0, dispatched chunk #105 = documents up to #10600/68638982, outstanding queue size 3
2020-09-30 19:03:58,531 : INFO : PROGRESS: pass 0, dispatched chunk #106 = documents up to #10700/68638982, outstanding queue size 4
2020-09-30 19:03:58,534 : INFO : PROGRESS: pass 0, dispatched chunk #107 = documents up to #10800/68638982, outstanding queue size 5
2020-09-30 19:03:58,537 : INFO : PROGRESS: pass 0, dispatched chunk #108 = documents up to #10900/68638982, outstanding queue size 6
2020-09-30 19:03:58,540 : INFO : PROGRESS: pass 0, dispatched chunk #109 = documents up to #11000/68638982, outstanding queue size 7
20

Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/home/olam/.local/lib/python3.7/site-packages/gensim/models/ldamulticore.py", line 334, in worker_e_step
    chunk_no, chunk, worker_lda = input_queue.get()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/opt/anaconda3/lib/

KeyboardInterrupt: 