# Neroma Kossi : 3A ENSAE, AS-DS
> Projet du cours d'éléments logiciels pour le traitement de données massives

In [3]:
%%html
<img src="Apache_Spark_logo.svg.png",width=10,height=10>

Our project consists of parallelising the Latent Dirichlet Allocation (**LDA**) algorithm. The base paper is [PLDA, a parallel Gibbs-sampling-based algorithm](https://www.semanticscholar.org/paper/PLDA%3A-Parallel-Latent-Dirichlet-Allocation-for-Wang-Bai/376ffb536c3dc5675e9ab875b10b9c4a1437da5d).

The main idea is to run several Gibbs samplers concurrently. This could be done with a distributed framework such as MPI or MapReduce; we consider the latter in this project. PySpark is the library we use for the MapReduce architecture.

# Table of contents

>## 1. Create the Spark context

 > ## 2. Data pre-processing
  * **2.1. Load the data from file**
  * **2.2. Preprocessing**
  * **2.3. Building the vocabulary and the set of docs**
    * 2.3.1. Building the vocabularies (one per partition)
    * 2.3.2. Building docMaps : the set of all the documents (one per partition)
    * 2.3.3. Test if vocabularies and docMaps are correctly built
  * **2.4. Prepare the data for the Gibbs samplers**
      * 2.4.1. Encode corpus
      * 2.4.2. Save the whole work
      
>## 3. Parallel LDA with mapReduce
  * **3.1. Set some parameters**
  * **3.2. Run the algorithm**
  * **3.3. Post-training analysis**

>## 4. Conclusion

In [4]:
from pyspark import SparkConf,  SparkContext  # Spark

In [5]:
import numpy as np # math ops
import os, shutil, json #File ops
import pickle as pkl # Serialiser

from datetime import datetime
import time

In [6]:
# Some utilities saved into custom modules

from nlp import preprocessAndGetTokens
from fileUtils import load, pickleLoader, dump, saveByPartition

# 1. Create the Spark Context

In [7]:
# JVM memory limits. They have to be exported before the SparkContext is
# instantiated; afterwards it would be too late.
driver_memory = '1g'      # Max memory available for the driver
executor_memory = '200m'  # Max memory by executor
pyspark_submit_args = ' --driver-memory %s --executor-memory %s pyspark-shell' % (
    driver_memory, executor_memory)
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [8]:
# Spark settings: application name, local master and FAIR scheduling mode.
conf = (
    SparkConf()
    .setAppName('pLDA')
    .setMaster('local[*]')  # the number of cores is set to max
    .set('spark.scheduler.mode', 'FAIR')
)

In [9]:
spark = SparkContext(conf = conf) # Here we create the Spark context

In [10]:
spark._conf.getAll()

[('spark.rdd.compress', 'True'),
 ('spark.app.name', 'pLDA'),
 ('spark.driver.port', '44889'),
 ('spark.app.id', 'local-1549594580221'),
 ('spark.scheduler.mode', 'FAIR'),
 ('spark.driver.host', '192.168.0.41'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.memory', '1g'),
 ('spark.ui.showConsoleProgress', 'true')]

# 2. Data pre-processing

Our dataset is made of many abstracts of research papers in Computer Science, Neuroscience, and Biomedical. It is available at no cost on https://labs.semanticscholar.org/corpus/

## 2.1. Load the data from file

In [11]:
def LoadPaperAbstract(docstr, trunc = 1500):
    """Parse one JSON-encoded paper and keep its id, abstract and title.

    Parameters
    ----------
    docstr : str
        JSON document holding an ``"id"`` entry; ``"paperAbstract"`` and
        ``"title"`` are read when present.
    trunc : int, optional
        Maximum number of characters kept from the abstract and from the
        title (1500 by default).

    Returns
    -------
    tuple
        ``(paper_id, abstract, title)``, both texts cut to ``trunc`` characters.

    Raises
    ------
    KeyError
        If the document has no ``"id"`` entry.
    """
    doc = json.loads(docstr)
    # A missing or null text field becomes an empty string, so an incomplete
    # record yields empty texts instead of raising.
    abstract = doc.get("paperAbstract") or ""
    title = doc.get("title") or ""
    return (doc["id"], abstract[:trunc], title[:trunc])

In [12]:
def processPaperAbstract(abstract):
    """Tokenize a paper abstract and return the tokens as a numpy array.

    Thin wrapper around ``preprocessAndGetTokens``, which applies some basic NLP
    techniques to the abstract: lowercasing, stopword removal, stemming...
    """
    tokens = preprocessAndGetTokens(abstract)
    return np.array(tokens)

In [13]:
nbPartitions = 10 # Number of partitions. This matters because our Gibbs sampler is designed to
                # launch one sampler per partition

> **Let's read the data**

In [14]:
data = spark.pickleFile("corpus/bigSample")

In [15]:
%%time
# format == (doc_id, doc_abstract, doc_title)
data.take(3)

CPU times: user 5.74 ms, sys: 4.26 ms, total: 10 ms
Wall time: 2 s


[('4185240187f80572387276f1c62ad0e62f1209f1',
  'Exponential Smooth Transition Autoregressive (ESTAR) models are an example of a nonlinear time series model. They are regime switching models that connect two autoregressive regimes in a smooth way, due to a smooth transition function of an exponential type. For instance, ESTAR models have been very popular for modeling real exchange rates as their symmetric, U-shaped transition functions allow an economic interpretation, i.e. the real exchange rate between currency A and B should behave the same way as the reciprocal of the real exchange rate between currency B and A. Moreover, one wants to allow the real exchange rate to move freely like a random walk near an equilibrium and being pulled back to it once they move too far off. Such a behavior is in line with important economic theories such as purchasing power parity and is usually modeled with a globally stationary ESTAR model having one unit root regime.',
  'Identification problems i

In [16]:
%%time
# Persist one {doc_id: doc_title} mapping per paper.
# Records are (doc_id, doc_abstract, doc_title): the title is x[2], while x[1] is the abstract.
titles_folder = "matrix/docTitles/"
if os.path.exists(titles_folder):
    shutil.rmtree(titles_folder)  # remove the output of a previous run before writing again
data.map(lambda x: {x[0]: x[2]}).saveAsPickleFile(titles_folder) # Save doc titles

CPU times: user 2.72 ms, sys: 7.72 ms, total: 10.4 ms
Wall time: 3.21 s


## 2.2. Preprocessing

In [17]:
%%time

# Tokenize every abstract, drop the documents left without any token, then save the dataset.
folder = "corpus/train/"
if os.path.exists(folder) :
    shutil.rmtree(folder)  # remove the output of a previous run

# mapValues only uses x[0] and x[1]: the records become (doc_id, tokens).
data = (data.mapValues(processPaperAbstract)
            .filter(lambda x : len(x[1]) > 0))

# The second positional argument of saveAsPickleFile is the batch size.
data.saveAsPickleFile(folder, 500)

CPU times: user 16.1 ms, sys: 2.67 ms, total: 18.8 ms
Wall time: 1min 39s


In [18]:
data.take(1) # A sample of the tokenized dataset

[('4185240187f80572387276f1c62ad0e62f1209f1',
  array(['pariti', 'theori', 'unit', 'autoregress', 'power', 'equilibrium',
         'regim', 'symmetr', 'ie', 'want', 'import', 'function', 'switch',
         'type', 'stationari', 'root', 'currenc', 'far', 'one', 'move',
         'reciproc', 'instanc', 'moreov', 'behavior', 'pull', 'model',
         'transit', 'popular', 'back', 'estar', 'freeli', 'global', 'allow',
         'way', 'exchang', 'purchas', 'ushap', 'nonlinear', 'rate', 'walk',
         'behav', 'exampl', 'like', 'exponenti', 'due', 'line', 'time',
         'near', 'econom', 'seri', 'usual', 'two', 'veri', 'random',
         'smooth', 'connect', 'real', 'interpret', 'onc'], dtype='<U11'))]

> ***Here, our dataset is in its initial format `(docId, docTokens)`. Next, we will assign a random topic to each word in a document. We will also need to build the vocabulary and the set of documents.***

## 2.3. Building the vocabulary and the set of docs (one per partition)

**Reloading and partitioning the dataset**

In [19]:
corpus = spark.pickleFile("corpus/train" ).repartition(nbPartitions)
corpus.getNumPartitions()

10

In [20]:
%%time
corpus.take(1)

CPU times: user 10 ms, sys: 405 µs, total: 10.4 ms
Wall time: 3.44 s


[('6733a6e1a0fed06fa8800e01fde8bb43ef4ebcae',
  array(['protein', 'gene', 'control', 'recombin', 'vari', 'consist',
         'associ', 'alphaproteas', 'cm', 'analys', 'inhibitor', 'frequenc',
         'data', 'allotyp', 'pig', 'appropri', 'encod', 'order', 'serum',
         'antigen', 'pi', 'two', 'pipoapobpig', 'mate', 'locus'],
        dtype='<U12'))]

### 2.3.1. Build the vocabularies (one per partition)

In [21]:
from builder  import makeVocabularies, makeVocabulariesFolder, getUniqueWords, getUniqueWords2

In [22]:
makeVocabulariesFolder() # Instantiate the vocabularies' folder

In [23]:
%%time

# Here we compute the set of unique words. As words can sometimes be very long, we would rather keep only their ids.
# In the next steps, each word is assigned a number ranging from 0 to V-1, where V == size of our vocabularies
uniqueWordsByPartition = corpus.mapPartitionsWithIndex(getUniqueWords).collect()

CPU times: user 95.3 ms, sys: 246 ms, total: 342 ms
Wall time: 1min 25s


In [24]:
# corpus.glom().map(len).collect()

In [25]:
# Per-partition summary: "ndocs" is the length of the first element of each
# collected entry, "nvocabs" the length of its second element.

L = [
    {"Partition": "{:02d}".format(idx), "ndocs": len(entry[0]), "nvocabs": len(entry[1])}
    for idx, entry in zip(range(nbPartitions), uniqueWordsByPartition)
]
L[:5]

[{'Partition': '00', 'ndocs': 6880, 'nvocabs': 48536},
 {'Partition': '01', 'ndocs': 6885, 'nvocabs': 48917},
 {'Partition': '02', 'ndocs': 6898, 'nvocabs': 48502},
 {'Partition': '03', 'ndocs': 6892, 'nvocabs': 49273},
 {'Partition': '04', 'ndocs': 6900, 'nvocabs': 48311}]

In [26]:
print("Totoal docs : %d "%sum(l["ndocs"] for l in L))

Totoal docs : 68922 


In [27]:
%%time
# Here we build the vocabularies, one per partition

makeVocabularies([ w[1] for w in  uniqueWordsByPartition]) # Build and save the vocabularies

Vocabulary 0 successfully built
Vocabulary 1 successfully built
Vocabulary 2 successfully built
Vocabulary 3 successfully built
Vocabulary 4 successfully built
Vocabulary 5 successfully built
Vocabulary 6 successfully built
Vocabulary 7 successfully built
Vocabulary 8 successfully built
Vocabulary 9 successfully built

 Global vocabulary  built too
CPU times: user 12 s, sys: 2.66 s, total: 14.6 s
Wall time: 11.8 s


In [28]:
del uniqueWordsByPartition # free up some memory

### 2.3.2. Make docMaps :  the set of all the documents

In [29]:
from builder import makeDocsMaps, makeDocsMapsFolder

In [30]:
makeDocsMapsFolder() # Instantiate the documents' folder

In [31]:
%%time

corpus.mapPartitionsWithIndex(makeDocsMaps).collect()

CPU times: user 3.93 ms, sys: 3.03 ms, total: 6.96 ms
Wall time: 2.78 s


['docMap 0 successfully built',
 'docMap 1 successfully built',
 'docMap 2 successfully built',
 'docMap 3 successfully built',
 'docMap 4 successfully built',
 'docMap 5 successfully built',
 'docMap 6 successfully built',
 'docMap 7 successfully built',
 'docMap 8 successfully built',
 'docMap 9 successfully built']

### 2.3.3. Test if vocabularies and docMaps are correctly built

Now that the vocabularies and docMaps have been built successfully, let's load them.

In [32]:
%%time
vocabAll = load("matrix/vocabulary/vocabAll")

vocabs = [load("matrix/vocabulary/vocab__%04d__"%ind) for ind in range(nbPartitions)] 

CPU times: user 1.63 s, sys: 85.6 ms, total: 1.71 s
Wall time: 1.71 s


In [33]:
print("Total words in Vocab : ", len(vocabAll))

Total words in Vocab :  228584


In [34]:
%%time
from builder import loadDocsAll
docsAll = loadDocsAll(nbPartitions)

docs = [load("matrix/docsMap/docs__%04d__"%ind) for ind in range(nbPartitions)] 

CPU times: user 35.6 ms, sys: 11 ms, total: 46.6 ms
Wall time: 46 ms


In [35]:
%%time
nbDocs = [len(docMap) for docMap in docs]  # Number of documents per partition
nbVocabs = [len(vocab) for vocab in vocabs]  # Number of unique words (vocabulary) per partition
print(nbDocs[:2], nbVocabs[:2])

[6880, 6885] [48536, 48917]
CPU times: user 278 µs, sys: 61 µs, total: 339 µs
Wall time: 260 µs


## 2.4. Prepare the data for the training step
> This step involves encoding the corpus and adding topics : using ids instead of full text

### 2.4.1. Encoding the corpus

In [36]:
from builder  import encodeAddTopics

In [37]:
%%time 

# The corpus is still in full text here; the next step replaces the words with ids
corpus.take(1)

CPU times: user 4.89 ms, sys: 1.08 ms, total: 5.97 ms
Wall time: 147 ms


[('6733a6e1a0fed06fa8800e01fde8bb43ef4ebcae',
  array(['protein', 'gene', 'control', 'recombin', 'vari', 'consist',
         'associ', 'alphaproteas', 'cm', 'analys', 'inhibitor', 'frequenc',
         'data', 'allotyp', 'pig', 'appropri', 'encod', 'order', 'serum',
         'antigen', 'pi', 'two', 'pipoapobpig', 'mate', 'locus'],
        dtype='<U12'))]

In [38]:
nbTopics = 10

In [44]:
os.listdir("corpus/train")

['.part-00009.crc',
 'part-00003',
 '.part-00006.crc',
 '.part-00001.crc',
 'part-00002',
 'part-00008',
 'part-00001',
 'part-00009',
 'part-00007',
 '.part-00007.crc',
 '.part-00003.crc',
 'part-00006',
 '_SUCCESS',
 '.part-00005.crc',
 '.part-00004.crc',
 'part-00004',
 '.part-00000.crc',
 'part-00005',
 '.part-00002.crc',
 'part-00000',
 '._SUCCESS.crc',
 '.part-00008.crc']

In [39]:
# Encode each partition: words and documents are replaced by their ids, and topics are added too.
# Partition `ind` is encoded with its own docMap (docs[ind]) and vocabulary (vocabs[ind]).
def _encodePartition(ind, part):
    return encodeAddTopics(ind, part, docs[ind], vocabs[ind], nbTopics)

corpus2 = corpus.mapPartitionsWithIndex(_encodePartition, preservesPartitioning = True)

In [40]:
%%time
corpus2.take(1) # Just word's and doc's ids now, topics have been added too

CPU times: user 37.7 s, sys: 673 ms, total: 38.4 s
Wall time: 41 s


[(0,
  (4509, array([47515, 39315, 24789, 39568, 15276,   495,  3385,  4480, 32873,
          10196, 11193, 23603, 39721, 18156, 47922, 37335, 12901, 19408,
           9197, 21665,  9016, 43880, 29411, 43040, 35889]), array([9, 8, 5, 2, 1, 4, 3, 8, 6, 9, 9, 3, 4, 7, 1, 2, 9, 5, 4, 6, 6, 4,
          9, 1, 1])))]

### 2.4.2. Save the whole work for the next step 

In [41]:
from fileUtils import saveAsPickleFile

In [42]:
%%time
# saveAsPickleFile(corpus2)
# Remove the output of a previous run, then write the encoded corpus to "initial_train".
if os.path.exists("initial_train"):
    shutil.rmtree("initial_train")
# NOTE(review): the second positional argument of RDD.saveAsPickleFile is batchSize (objects per
# pickled batch), not a number of output files — confirm that 1 is really intended here.
# NOTE(review): the run recorded in this notebook aborted on this call with an executor heartbeat timeout.
corpus2.saveAsPickleFile("initial_train", 1)

Py4JJavaError: An error occurred while calling o155.saveAsObjectFile.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1096)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1094)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1067)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1032)
	at org.apache.spark.rdd.SequenceFileRDDFunctions$$anonfun$saveAsSequenceFile$1.apply$mcV$sp(SequenceFileRDDFunctions.scala:69)
	at org.apache.spark.rdd.SequenceFileRDDFunctions$$anonfun$saveAsSequenceFile$1.apply(SequenceFileRDDFunctions.scala:54)
	at org.apache.spark.rdd.SequenceFileRDDFunctions$$anonfun$saveAsSequenceFile$1.apply(SequenceFileRDDFunctions.scala:54)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.SequenceFileRDDFunctions.saveAsSequenceFile(SequenceFileRDDFunctions.scala:54)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsObjectFile$1.apply$mcV$sp(RDD.scala:1526)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsObjectFile$1.apply(RDD.scala:1526)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsObjectFile$1.apply(RDD.scala:1526)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.saveAsObjectFile(RDD.scala:1523)
	at org.apache.spark.api.java.JavaRDDLike$class.saveAsObjectFile(JavaRDDLike.scala:565)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsObjectFile(JavaRDDLike.scala:45)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:844)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 15.0 failed 1 times, most recent failure: Lost task 1.0 in stage 15.0 (TID 56, localhost, executor driver): ExecutorLostFailure (executor driver exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 229933 ms
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:78)
	... 41 more


> This is the end of the data preprocessing: the data is now in the right format and we can run our `Gibbs samplers`. Let's start.

In [None]:
del data, corpus, corpus2 # free up some memories

# 3. Parallel LDA (mapReduce version)

> Here the ML part

## 3.1. Define some parameters

In [None]:
nbVocabAll = len(vocabAll) # number of entries in the global vocabulary
alpha = 0.1 # presumably the Dirichlet prior on the document-topic distributions — confirm in model.pldaMap0
beta = 0.1 # presumably the Dirichlet prior on the topic-word distributions — confirm in model.pldaMap0

In [None]:
from builder  import init

In [None]:
# from builder import makeConfig, updateConfig, get_now
# makeConfig(id = "all", countWordsUpdated = {str(ind):False for ind in range(nbPartitions)}, time = get_now())

## 3.2. Training

In [None]:
import importlib, model, builder, fileUtils
importlib.reload(model)
importlib.reload(builder)
importlib.reload(fileUtils)
from fileUtils import saveAsPickleFile
from model import pldaMap0
from builder import updateCountWordsAll, init

In [None]:
# pldaMap(0, 1, alpha, beta, len(vocabAll), nbTopics)

In [None]:
# rdd = spark.pickleFile("pickle/")
# rdd.getNumPartitions()

In [None]:
rdd = spark.pickleFile("initial_train")
# (doc_id, doc_words, doc_topics) <--- the format
# NOTE(review): corpus2.take(1) shows records shaped (key, (doc_id, doc_words, doc_topics)); the bare
# triple is presumably what remains once the key is dropped with .map(lambda x: x[1]) — confirm.
rdd.take(1)

In [None]:
%%time

t0 = time.time()
# Reload the encoded corpus, partition it by key into nbPartitions, then keep only x[1] of each record.
rdd = spark.pickleFile("initial_train").partitionBy(nbPartitions).map(lambda x: x[1])
# rdd = corpus2
init(rdd, vocabs, nbDocs, nbVocabs, len(vocabAll), nbTopics)


# 20 passes; each pass applies pldaMap0 to every partition.
for i in range(20):
    rdd = rdd.mapPartitionsWithIndex(lambda ind, part : pldaMap0(ind, part, alpha, beta, nbVocabAll, nbTopics),
                       preservesPartitioning= True )
    # NOTE(review): presumably writes the RDD under "pickle/", which is reloaded on the next line — confirm in fileUtils
    saveAsPickleFile(rdd)
    rdd = spark.pickleFile("pickle/").partitionBy(nbPartitions).map(lambda x: x[1])
    # NOTE(review): presumably refreshes the global word counts between passes — confirm in builder
    updateCountWordsAll()
    if i%10 == 0 :
        # With 20 passes, progress is reported for i == 0 and i == 10 only.
        print("iteration : {0}, Elapsed : {1}".format(i, time.time() - t0))
print("Total time : {}".format(time.time() - t0))

## 3.3. Post-training analysis

In [None]:
%%time
cl = 5 # value compared with the row-wise argmax of countDocs in the next cell (presumably a topic index)
subdoc = 1 # index used both in the countDocs file name and to select docs[subdoc]
countDocs = load("matrix/countDocs/docs__%04d__"%subdoc)
countDocs[:10]

In [None]:
# Row-wise argmax of countDocs; presumably rows are documents and columns are topics — TODO confirm
topics = countDocs.argmax(1)
v = np.where(topics == cl) # positions of the rows whose argmax equals cl
countDocs[v]

In [None]:
# (key, value) pairs of docs[subdoc], turned into an array
dks = np.array( list(docs[subdoc].items()))
# Keep the pairs at the positions selected in v.
# NOTE(review): assumes the item order of docs[subdoc] matches the row order of countDocs — confirm
cluster = dks[v]
cluster[:10]

In [None]:
corpus = spark.pickleFile("corpus/corpus-46/part-00000" ).repartition(nbPartitions)

In [None]:
%%time
# Keep the records whose x[0] appears in the first column of `cluster`, and show ten of them
corpus.filter(lambda x : np.isin(x[0], cluster[:, 0])).take(10)

# 4. Conclusion