# Ideas

* As the gaps between global updates get larger, the Gibbs sampler's convergence to the true distribution might become worse. How can we test this idea? By looking at the number of simulations needed before convergence?

* As the gaps between global updates get larger, the number of communications will decrease and thus the algorithm will become faster. Can we test that idea?

In [1]:
from pyspark.sql import SparkSession, DataFrame, Row
from pyspark import TaskContext
from pyspark import AccumulatorParam
from pyspark.sql  import functions as F
from pyspark.sql.types import StructType,StructField, FloatType ,IntegerType, StringType, MapType, ArrayType,LongType
# from pyspark.sql.functions import UserDefinedFunction as udf

In [2]:
# from pyspark import SparkContext
import numpy as np, os, shutil, json, time
import pickle as pkl
from datetime import datetime
from prep import preprocessAndGetTokens

In [3]:
# Build (or reuse) a Spark session named "lda" running with the "local" master.
# The rest of the notebook reaches the RDD API through `spark.sparkContext`.
spark = SparkSession.builder\
    .master("local") \
    .appName("lda") \
    .getOrCreate()#.sparkContext

In [4]:
import json

In [5]:
def processAbstract(docstr):
    """Turn one JSON-encoded record into an ``(id, tokens)`` pair.

    Parameters
    ----------
    docstr : str
        JSON text of a record; the keys "paperAbstract" and "id" are read.

    Returns
    -------
    tuple
        ``(record["id"], tokens)`` where ``tokens`` is a numpy array built
        from the output of ``preprocessAndGetTokens`` on the abstract.
    """
    record = json.loads(docstr)
    tokenList = preprocessAndGetTokens(record["paperAbstract"])
    return (record["id"], np.array(tokenList))

In [6]:
data = spark.sparkContext.textFile("sample-S2-records")

In [7]:
corpus = data.map(processAbstract).filter(lambda x : len(x[1]) > 0)
corpus.take(1)

[('4cbba8127c8747a3b2cfb9c1f48c43e5c15e323e',
  array(['unit', 'analysi', 'compar', 'includ', 'histor', 'decemb',
         'surgeri', 'subclassifi', 'paradigm', 'chemotherapi', 'treatment',
         'diagnosi', 'cancer', 'trend', 'complet', 'januari', 'appear',
         'admiss', 'recent', 'final', 'toward', 'complic', 'result',
         'accord', 'receiv', 'resect', 'ovarian', 'frequenc', 'os',
         'support', 'treat', 'method', 'outcom', 'use', 'versus', 'patient',
         'postop', 'extract', 'evalu', 'time', 'standard', 'diagnos',
         'tumor', 'nactid', 'retrospect', 'achiev', 'overal', 'higher',
         'kaplanmei', 'vs', 'record', 'advanc', 'diseas', 'iv', 'pds',
         'shift', 'site', 'stage', 'identifi', 'neoadjuv', 'intens', 'like',
         'residu', 'epitheli', 'less', 'benefit', 'hypothes', 'total',
         'longer', 'group', 'underw', 'month', 'surviv', 'median',
         'dissemin', 'data', 'signific', 'interv', 'morbid', 'similar',
         'newli', 'frequ

In [8]:
def load(path):
    """Return the first pickled object stored in the file at `path`.

    NOTE: unpickling executes arbitrary code; only use on files this
    notebook wrote itself.
    """
    with open(path, "rb") as handle :
        return pkl.load(handle)

def dump(obj,path, mode = "wb"):
    """Pickle `obj` into the file at `path`, opened with `mode`.

    The default mode "wb" overwrites the file; a mode such as "ab" appends
    another pickle record after the existing content.
    """
    handle = open(path, mode)
    try:
        pkl.dump(obj, handle)
    finally:
        handle.close()

In [9]:
def saveByPartition(ind, part, folder = "matrix/corpusTopic", mode = "wb", batchsize = 10):
    """Spill the records of one partition to disk as pickled batch files.

    Records are written to ``<folder>/partition__<ind>__/batch_<counter>``,
    at most `batchsize` records per file, each record as its own pickle
    record (read them back by calling ``pkl.load`` until EOFError).

    Parameters
    ----------
    ind : int
        Partition index, used in the directory name.
    part : iterable
        Records of the partition (an iterator or any other iterable).
    folder : str
        Parent directory; created if it does not exist.
    mode : str
        "wb" wipes a pre-existing partition directory first; any other mode
        (e.g. "ab") keeps it and opens the batch files with that mode.
    batchsize : int
        Maximum number of records per batch file (values below 1 act as 1).

    Returns
    -------
    list
        Always an empty list, so the function can be passed to
        ``mapPartitionsWithIndex``.
    """
    import itertools

    root = folder + "/partition__%04d__"%ind
    if mode == "wb" and os.path.exists(root):
        shutil.rmtree(root)
    # exist_ok: in append mode the directory is expected to be there already.
    os.makedirs(root, exist_ok=True)

    # A single iterator guarantees each record is consumed exactly once, even
    # when `part` is a re-iterable container such as a list.
    records = iter(part)
    perFile = max(1, batchsize)
    counter = 0
    while True :
        batch = list(itertools.islice(records, perFile))
        if not batch :
            # Nothing left: stop without creating (or touching) another file.
            break
        with open(root + "/batch_%010d"%counter, mode) as f :
            for el in batch :
                pkl.dump(el, f)
        counter += 1
    return []

In [10]:
def makeVocabulary(corpus_rdd,npPartitons,  mode = "wb"):
    """Build and pickle the global vocabulary and one vocabulary per key.

    Parameters
    ----------
    corpus_rdd : RDD of ``(key, (docId, tokens))``
        ``tokens`` is an array of word strings.
    npPartitons : int
        Unused; kept for interface compatibility.
    mode : str
        File mode used to open every output file.

    Writes
    ------
    matrix/vocabulary/vocabAll
        dict ``word -> global id``, ids following the sorted word order.
    matrix/vocabulary/vocab__<key>__
        dict ``word -> (local id, global id)``, local ids following the
        sorted word order inside that key's vocabulary.
    """
    # np.unique per document first: a key that holds a single document never
    # goes through the reducer, and its raw token array would otherwise be
    # used as-is (unsorted, duplicates producing gaps in the local ids).
    vocabByPartition = corpus_rdd\
                .mapValues(lambda x : np.unique(x[1]))\
                .reduceByKey(lambda x, y : np.union1d(x,y) )\
                    .collect()

    vocabAll = np.unique(np.concatenate([ v[1] for v in vocabByPartition]))
    vocabAll =  {w:ind for ind, w in enumerate(vocabAll) }
    with open("matrix/vocabulary/vocabAll", mode) as f :
        pkl.dump(vocabAll, f)

    for key, words in vocabByPartition :
        wLocIdGlobId = {w : (ind, vocabAll[w]) for ind, w in enumerate(words) }
        with open("matrix/vocabulary/vocab__%04d__"%key, mode) as f :
            pkl.dump(wLocIdGlobId, f)

In [11]:
def makeDocsMap(corpus_rdd,npPartitons, mode = "wb"):
    """Pickle, for every key of `corpus_rdd`, a dict ``docId -> local index``.

    `corpus_rdd` holds ``(key, (docId, tokens))`` records. Documents sharing
    a key are numbered 0, 1, 2, ... in the order the grouped list returns
    them, and the dict is written to ``matrix/docsMap/docs__<key>__`` opened
    with `mode`. `npPartitons` is not used.
    """
    grouped = corpus_rdd.mapValues(lambda x : x[0] )
    grouped = grouped.groupByKey().mapValues(list).collect()

    for key, docIds in grouped :
        localIndex = {}
        for position, docId in enumerate(docIds) :
            localIndex[docId] = position
        with open("matrix/docsMap/docs__%04d__"%key, mode) as out :
            pkl.dump(localIndex, out)

In [12]:
nbPartitions = 4

In [13]:
# Key every (docId, tokens) record with a random integer drawn from
# range(nbPartitions), repartition on that key and cache the result.
# NOTE(review): no random seed is set in the visible cells, so the assignment of
# documents to partitions presumably changes from run to run.
corpus2 = corpus.map(lambda x : (np.random.choice(nbPartitions), x))\
            .partitionBy(nbPartitions).cache()

In [14]:
# corpus2.take(2)

In [15]:
makeVocabulary(corpus2, nbPartitions)

In [16]:
makeDocsMap(corpus2, nbPartitions)

In [17]:
len(load("matrix/vocabulary/vocabAll"))

2701

In [18]:
len(load("matrix/vocabulary/vocab__0000__"))

1044

In [19]:
len(load("matrix/docsMap/docs__0000__"))

21

In [20]:
vocabAll = load("matrix/vocabulary/vocabAll")
vocabAll['provid']

1934

In [21]:
from scipy.sparse import coo_matrix

In [22]:
# countWords2 = corpTop.flatMap(lambda x : [(u,1) for u in zip(x[1], x[2])]).countByKey()
# coo = np.array(list(countWords2.keys()))
# data = np.array(list(countWords2.values()))
# order = coo[:, 0].argsort()
# countWords2 = coo_matrix((data[order] , (coo[order, 0], coo[order, 1])), shape = (nbVocab, nbTopics)).toarray()
# countWords2

In [23]:
# àà

In [24]:
def initCountWordsAll():
    """Merge the per-partition word-topic counts into the global matrix.

    Reads ``matrix/countWords/words__<ind>__`` for every partition, adds the
    rows into a ``(len(vocabAll), nbTopics)`` matrix using the
    (local id, global id) pairs stored in ``vocabs[ind]``, and writes the
    result to ``matrix/countWords/words_all``.

    Each per-partition file is then overwritten with the rows of the global
    matrix for that partition's words, because the freshly initialised local
    counts ignore documents held by other partitions.
    """
    countWords = np.zeros((len(vocabAll), nbTopics))
    tables = []
    for ind in range(nbPartitions):
        # Columns: local id, global id. reshape keeps an empty vocabulary 2-D.
        table = np.array(list(vocabs[ind].values()), dtype=int).reshape(-1, 2)
        # Sort by local id (as updateCountWordsAll does) so row i of the
        # matrix written below is local word i whatever the dict order is.
        table = table[np.argsort(table[:, 0])]
        tables.append(table)

        countWords_ind = load("matrix/countWords/words__%04d__"%ind)
        # Global ids are unique within one partition, so += sees no duplicates.
        countWords[table[:,1]] += countWords_ind[table[:,0]]
    dump(countWords , "matrix/countWords/words_all")

    # Now update each per-partition matrix to its correct state.
    for ind, table in enumerate(tables):
        dump(countWords[table[:,1]], "matrix/countWords/words__%04d__"%ind)
    
    
def updateCountWordsAll():
    """Apply every partition's word-topic deltas to the global matrix.

    Loads ``matrix/countWords/words_all``, adds the rows of
    ``matrix/deltaWords/deltas__<ind>__`` at the global ids listed in
    ``vocabs[ind]``, saves the global matrix back, then rewrites each
    ``matrix/countWords/words__<ind>__`` with the global rows of that
    partition's words ordered by local id.
    """
    globalCounts = load( "matrix/countWords/words_all")

    for part in range(nbPartitions):
        partDeltas = load("matrix/deltaWords/deltas__%04d__"%part)
        # Columns of the lookup: local id, global id.
        lookup = np.array(list(vocabs[part].values()))
        localIds, globalIds = lookup[:, 0], lookup[:, 1]
        globalCounts[globalIds] += partDeltas[localIds]
    dump(globalCounts , "matrix/countWords/words_all")

    # The local matrices are stale: they miss the other partitions' changes.
    for part in range(nbPartitions):
        lookup = np.array(list(vocabs[part].values()))
        byLocalId = lookup[np.argsort(lookup[:, 0])]
        refreshed = globalCounts[byLocalId[:, 1]]
        dump(refreshed, "matrix/countWords/words__%04d__"%part)

In [25]:
# corpus.map(lambda x : (len(x[1]) == len(np.unique(x[1])))*1).collect() #check if unique

In [26]:
# freq = corpus.flatMap(lambda x : [(w, 1) for w in x[1]]).countByKey()
# sum(freq.values())

In [27]:
vocabAll

{'abdomin': 0,
 'aber': 1,
 'abhiingig': 2,
 'abil': 3,
 'abl': 4,
 'abnorm': 5,
 'abolish': 6,
 'absenc': 7,
 'absent': 8,
 'absolut': 9,
 'absorb': 10,
 'absorpt': 11,
 'abund': 12,
 'academ': 13,
 'academi': 14,
 'acaij': 15,
 'acceler': 16,
 'accept': 17,
 'access': 18,
 'accesscontrol': 19,
 'accomplish': 20,
 'accord': 21,
 'account': 22,
 'accumul': 23,
 'accur': 24,
 'accuraci': 25,
 'acet': 26,
 'achiev': 27,
 'acid': 28,
 'acidif': 29,
 'acknowledg': 30,
 'acquir': 31,
 'acquisit': 32,
 'across': 33,
 'act': 34,
 'action': 35,
 'activ': 36,
 'acut': 37,
 'adapt': 38,
 'adc': 39,
 'add': 40,
 'addit': 41,
 'address': 42,
 'adequ': 43,
 'adher': 44,
 'adhes': 45,
 'adipos': 46,
 'adjust': 47,
 'adjuv': 48,
 'administ': 49,
 'admiss': 50,
 'adopt': 51,
 'adrenalin': 52,
 'adult': 53,
 'advanc': 54,
 'advantag': 55,
 'advers': 56,
 'advisor': 57,
 'aemail': 58,
 'aep': 59,
 'aerial': 60,
 'aethiop': 61,
 'affect': 62,
 'affin': 63,
 'african': 64,
 'ag': 65,
 'agalnst': 66,
 'aga

In [28]:
vocabs = [load("matrix/vocabulary/vocab__%04d__"%ind) for ind in range(nbPartitions)] 
docs = [load("matrix/docsMap/docs__%04d__"%ind) for ind in range(nbPartitions)] 

In [29]:
# Merge the per-partition docId -> local-index maps into one dict and record
# how many documents each partition file contains.
# NOTE(review): the values in `docsAll` are indices local to a partition, so the
# same index can appear for documents of different partitions.
docsAll ={}
nbDocs = []
for  ind in range(nbPartitions):
    d = load("matrix/docsMap/docs__%04d__"%ind)
    docsAll.update(d)
    nbDocs.append(len(d))

In [30]:
nbDocs

[21, 13, 24, 20]

In [31]:
def encode(ind, part):
    """Replace document ids and words by their partition-local integer ids.

    Parameters
    ----------
    ind : int
        Partition index supplied by ``mapPartitionsWithIndex``; not used for
        the lookups (see below).
    part : iterable of ``(key, (docId, tokens))``

    Yields
    ------
    tuple
        ``(local doc index, array of local word ids)`` for every record.
    """
    for x in part :
        key, el = x[0], x[1]
        # Both lookup tables were written keyed by the record key (see
        # makeVocabulary / makeDocsMap), so index both by that key instead of
        # mixing it with the partition index.
        docLocalIds = docs[key]
        wordLocalIds = vocabs[key]
        yield (docLocalIds[el[0]], np.array([wordLocalIds[w][0] for w in el[1]], dtype=int) )

In [32]:
corpus2 = corpus2.mapPartitionsWithIndex(encode)

In [33]:
# Model size: number of topics and size of the global vocabulary.
nbTopics = 6
nbVocab = len(vocabAll)
# NOTE(review): absolute, machine-specific path; in the visible cells it is only
# referenced from commented-out code.
mainPath = "/home/nerk/Documents/3A_ENSAE"

In [34]:
# def setNullDocCounts(idPartition):
#     countDocs = np.zeros((nbDocs, nbTopics))
#     file = mainPath + "/mapReduceLda/matrix/countDocs/countDocs__%d__"%idPartition
#     with open(file, "wb") as f :
#         pkl.dump(countDocs,f)
        
# def setNullWordCounts():
#     file = mainPath + "/mapReduceLda/matrix/countWords/countWords"
#     countWords = np.zeros((nbVocab, nbTopics))
#     with open(file, "wb") as f :
#         pkl.dump(countWords,f)

In [35]:
def getPartitionId(ind, part):
    """Exhaust the partition iterator `part` and return ``[ind]``."""
    for _ in part :
        continue
    return [ind]

def getPartDocsVocab(ind, it):
    """Summarise one partition as ``[(ind, number of records, vocabulary size)]``.

    The vocabulary size is the number of distinct values found in the second
    field of the records of `it`.
    """
    seenWords = np.empty(0)
    nRecords = 0
    for nRecords, record in enumerate(it, start=1) :
        seenWords = np.union1d(seenWords, record[1])
    return  [(ind, nRecords, len(seenWords))]

In [36]:
PartDocsVocab = corpus2.mapPartitionsWithIndex(getPartDocsVocab).collect()
PartDocsVocab

[(0, 21, 1044), (1, 13, 732), (2, 24, 1098), (3, 20, 1008)]

In [37]:
nbDocs = [v[1] for v in PartDocsVocab]
nbDocs

[21, 13, 24, 20]

In [38]:
nbVocabs = [v[2] for v in PartDocsVocab]
nbVocabs

[1044, 732, 1098, 1008]

In [39]:
def loadWordCounts():
    """Unpickle and return the object stored in ``matrix/countWords/countWords``.

    NOTE(review): the other cells write word counts under
    ``matrix/countWords/words_all`` and ``words__<ind>__``; confirm this path
    is still produced somewhere.
    """
    with open("matrix/countWords/countWords", "rb") as handle :
        return pkl.load(handle)

def loadDocCounts(nbPartition):
    """Unpickle and return ``matrix/countDocs/countDocs__<nbPartition>__``.

    NOTE(review): the other cells write document counts under
    ``matrix/countDocs/docs__<ind>__`` (zero-padded); confirm this path is
    still produced somewhere.
    """
    target = "matrix/countDocs/countDocs__%d__"%nbPartition
    with open(target, "rb") as handle :
        return pkl.load(handle)

In [40]:
# Attach a random initial topic (an integer in range(nbTopics)) to every token:
# records become (local doc index, local word ids, topic assignments).
corpTop = corpus2.map(lambda x : (x[0], x[1], np.random.choice(nbTopics, len(x[1]) ))).cache()

In [41]:
corpTop.take(1)

[(0, array([ 762,  342,  668,  534,  784,   41,  602,  140,  459,  323,  634,
          691,  359,  453,   15,  925,  390,  812,  502,  791,  343,  265,
          310,  856,  442,  457,  825,  867,  317, 1023,  932,  690,   39,
          682,  578,  761,  421,    5,  702]), array([0, 0, 3, 4, 0, 3, 3, 2, 4, 4, 5, 3, 2, 0, 2, 1, 5, 0, 4, 5, 0, 5,
         2, 2, 0, 0, 1, 2, 1, 2, 4, 5, 5, 0, 5, 2, 0, 0, 2]))]

In [42]:
corpTop.mapPartitionsWithIndex(saveByPartition).collect()

[]

In [43]:
nbDocs

[21, 13, 24, 20]

In [44]:
def initDocCounts(ind, part):
    """Build and pickle the document-topic count matrix of partition `ind`.

    Every record of `part` is ``(local doc index, word ids, topic ids)``.
    Row ``local doc index`` of the ``(nbDocs[ind], nbTopics)`` matrix receives
    the number of tokens assigned to each topic. The matrix is written to
    ``matrix/countDocs/docs__<ind>__``.

    Returns an empty list so the function fits ``mapPartitionsWithIndex``.
    """
    docTopic = np.zeros((nbDocs[ind], nbTopics))
    for record in part :
        docTopic[record[0]] = np.bincount(record[2], minlength=nbTopics)

    with open("matrix/countDocs/docs__%04d__"%ind, "wb") as out :
        pkl.dump(docTopic, out)
    return []


def initWordCounts(ind, part):
    """Build and pickle the local word-topic count matrix of partition `ind`.

    Every record of `part` is ``(local doc index, word ids, topic ids)``.
    The ``(nbVocabs[ind], nbTopics)`` matrix counts how many tokens of each
    local word are assigned to each topic, and is written to
    ``matrix/countWords/words__<ind>__``.

    Returns an empty list so the function fits ``mapPartitionsWithIndex``.
    """
    countWords = np.zeros((nbVocabs[ind], nbTopics))

    for el in part :
        # np.add.at accumulates repeated (word, topic) pairs; a plain
        # `countWords[el[1], el[2]] += 1` would count each distinct pair only
        # once per document.
        np.add.at(countWords, (el[1], el[2]), 1)

    file = "matrix/countWords/words__%04d__"%ind
    with open(file, "wb") as f :
        pkl.dump(countWords,f)
    return []

In [45]:
corpTop.mapPartitionsWithIndex(initDocCounts).collect()

[]

In [46]:
corpTop.mapPartitionsWithIndex(initWordCounts).collect()

[]

In [47]:
initCountWordsAll()

In [48]:
countWords = load("matrix/countWords/words_all")
countWords[:10]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 2., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [49]:
freq = corpus.flatMap(lambda x : [(w, 1) for w in x[1]]).countByKey()
sum(freq.values())

5170

In [50]:
assert int(countWords.sum()) == sum(freq.values())

In [51]:
# updateCountWordsAll()

In [52]:
countWords = load("matrix/countWords/words_all")
countWords[:10]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 2., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [53]:
u = [ 104,  168,  588,  723,  990,  998, 1764, 1907, 2140, 2315, 2363,
        2364, 2484, 2573, 2856, 3037, 3059]

In [54]:
nbDocs

[21, 13, 24, 20]

In [55]:
# corpTop.glom().map(len).collect()

In [57]:
# Smoothing terms used by the sampler: `alpha` is added to the document-topic
# counts and `beta` to the word-topic counts when computing `proba`.
# NOTE(review): 100 is large compared with the counts displayed in this
# notebook, so these terms presumably dominate the sampling weights — confirm
# the values are intended.
alpha = 100
beta = 100

In [58]:
def pldaMap0(ind, part):
    """Run one Gibbs sweep over the records of partition `ind` (generator).

    Loads the partition's word-topic and document-topic matrices, then for
    every token removes its current topic from the counts, draws a new topic
    with probability proportional to
    ``(doc counts + alpha) * (word counts + beta) / (topic totals + nbVocab*beta)``
    and adds it back.

    Yields ``(doc index, word ids, new topic ids)`` per record. Once `part`
    is exhausted, the accumulated word-topic changes are pickled to
    ``matrix/deltaWords/deltas__<ind>__``. The document-topic matrix is not
    written back by this function.
    """
    wordTopic = load("matrix/countWords/words__%04d__"%ind)
    docTopic = load("matrix/countDocs/docs__%04d__"%ind)
    wordDeltas = np.zeros(wordTopic.shape)
    topicTotals = wordTopic.sum(0)
    for record in part :
        docRow = docTopic[record[0]]
        sampled = []
        for word, oldTopic in zip(record[1], record[2]) :
            # Take the token out of every count before resampling it.
            docRow[oldTopic] -= 1
            wordDeltas[word, oldTopic] -= 1
            wordTopic[word, oldTopic] -= 1
            topicTotals[oldTopic] -= 1

            weights = (docRow + alpha)*(wordTopic[word] + beta)/(topicTotals + nbVocab*beta)
            newTopic = np.random.choice(nbTopics, p =weights/weights.sum())

            # Put the token back under its freshly drawn topic.
            docRow[newTopic] += 1
            wordDeltas[word, newTopic] += 1
            wordTopic[word, newTopic] += 1
            topicTotals[newTopic] += 1

            sampled.append(newTopic)

        docTopic[record[0],:] = docRow
        yield (record[0],record[1], np.array(sampled))

    dump(wordDeltas, "matrix/deltaWords/deltas__%04d__"%ind )

In [59]:
def pickleLoader(pklFile):
    """Yield, one after the other, every object pickled in the open file.

    `pklFile` is a binary file object positioned at the start of a pickle
    record; iteration ends quietly when ``pkl.load`` reports end of file.
    """
    while True:
        try:
            yield pkl.load(pklFile)
        except EOFError:
            return

In [60]:
def plda_one(batchstr, countDocs, countWords,deltaWords, sumWordTopics):
    """Resample the topics of every record stored in one batch file.

    Parameters
    ----------
    batchstr : str
        Path of a batch file holding pickled
        ``(doc index, word ids, topic ids)`` records.
    countDocs, countWords, deltaWords, sumWordTopics : numpy arrays
        Document-topic counts, word-topic counts, accumulated word-topic
        changes and per-topic totals. All four are updated in place.

    The records, with their new topics, are written to
    ``<batchstr>__temp__`` which then replaces the batch file.

    Raises
    ------
    ValueError
        If anything fails while resampling; the original exception is
        attached as ``__cause__``. The batch file is left untouched then.
    """
    temp = batchstr + "__temp__"
    with open(batchstr, "rb" ) as f1, open(temp, "wb") as f2 :
        try:
            for el in pickleLoader(f1) :
                cDoc = countDocs[el[0]]
                tnews = []
                for w,t in zip(el[1], el[2]) :
                    # Take the token out of every count before resampling it.
                    cDoc[t] -= 1
                    deltaWords[w,t] -= 1
                    countWords[w,t] -= 1
                    sumWordTopics[t] -= 1

                    proba = (cDoc + alpha)*(countWords[w] + beta)/(sumWordTopics + nbVocab*beta)

                    tnew = np.random.choice(nbTopics, p =proba/proba.sum())

                    # Put the token back under its freshly drawn topic.
                    cDoc[tnew] += 1
                    deltaWords[w,tnew] += 1
                    countWords[w,tnew] += 1
                    sumWordTopics[tnew] += 1

                    tnews.append(tnew)

                countDocs[el[0],:] = cDoc
                pkl.dump((el[0],el[1], np.array(tnews)), f2)
        except Exception as exc :
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # through; `from exc` keeps the real cause in the traceback.
            raise ValueError("************  jjjj  Something went wrong in plda_one") from exc

    # Single atomic step: the batch file is never missing, unlike a
    # remove-then-rename sequence interrupted half-way.
    os.replace(temp, batchstr)

In [61]:
json.dumps({"a": 1, "b":[1,2]})

'{"a": 1, "b": [1, 2]}'

In [62]:
def makeConfig( **kwargs):
    """Create (or overwrite) a JSON config file holding exactly `kwargs`.

    ``kwargs["id"]`` selects the file: an int gives
    ``configs/config__<id>__`` (zero-padded to 4 digits), anything else gives
    ``configs/config__all__``.

    Raises
    ------
    KeyError
        If no ``id`` keyword is given.
    """
    id_ = kwargs["id"]
    # NOTE(review): progress trace; it produces one output line per call.
    print(id_)
    if isinstance(id_,int):
        file = "configs/config__%04d__"%id_
    else :
        file = "configs/config__all__"

    # Other processes poll this file. Write a private temp file and swap it in
    # atomically so a reader never sees a truncated or half-written JSON.
    tmp = "%s.tmp.%d"%(file, os.getpid())
    try:
        with open(tmp, "w") as f :
            json.dump(kwargs, f)
        os.replace(tmp, file)
    except BaseException:
        if os.path.exists(tmp):
            os.remove(tmp)
        raise
        
def updateConfig( **kwargs):
    """Merge `kwargs` into an existing JSON config file.

    ``kwargs["id"]`` selects the file: an int gives
    ``configs/config__<id>__`` (zero-padded to 4 digits), anything else gives
    ``configs/config__all__``. Keys already in the file and absent from
    `kwargs` are kept.

    Raises
    ------
    KeyError
        If no ``id`` keyword is given.
    FileNotFoundError
        If the config file does not exist yet.
    """
    id_ = kwargs["id"]
    if isinstance(id_,int):
        file = "configs/config__%04d__"%id_
    else :
        file = "configs/config__all__"

    with open(file, "r") as f:
        config = json.load(f)

    config.update(kwargs)

    # Other processes poll this file. Write a private temp file and swap it in
    # atomically so a reader never sees a truncated or half-written JSON.
    tmp = "%s.tmp.%d"%(file, os.getpid())
    try:
        with open(tmp, "w") as f :
            json.dump(config, f)
        os.replace(tmp, file)
    except BaseException:
        if os.path.exists(tmp):
            os.remove(tmp)
        raise

In [63]:
import time

In [64]:
def get_now():
    """Return the current local date and time as text, date and time separated by a space."""
    current = datetime.now()
    return current.isoformat(sep=" ")

In [65]:
get_now()

'2019-02-03 23:58:32.065096'

In [66]:
def pldaMap(ind, nrounds):
    """Worker loop: run `nrounds` Gibbs sweeps over partition `ind`.

    Each round:

    1. marks the worker "busy" in ``configs/config__<ind>__``;
    2. loads the partition's word-topic and document-topic matrices and
       resamples every batch file of ``matrix/corpusTopic/partition__<ind>__/``
       with ``plda_one``;
    3. writes the accumulated word-topic changes to
       ``matrix/deltaWords/deltas__<ind>__`` and the updated document-topic
       matrix back to ``matrix/countDocs/docs__<ind>__``;
    4. marks the worker "free" and polls ``configs/config__all__`` until the
       master reports an update newer than the end of this round.

    Raises
    ------
    ValueError
        If the master does not report an update within 60 seconds.
    """
    wordsFile = "matrix/countWords/words__%04d__"%ind
    docsFile = "matrix/countDocs/docs__%04d__"%ind
    root = "matrix/corpusTopic/partition__%04d__/"%ind
    timeout = 60
    poll = 1

    rounds = 0
    while rounds < nrounds :

        makeConfig(id = ind, state = "busy", time = get_now())

        countWords = load(wordsFile)
        countDocs = load(docsFile)
        deltaWords = np.zeros(countWords.shape)
        # NOTE(review): this sums only the rows of this partition's words while
        # the sampler's denominator uses the global `nbVocab` — confirm the
        # per-topic totals are not meant to come from the global matrix.
        sumWordTopics = countWords.sum(0)

        # Skip temp files left behind by an interrupted plda_one; sorted() makes
        # the processing order independent of the directory listing order.
        files = sorted(file for file in os.listdir(root) if "temp" not in file)
        for  file in files :
            plda_one(root + file, countDocs, countWords, deltaWords, sumWordTopics)

        dump(deltaWords, "matrix/deltaWords/deltas__%04d__"%ind )
        # Persist the document-topic counts: they are reloaded from disk at the
        # start of every round and must match the topics now stored in the
        # batch files.
        dump(countDocs, docsFile)

        rounds += 1

        now = get_now()
        updateConfig(id = ind, state = "free", time = now)

        waited = 0
        while waited < timeout :
            try:
                with open("configs/config__all__", "r") as f:
                    master = json.load(f)
            except ValueError:
                # The master may be rewriting the file right now; poll again.
                master = None

            if master is not None and master["countWordsUpdated"][str(ind)] and master["time"] >  now:
                break

            t0 = time.time()
            time.sleep(poll)
            waited += time.time() - t0

        if waited >= timeout :
            raise ValueError("Timeout : updates last too much to be done, waited %f s yet now, "%waited)

In [67]:
# params = spark.sparkContext.parallelize([(ind, 10) for ind in range(nbPartitions)])
# params = params.partitionBy(nbPartitions)
# params.glom().collect()

In [68]:
params = spark.sparkContext.parallelize([(ind, 10) for ind in range(nbPartitions)])
params = params.partitionBy(nbPartitions)

In [69]:
params.getNumPartitions()

4

In [70]:
from multiprocessing import Process, Queue

In [72]:
corpTop.mapPartitionsWithIndex(saveByPartition).collect()

corpTop.mapPartitionsWithIndex(initDocCounts).collect()
corpTop.mapPartitionsWithIndex(initWordCounts).collect()

initCountWordsAll()

countWords = load("matrix/countWords/words_all")
countWords[:10]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 2., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [73]:
# Create the master config polled by the workers: one "countWordsUpdated" flag
# per partition (all False to start with) plus the current time stamp.
makeConfig(id = "all", countWordsUpdated = {str(ind):False for ind in range(nbPartitions)}, time = get_now())

all


In [74]:
def supervise(nrounds):
    """Start one `pldaMap` worker per partition and merge their results.

    The supervisor polls the workers' config files once per second. When
    every worker is "free" with a time stamp that has not been merged yet,
    it calls ``updateCountWordsAll`` and publishes the update in
    ``configs/config__all__``. It stops after `nrounds` merges.

    Raises
    ------
    ValueError
        If a worker process dies before the last merge (all workers are
        killed first).
    """
    def readWorkerConfig(id_):
        # Returns None while the config is missing or being rewritten.
        try:
            with open("configs/config__%04d__"%id_, "r") as f :
                return json.load(f)
        except (FileNotFoundError, ValueError):
            return None

    # Time stamps already merged, per worker. Seeded with whatever is on disk
    # so a "free" state left by an earlier run is not taken for a new round.
    lastMerged = {}
    for id_ in range(nbPartitions):
        slave = readWorkerConfig(id_)
        lastMerged[id_] = slave.get("time") if slave else None

    processes = [Process(target= pldaMap, args = (ind, nrounds)) for ind in range(nbPartitions)]
    if  __name__ == "__main__":
        # Run processes
        for p in processes:
            p.start()

    count = 0
    while count < nrounds :
        stamps = {}
        for id_ in range(nbPartitions):
            slave = readWorkerConfig(id_)
            # A worker only counts once it finished a round whose deltas have
            # not been merged; otherwise the same deltas would be added again
            # while it is still waiting for the previous update.
            if slave is None or slave.get("state") != "free" or slave.get("time") == lastMerged[id_]:
                break
            stamps[id_] = slave["time"]
        allFree = len(stamps) == nbPartitions

        if allFree :
            updateCountWordsAll()
            updateConfig(id = "all",
                         countWordsUpdated = {str(ind):True for ind in range(nbPartitions)}, time = get_now() )
            lastMerged = stamps
            count += 1

        # After the last merge the workers exit on their own; that is not a
        # failure, so only check liveness while rounds remain.
        if count < nrounds and not all([p.is_alive() for  p in processes]) :
            for p in processes :
                if p.is_alive():
                    p.kill()
            raise ValueError("Some process is died !!!!!!!!!!!!")
        time.sleep(1)

In [75]:
# Run the supervisor (150 merge rounds) in a background process so the notebook
# kernel stays responsive; `master.is_alive()` tells whether it is still running.
master = Process(target = supervise, args = [150] )
master.start()

0
1
2
3
3
1
0
2
1
3
0
2
1
3
0
2
1
3
0
2
1
3
0
2
1
3
0
2
2
1
3
0
0
2
1
3
3
0
2
1
3
0
2
1
3
0
2
1
3
0
2
1
3
0
2
1
1
3
0
2
1
3
0
2
2
1
3
0
0
2
1
3
0
2
1
3
0
2
1
3
3
0
1
2
3
0
1
2
3
0
1
2
2
3
0
1
2
3
1
0
2
3
1
0
1
0
2
3
1
0
2
3
1
0
2
3
1
0
2
3
3
1
0
2
3
1
0
2
3
1
0
2
2
3
1
0
2
3
1
0
0
2
3
1
0
2
3
1
0
2
1
3
0
2
1
3
3
0
2
1
3
0
2
1
1
3
0
2
1
3
0
2
0
2
1
3
0
2
1
3
0
1
2
3
1
0
2
3
1
0
2
3
1
2
0
3
0
2
3
1
0
2
3
1
0
3
2
1
0
3
2
1
1
0
3
2
1
3
0
2
2
1
3
0
3
0
2
1
3
0
1
2
3
0
1
2
3
1
0
2
3
1
0
2
3
1
0
2
1
3
0
2
0
2
1
3
0
2
1
3
3
0
2
1
3
0
2
1
1
3
0
2
1
3
0
2
1
3
0
2
2
1
3
0
0
2
1
3
0
2
1
3
0
2
1
3
0
2
1
3
0
1
2
3
3
1
0
2
3
1
0
2
2
3
1
0
2
0
3
1
0
2
3
1
1
0
2
3
1
0
2
3
1
0
2
3
1
0
2
3
1
0
2
3
3
2
1
0
0
3
2
1
0
3
2
1
0
3
2
1
0
3
1
2
0
1
3
2
0
1
3
2
2
0
1
3
3
2
0
1
1
3
2
0
1
3
2
0
1
3
2
0
0
1
3
2
0
1
3
2
0
1
3
2
2
1
0
3
2
1
0
3
3
2
1
0
3
2
1
0
0
3
2
1
0
3
2
1
0
3
2
1
0
3
2
1
1
0
3
2
2
1
0
3
2
1
0
3
0
3
1
2
3
0
1
2
0
3
1
2
0
1
3
2
1
0
3
2
1
0
3
2
2
1
0
3
0
3
2
1
0
3
2
1
0
3
2
1
0
3
2
1
1
0
3
2
1
0
3
2


In [110]:
master.is_alive()

False

In [111]:
# master.terminate()

In [112]:
# for p in processes :
#     print(p.is_alive())

In [113]:
countWords = load("matrix/countWords/words_all")
countWords[:10]

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 1., 2.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]])

In [158]:
nbDocs

[21, 13, 24, 20]

In [160]:
cl = 1
subdoc = 2
countDocs = load("matrix/countDocs/docs__%04d__"%subdoc)
countDocs[:10]

array([[ 8.,  8.,  5.,  7., 10.,  9.],
       [19., 19., 10., 18., 13., 17.],
       [ 8., 10., 12.,  9.,  9., 11.],
       [13., 14., 14., 14., 15., 14.],
       [12., 12.,  6.,  9.,  8.,  8.],
       [ 5.,  7.,  4.,  5., 12.,  6.],
       [20., 17., 13., 14., 12., 15.],
       [12., 13., 10., 18., 17., 17.],
       [15., 13., 18., 18., 15., 21.],
       [11., 10., 15., 18., 13.,  9.]])

In [161]:
topics = countDocs.argmax(1)

In [162]:
v = np.where(topics == cl)

In [163]:
countDocs[v]

array([[ 7., 17.,  4.,  2.,  8.,  7.],
       [22., 27., 16., 14., 26., 22.],
       [14., 23., 18., 17., 11., 11.],
       [11., 13.,  9., 10.,  5., 10.]])

In [164]:
# docs[cl]

In [165]:
dks = np.array( list(docs[subdoc].items()))

In [166]:
cluster = dks[v]
cluster

array([['3016c9b9e3aeafe6fc93558316b472953d8b5bbf', '11'],
       ['88092fb7437b2f2afd8cd7643a8fd1377d6764fd', '16'],
       ['6374215829deb1843e01716dfa815404016602b2', '19'],
       ['630cd25ecf943ac9c1c0cec3b862e8cac5875f76', '20']], dtype='<U40')

In [167]:
corpus.filter(lambda x : np.isin(x[0], cluster[:, 0])).collect()

[('3016c9b9e3aeafe6fc93558316b472953d8b5bbf',
  array(['penetr', 'main', 'account', 'theori', 'accord', 'winter',
         'critic', 'lower', 'effect', 'valu', 'invers', 'electron',
         'featur', 'follow', 'air', 'depend', 'flayer', 'temperatur',
         'success', 'data', 'frequenc', 'height', 'variat', 'elay',
         'pressur', 'regular', 'densiti', 'explain', 'sun', 'simpl',
         'determin', 'annual', 'summer', 'km', 'wherea', 'noon', 'observ',
         'maximum', 'appleton', 'show', 'limit', 'photoion', 'high',
         'accumul', 'higher'], dtype='<U10')),
 ('88092fb7437b2f2afd8cd7643a8fd1377d6764fd',
  array(['perfusionbas', 'therefor', 'atherosclerosi', 'rest', 'find',
         'avail', 'addit', 'viabl', 'emiss', 'though', 'definit',
         'prognost', 'ischem', 'furthermor', 'often', 'anatom', 'may',
         'process', 'appear', 'comput', 'even', 'mild', 'threshold',
         'random', 'limit', 'individu', 'poor', 'end', 'result', 'moder',
         'suffici', 'ct

In [168]:
dt = data.map(lambda x : json.loads(x)).filter(lambda x : np.isin(x["id"], cluster[:,0])).cache()

In [169]:
dt.collect()

[{'entities': ['Electron', 'Tree accumulation', 'anatomical layer'],
  'journalVolume': '139',
  'journalPages': '328-329',
  'pmid': '',
  'year': 1937,
  'outCitations': [],
  's2Url': 'https://semanticscholar.org/paper/3016c9b9e3aeafe6fc93558316b472953d8b5bbf',
  's2PdfUrl': '',
  'id': '3016c9b9e3aeafe6fc93558316b472953d8b5bbf',
  'authors': [{'name': 'Leiv Harang', 'ids': ['49288990']}],
  'journalName': 'Nature',
  'paperAbstract': 'THE successive accumulation of data on the critical frequencies, that is, the limiting penetrating frequencies, of the E- and F-layers determined from noon observations show the following main features1: the critical frequencies of the E-layer show a regular annual variation depending on the height of the sun, whereas the critical frequencies of the F2-layer show an inverse annual variation, with high values during winter and lower values in summer. Appleton2 has explained this as a temperature effect, the density of the air at 200–400 km. in winter b

In [87]:
np.where(np.any(countWords < 0, 1))

(array([], dtype=int64),)

In [88]:
countWords[countWords < 0]

array([], dtype=float64)

In [89]:
# # the old plda
# corpTop.mapPartitionsWithIndex(saveByPartition).collect()

# corpTop.mapPartitionsWithIndex(initDocCounts).collect()
# corpTop.mapPartitionsWithIndex(initWordCounts).collect()
# initCountWordsAll()
# countWords = load("matrix/countWords/words_all")
# countWords[:10]

In [90]:
# rdd = corpTop

In [91]:
# rdd = rdd.mapPartitionsWithIndex(pldaMap0)
# rdd.collect()

In [92]:
# updateCountWordsAll()

In [93]:
# wordCounts2 = load("matrix/countWords/words_all")

In [94]:
# wordCounts2[wordCounts2 < 0]

In [95]:
# wordCounts2.sum()

In [96]:
# wordCounts2 == wordCounts