In [1]:
from pyspark.sql import SparkSession

In [2]:
import numpy as np, os, shutil, json, time
import pickle as pkl
from datetime import datetime
from nlp import preprocessAndGetTokens
# from paperAbstracts import processPaperAbstract

In [3]:
from fileUtils import load, pickleLoader, dump, saveByPartition

In [4]:
# Local-mode Spark application named "lda". Note that `spark` ends up bound to
# the SparkContext (the trailing `.sparkContext`), not to the SparkSession.
spark = (
    SparkSession.builder
    .master("local")
    .appName("lda")
    .getOrCreate()
    .sparkContext
)

In [5]:
def processPaperAbstract(abstract):
    """Return the tokens of one abstract as a NumPy array.

    Tokenization is delegated to `preprocessAndGetTokens` (imported from `nlp`);
    whatever it returns is wrapped in `np.array` unchanged.
    """
    tokens = preprocessAndGetTokens(abstract)
    return np.array(tokens)

In [6]:
def LoadPaperAbstract(docstr):
    """Parse one JSON-encoded record and return its ``(id, paperAbstract)`` pair.

    Args:
        docstr: a JSON document (one line of the input file) as a string.

    Returns:
        A 2-tuple ``(record["id"], record["paperAbstract"])``.

    Raises:
        json.JSONDecodeError: if ``docstr`` is not valid JSON.
        KeyError: if either the "id" or "paperAbstract" field is missing.
    """
    record = json.loads(docstr)
    paper_id, abstract = record["id"], record["paperAbstract"]
    return paper_id, abstract

In [7]:
# Read the dataset as lines of text and parse each line into an (id, abstract) pair.
data = spark.textFile("sample-S2-records").map(LoadPaperAbstract)

In [8]:
# Peek at the first (id, abstract) pair to confirm the records parsed as expected.
data.take(1) 

[('4cbba8127c8747a3b2cfb9c1f48c43e5c15e323e',
  'Primary debulking surgery (PDS) has historically been the standard treatment for advanced ovarian cancer. Recent data appear to support a paradigm shift toward neoadjuvant chemotherapy with interval debulking surgery (NACT-IDS). We hypothesized that stage IV ovarian cancer patients would likely benefit from NACT-IDS by achieving similar outcomes with less morbidity. Patients with stage IV epithelial ovarian cancer who underwent primary treatment between January 1, 1995 and December 31, 2007, were identified. Data were retrospectively extracted. Each patient record was evaluated to subclassify stage IV disease according to the sites of tumor dissemination at the time of diagnosis. The Kaplan–Meier method was used to compare overall survival (OS) data. A total of 242 newly diagnosed stage IV epithelial ovarian cancer patients were included in the final analysis; 176 women (73%) underwent PDS, 45 (18%) NACT-IDS, and 21 (9%) chemotherapy onl

In [9]:
# Tokenize every abstract (the id key is kept as-is by mapValues), then drop
# the documents that end up with no token at all.
corpus = (
    data
    .mapValues(processPaperAbstract)
    .filter(lambda pair: len(pair[1]) > 0)
)
corpus.take(1)

[('4cbba8127c8747a3b2cfb9c1f48c43e5c15e323e',
  array(['unit', 'analysi', 'compar', 'includ', 'histor', 'decemb',
         'surgeri', 'subclassifi', 'paradigm', 'chemotherapi', 'treatment',
         'diagnosi', 'cancer', 'trend', 'complet', 'januari', 'appear',
         'admiss', 'recent', 'final', 'toward', 'complic', 'result',
         'accord', 'receiv', 'resect', 'ovarian', 'frequenc', 'os',
         'support', 'treat', 'method', 'outcom', 'use', 'versus', 'patient',
         'postop', 'extract', 'evalu', 'time', 'standard', 'diagnos',
         'tumor', 'nactid', 'retrospect', 'achiev', 'overal', 'higher',
         'kaplanmei', 'vs', 'record', 'advanc', 'diseas', 'iv', 'pds',
         'shift', 'site', 'stage', 'identifi', 'neoadjuv', 'intens', 'like',
         'residu', 'epitheli', 'less', 'benefit', 'hypothes', 'total',
         'longer', 'group', 'underw', 'month', 'surviv', 'median',
         'dissemin', 'data', 'signific', 'interv', 'morbid', 'similar',
         'newli', 'frequ

In [10]:
# corpus.map(lambda x: len(x[1])).distinct().collect()

# Partitioning

In [11]:
# Number of partitions the corpus is split into (passed to partitionBy below,
# and used later to load one vocabulary / doc map per partition).
nbPartitions = 10

In [12]:
def randomPartitionner(x, nbPartitions):
    """Pair a record with a randomly drawn partition id.

    Args:
        x: the record to tag; it is returned untouched.
        nbPartitions: number of partitions; the id is drawn with
            ``np.random.choice(nbPartitions)``, i.e. from ``0 .. nbPartitions - 1``.

    Returns:
        A ``(partition_id, x)`` tuple.
    """
    partition_id = np.random.choice(nbPartitions)
    return partition_id, x

In [13]:
# Key every document by a randomly drawn partition id and repartition on that
# key; the result is cached because several later steps reuse it.
corpus2 = (
    corpus
    .map(lambda doc: randomPartitionner(doc, nbPartitions))
    .partitionBy(nbPartitions)
    .cache()
)

In [14]:
# Peek at one record: (partition key, (doc id, tokens)).
corpus2.take(1)

[(0,
  ('cb61fc1ebdeb5835460c18044d331388d5b1067a',
   array(['allopurinol', 'peripher', 'immedi', 'control', 'blood', 'group',
          'anaesthet', 'rat', 'oxygenfre', 'dismutas', 'cardiovascular',
          'resist', 'involv', 'prevent', 'rise', 'improv', 'free', 'scald',
          'burn', 'scaveng', 'catalas', 'signific', 'fall', 'blocker',
          'pressur', 'cent', 'product', 'per', 'oxygen', 'increas',
          'investig', 'rate', 'surfac', 'plus', 'treatment', 'cardiac',
          'mean', 'appli', 'seen', 'arteri', 'caus', 'output', 'superoxid',
          'slight', 'infus', 'pretreat', 'radic', 'affect', 'total', 'area',
          'suggest', 'howev', 'heart', 'injuri', 'bodi', 'lower'],
         dtype='<U14')))]

# Make the vocabularies

In [15]:
from builder  import makeVocabularies, getUniqueWords

In [16]:
# One collection of unique words per partition, gathered on the driver.
uniqueWordsByPartition = corpus2.mapPartitionsWithIndex(getUniqueWords).collect()

# Report how many unique words each partition holds.
print("Number of unique words by partition : %s" % str(list(map(len, uniqueWordsByPartition))))

Number of unique words by partition : [645, 446, 556, 540, 571, 397, 310, 218, 480, 324]


In [17]:
# Per the printed log, this builds one vocabulary per partition plus a global one.
makeVocabularies(uniqueWordsByPartition) # Build and save the vocabularies

Vocabulary 0 successfully built
Vocabulary 1 successfully built
Vocabulary 2 successfully built
Vocabulary 3 successfully built
Vocabulary 4 successfully built
Vocabulary 5 successfully built
Vocabulary 6 successfully built
Vocabulary 7 successfully built
Vocabulary 8 successfully built
Vocabulary 9 successfully built

 Global vocabulary  built too


# Make docMaps

In [18]:
from builder import makeDocsMaps

In [19]:
# Run makeDocsMaps once per partition; collect() forces execution and brings
# back the per-partition status messages shown in the output.
corpus2.mapPartitionsWithIndex(makeDocsMaps).collect()

['docMap 0 successfully built',
 'docMap 1 successfully built',
 'docMap 2 successfully built',
 'docMap 3 successfully built',
 'docMap 4 successfully built',
 'docMap 5 successfully built',
 'docMap 6 successfully built',
 'docMap 7 successfully built',
 'docMap 8 successfully built',
 'docMap 9 successfully built']

# Test if vocabularies and docMaps are correctly built

In [20]:
# Sanity check: the saved global vocabulary can be loaded; report its size.
vocabSize = len(load("matrix/vocabulary/vocabAll"))
print("vocabSize : %d "%vocabSize)

vocabSize : 2701 


In [21]:
# Sanity check: the doc map saved for partition 0 can be loaded; report how many docs it holds.
ndocs00 = len(load("matrix/docsMap/docs__0000__"))
print("Number of docs in docs00 : %d"%ndocs00)

Number of docs in docs00 : 10


# As the vocabularies & docMaps were built correctly, let's load them

In [22]:
# Global vocabulary (same file as the one used for the vocabSize check above).
vocabAll = load("matrix/vocabulary/vocabAll")

# Per-partition vocabularies, in partition order: vocabs[i] belongs to partition i.
vocabs = [load("matrix/vocabulary/vocab__%04d__"%ind) for ind in range(nbPartitions)] 


In [23]:
from builder import loadDocsAll
# Doc maps for all partitions, as assembled by builder.loadDocsAll.
docsAll = loadDocsAll(nbPartitions)

# Per-partition doc maps, in partition order: docs[i] belongs to partition i.
docs = [load("matrix/docsMap/docs__%04d__"%ind) for ind in range(nbPartitions)] 

In [24]:
# Number of documents and vocabulary size for each partition, in partition order.
nbDocs = [len(docMap) for docMap in docs]
nbVocabs = [len(vocab) for vocab in vocabs]

# Encode corpus : using ids instead of doc full text

In [25]:
from builder  import encode

In [26]:
# Records before encoding: tokens are still plain strings.
corpus2.take(1)

[(0,
  ('cb61fc1ebdeb5835460c18044d331388d5b1067a',
   array(['allopurinol', 'peripher', 'immedi', 'control', 'blood', 'group',
          'anaesthet', 'rat', 'oxygenfre', 'dismutas', 'cardiovascular',
          'resist', 'involv', 'prevent', 'rise', 'improv', 'free', 'scald',
          'burn', 'scaveng', 'catalas', 'signific', 'fall', 'blocker',
          'pressur', 'cent', 'product', 'per', 'oxygen', 'increas',
          'investig', 'rate', 'surfac', 'plus', 'treatment', 'cardiac',
          'mean', 'appli', 'seen', 'arteri', 'caus', 'output', 'superoxid',
          'slight', 'infus', 'pretreat', 'radic', 'affect', 'total', 'area',
          'suggest', 'howev', 'heart', 'injuri', 'bodi', 'lower'],
         dtype='<U14')))]

In [27]:
# Replace each partition's records with their encoded form, using the
# per-partition doc maps and vocabularies (the resulting integer ids are visible
# in corpTop.take(1) further down).
# NOTE(review): this rebinds corpus2 to a transformation of itself, so re-running
# the cell would apply encode a second time to already-encoded records.
corpus2 = corpus2.mapPartitionsWithIndex(lambda ind, part : encode(ind, part, docs, vocabs))

# End data preparation by adding random topics and saving 

In [28]:
# Number of topics; initial topic assignments are drawn from 0 .. nbTopics - 1.
nbTopics = 5

In [29]:
# Append to every record an array of randomly drawn topic ids (0 .. nbTopics - 1),
# one per entry of the record's encoded word array. No seed is set, so the
# initial assignment differs from run to run.
corpTop = corpus2.map(
    lambda doc: (doc[0], doc[1], np.random.choice(nbTopics, len(doc[1])))
).cache()

In [30]:
# collect() forces the job to run; the collected result is empty (see output).
corpTop.mapPartitionsWithIndex(saveByPartition).collect() # Save batches by partition

[]

In [31]:
# Peek at one record: (first field, encoded word ids, topic assignments).
corpTop.take(1)

[(0, array([ 26, 419, 289, 136,  75, 269,  30, 475, 406, 180,  89, 497, 313,
         448, 507, 293, 250, 513,  80, 516,  90, 540, 236,  74, 446,  98,
         455, 416, 405, 295, 312, 476, 580, 427, 611,  88, 349,  40, 523,
          46,  92, 402, 577, 550, 300, 447, 471,  21, 604,  43, 573, 283,
         275, 304,  76, 338]), array([2, 3, 4, 1, 4, 3, 2, 4, 0, 0, 1, 4, 3, 1, 1, 2, 4, 4, 2, 4, 3, 4,
         3, 2, 0, 1, 0, 2, 3, 3, 4, 0, 4, 0, 1, 0, 3, 3, 3, 3, 0, 2, 3, 2,
         1, 1, 0, 1, 0, 1, 0, 2, 4, 4, 2, 0]))]

In [32]:
# Development aid: re-import fileUtils after editing it on disk.
# NOTE(review): names previously bound with `from fileUtils import ...` keep
# pointing at the old objects; only `fileUtils.<name>` sees the reloaded code.
import fileUtils, importlib
importlib.reload(fileUtils)

<module 'fileUtils' from '/home/nerk/Documents/3A_ENSAE/mapReduceLda/fileUtils.py'>

# Here the ML part

# Define some parameters

In [33]:
# Size of the global vocabulary; passed to pldaMap0 in the training loop.
nbVocabAll = len(vocabAll)
# Passed to pldaMap0 as alpha / beta — presumably the LDA Dirichlet priors.
# TODO confirm that 100 is intended; such priors are usually much smaller.
alpha = 100
beta = 100

In [34]:
from builder  import init

In [35]:
# from builder import makeConfig, updateConfig, get_now
# makeConfig(id = "all", countWordsUpdated = {str(ind):False for ind in range(nbPartitions)}, time = get_now())

# Training

In [36]:
from fileUtils import saveAsPickleFile
from model import pldaMap0
from builder import updateCountWordsAll, init

In [37]:
# pldaMap(0, 1, alpha, beta, len(vocabAll), nbTopics)

In [38]:
# Wall-clock start; the reported time covers init() plus all iterations.
t0 = time.time()
rdd = corpTop
# NOTE(review): len(vocabAll) is the same value as nbVocabAll used in the loop below.
init(corpTop, vocabs, nbDocs, nbVocabs, len(vocabAll), nbTopics)
# 500 iterations (hard-coded). Each one maps pldaMap0 over every partition,
# writes the result out, reads it back, then refreshes the global word counts.
for i in range(500):
    rdd = rdd.mapPartitionsWithIndex(lambda ind, part : pldaMap0(ind, part, alpha, beta, nbVocabAll, nbTopics),
                       preservesPartitioning= True )
    # Presumably writes the RDD under "pickle/", the directory read back on the
    # next line — verify in fileUtils.saveAsPickleFile.
    saveAsPickleFile(rdd)
    # NOTE(review): reloading from disk presumably keeps the RDD lineage from
    # growing with every iteration — confirm this is the intent.
    rdd = spark.pickleFile("pickle/")
    updateCountWordsAll()
print("Time : {}".format(time.time() - t0))

Time : 43.2295606136322


# Post-training analysis

In [43]:
# Column (topic) index whose documents are inspected in the following cells.
cl = 0
# Index of the partition whose counts and doc map are inspected.
subdoc = 2
# Per the output: one row per document, one column per topic (nbTopics = 5) —
# presumably document-topic counts; confirm in builder / model.
countDocs = load("matrix/countDocs/docs__%04d__"%subdoc)
countDocs[:10]

array([[ 6.,  7.,  8.,  9., 10.],
       [13., 17., 28., 24., 22.],
       [11., 18., 24., 16., 14.],
       [22., 19., 18., 24., 17.],
       [25., 24., 27., 32., 19.],
       [13., 13.,  9., 20., 15.],
       [15., 16., 18., 13., 15.],
       [13., 11., 12.,  7., 11.]])

In [44]:
# For each row of countDocs, the column holding the largest count.
topics = np.argmax(countDocs, axis=1)
# Row indices whose dominant column is `cl`, then the matching rows.
v = np.where(topics == cl)
countDocs[v]

array([[13., 11., 12.,  7., 11.]])

In [45]:
# Turn the partition's doc map into an array of (key, value) rows; np.array
# converts everything to strings (see dtype '<U40' in the output).
# NOTE(review): indexing with `v` assumes the dict's iteration order matches the
# row order of countDocs — confirm in builder.
dks = np.array( list(docs[subdoc].items()))
cluster = dks[v]
cluster

array([['248647e053c227b6375d119bb16a7508fdfbef2b', '7']], dtype='<U40')

In [46]:
# Fetch the tokenized documents whose id appears in the first column of `cluster`.
corpus.filter(
    lambda record: np.isin(record[0], cluster[:, 0])
).collect()

[('248647e053c227b6375d119bb16a7508fdfbef2b',
  array(['result', 'sea', 'theori', 'wave', 'eastern', 'coast', 'discuss',
         'critic', 'zone', 'indic', 'compar', 'linear', 'propag', 'differ',
         'notic', 'model', 'accuraci', 'shallow', 'employ', 'practic',
         'tsunami', 'south', 'equat', 'neglect', 'depth', 'appropri',
         'nonlinear', 'seismogen', 'would', 'term', 'furthermor', 'verifi',
         'bottom', 'china', 'also', 'appli', 'chines', 'describ', 'applic',
         'forecast', 'influenc', 'region', 'exert', 'characterist', 'ocean',
         'simul', 'paper', 'friction', 'enough', 'water', 'hydrodynam',
         'along', 'potenti', 'base'], dtype='<U12'))]