In [1]:

from pyspark import SparkContext
# Shared Spark context used by every cell below; the master URL "local[4]"
# selects Spark's local mode with 4 worker threads.
sc = SparkContext(master="local[4]")

In [2]:

import re
import numpy as np




# Load the corpus (26,754 documents) as an RDD with one element per line of the file.
corpus = sc.textFile ("pubmed.txt")

# Keep only the lines that carry a document header, i.e. contain 'id='.
validLines = corpus.filter(lambda x : 'id=' in x)

# Turn each line into a (docID, text) pair: the docID is whatever sits between
# 'id=' and the first '> ', the text is everything after that '> '.
keyAndText = validLines.map(lambda x : (x[x.index('id=') + 3 : x.index('> ')], x[x.index('> ') + 2:]))

# Split the text of each (docID, text) pair into a list of words, giving
# (docID, ["word1", "word2", "word3", ...]).
# Every non-letter character is replaced by a space before lower-casing and
# splitting, so digits and punctuation never end up inside a word.
regex = re.compile('[^a-zA-Z]')
keyAndListOfWords = keyAndText.map(lambda x : (str(x[0]), regex.sub(' ', x[1]).lower().split()))

# To find the top 20,000 words, first flatten (docID, ["word1", "word2", ...])
# into one ("word", 1) pair per word occurrence.
allWords = keyAndListOfWords.flatMap(lambda x: ((j, 1) for j in x[1]))

# Sum the ones per word, giving ("word1", 1433), ("word2", 3423423), etc.
allCounts = allWords.reduceByKey (lambda a, b: a + b)

# Bring the 20,000 most frequent words back to the driver as a local list of
# ("word", count) pairs, ordered from most to least frequent.
topWords = allCounts.top (20000, lambda x : x[1])

# Build an RDD of (word, dictNum) pairs, where dictNum is the word's position
# in the dictionary (0 for the most common word, 1 for the next, ...).
# Start from an RDD holding the positions 0 .. len(topWords) - 1. With the full
# corpus this is 0 .. 19,999; sizing it from topWords keeps the lookup below
# in range if the corpus has fewer than 20,000 distinct words.
twentyK = sc.parallelize(range(len(topWords)))

# Transform (0), (1), (2), ... into ("mostcommonword", 0), ("nextmostcommon", 1), ...
# topWords is a local list, so it is captured by the lambda and shipped to the workers.
dictionary = twentyK.map(lambda x: (topWords[x][0], x))


# finally, print out some of the dictionary, just for debugging
# dictionary.collect()


In [56]:
# Pull the (word, dictNum) pairs to the driver and build both lookup tables.
d = dictionary.collect()

wordToIdx = {}   # word -> dictionary position
IdxToWord = {}   # dictionary position -> word
for word, idx in d:
    wordToIdx[word] = idx
    # The reverse mapping belongs in IdxToWord; writing it into wordToIdx
    # would leave IdxToWord empty and mix int keys into wordToIdx.
    IdxToWord[idx] = word

In [375]:
# One (word, docID) pair per word occurrence.
wordDoc = keyAndListOfWords.flatMap(lambda x: ((j, x[0]) for j in x[1]))
# Joining on the word keeps only dictionary words: (word, (dictNum, docID)).
wordDicDoc = dictionary.join(wordDoc)
# Re-key by document: (docID, dictNum).
docDic = wordDicDoc.map(lambda x: (x[1][1], x[1][0]))
# (docID, iterable of dictNum), one entry per occurrence of a dictionary word.
docDicList = docDic.groupByKey()

#the conversion from listOfAllDictonaryPos to NumPy array
def listToArray(docDicList, size=20000):
    """Count how often each dictionary position occurs.

    Args:
        docDicList: iterable of integer dictionary positions, one per word
            occurrence (repeats allowed).
        size: length of the returned vector; defaults to the 20,000-word
            dictionary used throughout the notebook.

    Returns:
        A float NumPy array of length ``size`` whose entry ``i`` is the number
        of times position ``i`` appears in ``docDicList``.
    """
    result = np.zeros(size)
    for i in docDicList:
        result[i] += 1
    return result

# (docID, count vector): one array of per-dictionary-position counts per document.
docDicCount = docDicList.map(lambda x: (x[0], listToArray(x[1])))

In [376]:
# lookup() returns a list of the values stored under the key; take the first
# count vector and print only its non-zero entries.
a1 = docDicCount.lookup('ParasiticDisease/2617030')
print (a1[0][a1[0].nonzero()])

[ 6.  4.  7. 11.  3.  2.  2.  4.  2.  1.  1.  1.  2.  1.  1.  1.  2. 10.
  1.  1.  2.  2.  1.  1.  1.  2.  3.  2.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  2.  1.  1.  2.  1.  1.  1.  1.  1.  1.  1.  2.  2.  1.  4.  2.
  2.  1.  2.  1.  1.  1.  1.  2.  2.  1.  1.  1.  1.  6.  1.  2.  1.  1.
  1.  1.  2.  1.  1.  2.  1.  2.]


In [3]:
# NOTE(review): this cell repeats the pipeline already defined earlier in the notebook.
# One (word, docID) pair per word occurrence.
wordDoc = keyAndListOfWords.flatMap(lambda x: ((j, x[0]) for j in x[1]))
# Joining on the word keeps only dictionary words: (word, (dictNum, docID)).
wordDicDoc = dictionary.join(wordDoc)
# Re-key by document: (docID, dictNum).
docDic = wordDicDoc.map(lambda x: (x[1][1], x[1][0]))
# (docID, iterable of dictNum), one entry per occurrence of a dictionary word.
docDicList = docDic.groupByKey()


def listToArray(docDicList, size=20000):
    """Count how often each dictionary position occurs.

    Args:
        docDicList: iterable of integer dictionary positions, one per word
            occurrence (repeats allowed).
        size: length of the returned vector; defaults to the 20,000-word
            dictionary used throughout the notebook.

    Returns:
        A float NumPy array of length ``size`` whose entry ``i`` is the number
        of times position ``i`` appears in ``docDicList``.
    """
    result = np.zeros(size)
    for i in docDicList:
        result[i] += 1
    return result

# (docID, count vector): one array of per-dictionary-position counts per document.
docDicCount = docDicList.map(lambda x: (x[0], listToArray(x[1])))
# docDicCount.collect()

In [8]:
# lookup() returns a list of the values stored under the key; take the first
# count vector and print only its non-zero entries.
a1 = docDicCount.lookup('Wounds/23778438')
print (a1[0][a1[0].nonzero()])

[ 9.  8. 16.  5.  4.  5.  6.  5.  6.  2.  8.  1.  1.  1.  8.  4.  2.  2.
  1.  1.  7.  1.  1.  1.  3.  2.  2.  1.  2.  1.  1.  3.  1.  1.  2.  1.
  1.  1.  1.  1.  1.  1.  1.  2.  1.  2.  1.  1.  1.  2.  1.  1.  2.  1.
  1.  1.  1.  2.  6.  2.  1.  1.  1.  2.  1.  2.  2.  1.  4.  1. 10.  1.
  4.  1.  1.  1.  3.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  4.
  1.  1.  1.  1.  1.  1.  4.  2.  1.  2.  2.  2.  1.  3.  1.  1.  1.  1.
  1.  1.  3.  1.  3.  1.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.  2.  1.
  4.  2.]


In [3]:
# get the pair of (word, doc)
WordOfDocPair = keyAndListOfWords.flatMap(lambda x : ((word, x[0]) for word in x[1]))
# WordOfDocPair.collect()

# combine word with dict(id and doc pairs)
DicWithDocPair = dictionary.join(WordOfDocPair).map(lambda x : (x[1][1], x[1][0]))
# DicWithDocPair.collect()

# group by doc
DocWithWordListPair = DicWithDocPair.groupByKey()

# DocWithWordListPair.collect()

def wordFrequency(list):
    """Return a length-20000 float array counting each dictionary position.

    ``list`` is an iterable of integer dictionary positions (one per word
    occurrence); entry ``i`` of the result is how many times ``i`` appears.
    """
    # One slot per word of the 20000-word dictionary.
    counts = np.zeros(20000)
    for position in list:
        counts[position] = counts[position] + 1
    return counts

# (docID, count vector): per-document counts of every dictionary position.
wordFrequencyArr = DocWithWordListPair.map(lambda x : (x[0], wordFrequency(x[1])))




In [4]:
# lookup() returns a list of matching values; print the non-zero counts of the first one.
task1_1 = wordFrequencyArr.lookup("Wounds/23778438")
print(task1_1[0][task1_1[0].nonzero()])

[ 9.  8. 16.  5.  4.  5.  6.  5.  2.  6.  1.  1.  1.  8.  4.  2.  2.  1.
  3.  1.  1.  1.  2.  8.  1.  2.  2.  1.  1.  1.  2.  1.  3.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  2.  1.  1.  7.  2.  1.  1.  2.  1.  1.  1.  1.
  2.  2.  2.  1.  2.  1.  4.  1.  1.  1.  2.  1.  1.  1.  1.  6.  1.  2.
  1.  1.  1.  1.  1.  1.  4.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.
  1.  3. 10.  1.  1.  2.  1.  1.  3.  4.  2.  1.  1.  1.  1.  1.  1.  1.
  4.  3.  1.  1.  2.  1.  1.  3.  1.  1.  1.  1.  1.  1.  1.  2.  1.  1.
  2.]


In [7]:
# Same check for two more documents: print only the non-zero entries of each count vector.
task1_2 = wordFrequencyArr.lookup("ParasiticDisease/2617030")
print(task1_2[0][task1_2[0].nonzero()])
task1_3 = wordFrequencyArr.lookup("RxInteractions/1966748")
print(task1_3[0][task1_3[0].nonzero()])

[ 6.  4.  7. 11.  3.  2.  2.  4.  2.  1.  1.  1.  2.  1.  1.  1.  2. 10.
  1.  1.  2.  2.  1.  1.  1.  2.  3.  2.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  2.  1.  1.  2.  1.  1.  1.  1.  1.  1.  1.  2.  2.  1.  4.  2.
  2.  1.  2.  1.  1.  1.  1.  2.  2.  1.  1.  1.  1.  6.  1.  2.  1.  1.
  1.  1.  2.  1.  1.  2.  1.  2.]
[ 4. 11. 12.  1.  2.  2.  1.  7.  1.  1.  1.  1.  1.  1.  1.  1.  1.  3.
  2.  2.  2.  1.  3.  1.  2.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  7.  4.  7.  1.  1.  1.  1.  1.  2.  2.  8.  3.  2.  2.  1.
  1.  1.  1.  1.  1.  1.  2.  1.  2.  5.  3.  1.  5.  1.]


In [200]:
def task1(key):
    """Return the dictionary-word count vector of one document.

    Args:
        key: document ID to select from ``keyAndListOfWords``.

    Returns:
        NumPy array with one entry per dictionary word, ordered by dictionary
        position; each entry is that word's number of occurrences in the
        document (0 when the word does not occur).
    """
    # ("word", 1) for every word occurrence of the requested document only.
    occurrences = (
        keyAndListOfWords
        .filter(lambda doc: doc[0] == key)
        .flatMap(lambda doc: ((token, 1) for token in doc[1]))
    )
    per_word = occurrences.reduceByKey(lambda left, right: left + right)

    # leftOuterJoin keeps every dictionary word; absent words carry None.
    joined = dictionary.leftOuterJoin(per_word)
    by_index = joined.map(
        lambda entry: (entry[1][0], 0 if entry[1][1] is None else entry[1][1])
    )

    ordered_counts = by_index.sortByKey().values()
    return np.array(ordered_counts.collect())

In [75]:
# Count vectors for three sample documents; print only the non-zero entries of each.
a1 = task1("Wounds/23778438")
a2 = task1("ParasiticDisease/2617030")
a3 = task1("RxInteractions/1966748")
print(a1[a1.nonzero()])
print(a2[a2.nonzero()])
print(a3[a3.nonzero()])

[ 9  8 16  5  4  5  6  5  2  6  1  1  1  8  4  2  2  1  3  1  1  1  2  8
  1  2  2  1  1  1  2  1  3  1  1  1  1  1  1  1  1  1  2  1  1  7  2  1
  1  2  1  1  1  1  2  2  2  1  2  1  4  1  1  1  2  1  1  1  1  6  1  2
  1  1  1  1  1  1  4  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  2  1  1  1  1  1  1  1  1  1  1  2  1  3 10  1  1  2  1  1  3  4  2  1
  1  1  1  1  1  1  4  3  1  1  2  1  1  3  1  1  1  1  1  1  1  2  1  1
  2]
[ 6  4  7 11  3  2  2  4  2  1  1  1  2  1  1  1  2 10  1  1  2  2  1  1
  1  2  3  2  1  1  1  1  1  1  1  1  1  1  2  1  1  2  1  1  1  1  1  1
  1  2  2  1  4  2  2  1  2  1  1  1  1  2  2  1  1  1  1  6  1  2  1  1
  1  1  2  1  1  2  1  2]
[ 4 11 12  1  2  2  1  7  1  1  1  1  1  1  1  1  1  3  2  2  2  1  3  1
  2  1  1  1  1  1  1  1  1  1  1  1  1  1  1  7  4  7  1  1  1  1  1  2
  2  8  3  2  2  1  1  1  1  1  1  1  2  1  2  5  3  1  5  1]


array([ 4, 11, 12, ...,  0,  0,  0])

In [135]:

import math

def tf_o(i, d):
    """Term frequency of word ``i`` in document ``d``.

    Args:
        i: the word to look up.
        d: document ID in ``keyAndListOfWords``.

    Returns:
        Occurrences of ``i`` in ``d`` divided by the total number of
        dictionary-word occurrences in ``d`` (the sum of ``task1(d)``).
        Returns 0.0 when ``i`` does not occur in ``d`` or is not a dictionary
        word, instead of failing on an empty join result.
    """
    occurrences = (
        keyAndListOfWords
        .filter(lambda x: x[0] == d)
        .flatMap(lambda x: ((word, 1) for word in x[1] if word == i))
    )
    count = occurrences.reduceByKey(lambda a, b: a + b)
    # Joining with the dictionary drops the word if it is not a dictionary word.
    wordCounts = dictionary.join(count).collect()
    if not wordCounts:
        # Nothing matched: the term frequency is zero, and the total is not needed.
        return 0.0
    totalWords = sum(task1(d))
    # wordCounts[0] is (word, (dictNum, count)).
    return wordCounts[0][1][1] / totalWords

def tf(tokens):
    """Map each distinct token to its relative frequency in ``tokens``.

    The frequency is the token's occurrence count divided by ``len(tokens)``.
    An empty ``tokens`` yields an empty dict.
    """
    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token, 0) + 1
    return {
        token: float(times_seen) / len(tokens)
        for token, times_seen in occurrences.items()
    }

def IDF(RDD):
    """Compute the inverse document frequency of every token.

    Args:
        RDD: RDD of (docID, list_of_tokens) pairs.

    Returns:
        RDD of (token, log(N / df)) pairs, where N is the number of elements
        in ``RDD`` and df is the number of elements whose token list contains
        the token at least once.
    """
    num_docs = float(RDD.count())
    # set() makes each document vote at most once per token.
    doc_frequency = RDD.flatMap(
        lambda doc: ((token, 1) for token in set(doc[1]))
    ).reduceByKey(lambda left, right: left + right)
    return doc_frequency.map(
        lambda entry: (entry[0], math.log(num_docs / entry[1]))
    )
    

# def tfidf(tokens, idfs):
#     tfs = tf(tokens)
#     for tk in tfs:
#         tfs[tk] = tfs[tk]*idfs[tk]
#     tfIdfDict = tfs
#     return tfIdfDict

def task2(dic):
    """TF-IDF values of the dictionary words that occur in one document.

    Term frequencies are computed with ``tf`` over all of the document's
    tokens; IDF values are recomputed over the whole corpus on every call.

    Args:
        dic: document ID in ``keyAndListOfWords``.

    Returns:
        RDD of TF-IDF values, one per dictionary word present in the document,
        sorted by dictionary position. The RDD is empty when no document has
        the given ID.
    """
    doc_tfs = (
        keyAndListOfWords
        .filter(lambda x: x[0] == dic)
        .map(lambda x: tf(x[1]))
        .collect()
    )
    # An unknown document ID yields no rows; treat it as having no terms.
    term_freqs = doc_tfs[0] if doc_tfs else {}

    idfs = IDF(keyAndListOfWords)
    v = sc.parallelize(list(term_freqs.items()))
    # (word, (tf, idf)) -> (word, tf * idf)
    tfidf = v.join(idfs).map(lambda x: (x[0], x[1][0] * x[1][1]))
    # (word, (dictNum, tfidf)), ordered by dictNum.
    ans = dictionary.join(tfidf).sortBy(lambda x: x[1][0])
    return ans.map(lambda x: x[1][1])
    


In [136]:
# (word, idf) for every word of the corpus.
idfs = IDF(keyAndListOfWords)

# (dictNum, idf-or-None) for every dictionary word; leftOuterJoin leaves None
# where idfs has no entry for the word.
t = dictionary.leftOuterJoin(idfs).map(lambda x : x[1])
a = t.collect()
# Sorting the tuples orders them by dictNum.
a.sort()
# IDF values alone, in dictionary-position order.
listidfs = sc.parallelize(a).map(lambda x:x[1])
# print(listidfs.collect())

In [137]:
def task2_(dic):
    """TF-IDF values of the dictionary words that occur in one document.

    Term frequencies are computed with ``tf`` over only those tokens of the
    document that are dictionary words. IDF values come from the module-level
    ``idfs`` RDD of (word, idf) pairs.

    Args:
        dic: document ID in ``keyAndListOfWords``.

    Returns:
        RDD of TF-IDF values, one per dictionary word present in the document,
        sorted by dictionary position.
    """
    doc_tokens = (
        keyAndListOfWords
        .filter(lambda x: x[0] == dic)
        .flatMap(lambda x: ((j, 1) for j in x[1]))
    )
    # The join keeps one row per occurrence of a dictionary word.
    words_in_dictionary = doc_tokens.join(dictionary).map(lambda x: x[0]).collect()
    # tf is a plain local function, so call it directly on the collected words.
    term_freqs = tf(words_in_dictionary)

    v = sc.parallelize(list(term_freqs.items()))
    # (word, (tf, idf)) -> (word, tf * idf)
    tfidf = v.join(idfs).map(lambda x: (x[0], x[1][0] * x[1][1]))
    # (word, (dictNum, tfidf)), ordered by dictNum.
    ans = dictionary.join(tfidf).sortBy(lambda x: x[1][0])
    return ans.map(lambda x: x[1][1])
    

print(task2_("PalliativeCare/16552238").collect())

[0.00030536152464218823, 0.00031608906181866437, 0.0010109571298143113, 0.001138354377174771, 0.0022983935131899356, 0.0017637067917049495, 0.0037106939975623462, 0.008141868291551526, 0.008404488452830174, 0.010443583552035554, 0.004356553821749023, 0.00912791573232544, 0.00976969260722648, 0.007514681851074296, 0.012763262138793465, 0.01664033854732166, 0.011183156632449528, 0.00842848413858375, 0.01663488327770401, 0.011979305919624551, 0.01158353398271924, 0.011727175393084936, 0.012769563423350519, 0.01151381007445576, 0.02701277254745636, 0.025453426641852083, 0.014128818302419579, 0.012812731686721084, 0.015241424714046542, 0.013553174425551739, 0.015411364471691532, 0.02735505943460903, 0.051325867194142595, 0.013982486245226236, 0.014118947411273682, 0.015214570377672735, 0.06011235190128617, 0.020281331052188295, 0.017158314610712988, 0.05988860485034235, 0.017474507554299048, 0.043960105565918985, 0.018183494568019952, 0.04307933842454735, 0.019862682803495773, 0.01856798590

In [138]:
print(task2_("SquamousCellCancer/23991972").collect())

[0.00037613541967644544, 0.00036811205324298617, 0.0006339543668210577, 0.0020395515924381317, 0.0012353865133395904, 0.002106649778980912, 0.006648326745632537, 0.014292586770227886, 0.013419540530183733, 0.017516670339826235, 0.002431252337060525, 0.005480585859934489, 0.00581687425474857, 0.006237140176910122, 0.008112391175909867, 0.0039027461319834996, 0.010418996171878189, 0.007461638284272563, 0.008006493907359889, 0.010557082591120582, 0.045898178598855505, 0.007547198045428542, 0.007763996264545148, 0.017709411159994855, 0.007974881624988484, 0.008793230846507448, 0.008735598564799435, 0.0109229716940837, 0.009566504595502486, 0.027493195187799187, 0.00906576550785381, 0.022913885931462624, 0.013102612913174497, 0.011502831661992535, 0.011800114633624244, 0.011401014016662912, 0.010916631664897494, 0.012657066395917538, 0.011626610520078254, 0.01426722961704369, 0.0116776909061573, 0.012236434330060985, 0.012252787038418627, 0.014694175361625319, 0.0126217854255584, 0.01284402

In [139]:
print(task2_("HeartFailure/25940075").collect())

[0.0004155235936586739, 0.00020645817202333015, 0.0006052945853192016, 0.0009913769765775222, 0.002001638907689463, 0.0015999871739095534, 0.002019744834116214, 0.004792986619751849, 0.002894701118020838, 0.006522915903279181, 0.0106430402064767, 0.0016649881093471865, 0.00985809560616895, 0.004929047803084475, 0.005317680786211882, 0.0030580348909037777, 0.005024448320722709, 0.0031652646598110956, 0.007320338601353158, 0.011663598487219201, 0.004239394231219247, 0.004533653641077001, 0.05013998503549675, 0.0045856393187413925, 0.009882060725405303, 0.04241210898773829, 0.005536774095398117, 0.005115482960170855, 0.005307705457093328, 0.011987318506688698, 0.0713340651732439, 0.00649171787941011, 0.005852920734955281, 0.006230356682704815, 0.005811074743896136, 0.006091087071687763, 0.008292711307248225, 0.008740937085533512, 0.0237134373084492, 0.008558911241543262, 0.009120702125820331, 0.007844634279127481, 0.009487629143546901, 0.008281348433416805, 0.024936857846422307, 0.0167235

In [19]:

task2("PalliativeCare/16552238").collect()


[0.0001326168251376662,
 0.00013727573533782176,
 0.00043905310291910484,
 0.0004943810244574161,
 0.0009981796200206178,
 0.0007659681273327474,
 0.0016115339271728455,
 0.003535968471403884,
 0.0036500229582837433,
 0.004535590707944603,
 0.0018920272849001234,
 0.0039642034338268186,
 0.004242923589209455,
 0.0032635848611800807,
 0.005543014317962698,
 0.0072268072081037695,
 0.004856783215732583,
 0.003660444152196005,
 0.0072244380146115296,
 0.005202546457923902,
 0.005030664889633763,
 0.005093047561528386,
 0.005545750931074728,
 0.005000384181018206,
 0.011731498058267944,
 0.01105428273608558,
 0.006136067824554493,
 0.00556449866964991,
 0.006619266649654261,
 0.005886068865289396,
 0.006693070548655456,
 0.01188015136458619,
 0.02229054090131527,
 0.006072516619589873,
 0.006131780950998362,
 0.006607603959551943,
 0.02610646272495503,
 0.008808070161618449,
 0.007451761354192594,
 0.02600929061538801,
 0.007589082204808764,
 0.019091631271163045,
 0.007896991352608818,
 0

In [77]:
task2("SquamousCellCancer/23991972").collect()

[0.00015920750835054853,
 0.00015581144376414696,
 0.0002683349928500519,
 0.0008632846315398535,
 0.0005229042476453187,
 0.0008916854003467517,
 0.0028140490911038522,
 0.0060496486332751835,
 0.0056801127978257865,
 0.0074143122224453765,
 0.0010290805026759188,
 0.0023197773286333756,
 0.0024621187158699476,
 0.0026400054205295152,
 0.0034337462475418842,
 0.0016519222995067067,
 0.004410067048366589,
 0.003158300913202961,
 0.003388922922257938,
 0.004468515132738812,
 0.019427403722941276,
 0.0031945159455453787,
 0.003286280513503595,
 0.007495893972345144,
 0.003375542335250326,
 0.0037219264663452376,
 0.0036975323479205954,
 0.004623385664382667,
 0.004049231421981397,
 0.011637093646245905,
 0.0038372826973788335,
 0.00969880126927409,
 0.00554596628145963,
 0.004868824024754065,
 0.0049946555171172255,
 0.004825727488849542,
 0.0046207021089481005,
 0.005357379014331841,
 0.0049212161222578175,
 0.006038915665928498,
 0.004942837008161689,
 0.005179337331379004,
 0.00518625

In [78]:
task2("HeartFailure/25940075").collect()

[0.00017989033062838002,
 8.938079423888458e-05,
 0.0002620468361902039,
 0.0004291913499390565,
 0.0008665584587686879,
 0.000692673595695229,
 0.0008743969573303768,
 0.0020750011813596646,
 0.0012531869408572147,
 0.0028239298957014004,
 0.00460763251678782,
 0.0007208140910737776,
 0.004267810793474771,
 0.0021339053967373853,
 0.002302154124113647,
 0.001323898127550571,
 0.0021752066150604,
 0.0013703205181833094,
 0.003169153692848613,
 0.005049457167301184,
 0.0018353374911955686,
 0.0019627295895930075,
 0.02170682633522525,
 0.0019852354614272546,
 0.004278185879965087,
 0.018361239710297413,
 0.0023970049783839167,
 0.002214617737891645,
 0.0022978355600642188,
 0.005189603484434812,
 0.03088226219862301,
 0.0028104235078225707,
 0.002533872594049818,
 0.0026972738508445715,
 0.0025157564406424492,
 0.0026369806286192093,
 0.0035901176290172097,
 0.0037841655355198107,
 0.010266127225633272,
 0.003705362093890901,
 0.003948575113461643,
 0.0033961341201004234,
 0.00410742679

In [249]:
def computeTFIDF(dic, idfs):
    """Full-length TF-IDF vector of one document.

    Args:
        dic: document ID in ``keyAndListOfWords``.
        idfs: RDD of (word, idf) pairs.

    Returns:
        RDD with one value per dictionary word, sorted by dictionary position:
        tf * idf for words present in the document, 0 otherwise. Term
        frequencies are computed over the document's dictionary words only.
    """
    doc_tokens = (
        keyAndListOfWords
        .filter(lambda x: x[0] == dic)
        .flatMap(lambda x: ((j, 1) for j in x[1]))
    )
    # The join keeps one row per occurrence of a dictionary word.
    words_in_dictionary = doc_tokens.join(dictionary).map(lambda x: x[0]).collect()
    # tf is a plain local function, so call it directly on the collected words.
    term_freqs = tf(words_in_dictionary)

    v = sc.parallelize(list(term_freqs.items()))
    # (word, (tf, idf)) -> (word, tf * idf)
    tfidf = v.join(idfs).map(lambda x: (x[0], x[1][0] * x[1][1]))
    # leftOuterJoin keeps every dictionary word; absent ones carry None.
    ans = dictionary.leftOuterJoin(tfidf).sortBy(lambda x: x[1][0])
    return ans.map(lambda x: x[1][1] if x[1][1] else 0)

def task2_for3(dic, idfs):
    """TF-IDF values of the dictionary words in one document, as a local list.

    Args:
        dic: document ID in ``keyAndListOfWords``.
        idfs: RDD of (word, idf) pairs.

    Returns:
        List of TF-IDF values, one per dictionary word present in the
        document, sorted by dictionary position. Term frequencies are computed
        over the document's dictionary words only.
    """
    doc_tokens = (
        keyAndListOfWords
        .filter(lambda x: x[0] == dic)
        .flatMap(lambda x: ((j, 1) for j in x[1]))
    )
    # The join keeps one row per occurrence of a dictionary word.
    words_in_dictionary = doc_tokens.join(dictionary).map(lambda x: x[0]).collect()
    # tf is a plain local function, so call it directly on the collected words.
    term_freqs = tf(words_in_dictionary)

    v = sc.parallelize(list(term_freqs.items()))
    # (word, (tf, idf)) -> (word, tf * idf)
    tfidf = v.join(idfs).map(lambda x: (x[0], x[1][0] * x[1][1]))
    # (word, (dictNum, tfidf)), ordered by dictNum.
    ans = dictionary.join(tfidf).sortBy(lambda x: x[1][0])
    return ans.map(lambda x: x[1][1]).collect()


def tf_all(tokens):
    """Term-frequency vector of ``tokens`` over the 20000-word dictionary.

    Args:
        tokens: iterable of words.

    Returns:
        List of 20000 floats. Entry ``wordToIdx[w]`` is the number of
        occurrences of ``w`` divided by the total number of dictionary-word
        occurrences in ``tokens``. Words missing from ``wordToIdx`` are
        ignored. When no token is a dictionary word, the all-zero vector is
        returned instead of dividing by zero.
    """
    d = [0.0] * 20000

    for word in tokens:
        if word in wordToIdx:
            d[wordToIdx[word]] += 1.0

    total = sum(d)
    if total == 0:
        # No dictionary word seen: every frequency is zero.
        return d
    return [count / total for count in d]


def muls(l1, l2):
    """Multiply two sequences element by element.

    Pairs are formed with ``zip``, so the result is as long as the shorter input.
    """
    products = []
    for left, right in zip(l1, l2):
        products.append(left * right)
    return products

def computeTFIDF_all():
    """Return an RDD of (docID, TF-IDF list) for every document in the corpus.

    Each document's ``tf_all`` vector is multiplied element-wise by the IDF
    values collected from ``listidfs``.
    """
    # Local copy of the IDF values; it is captured by the lambda below.
    idf_values = listidfs.collect()
    return keyAndListOfWords.map(
        lambda doc: (doc[0], muls(idf_values, tf_all(doc[1])))
    )
    
    
def distance(t1, t2):
    """Euclidean distance between ``t1`` and ``t2``, wrapped in a one-element list.

    The inputs are subtracted with NumPy broadcasting and every squared
    difference is summed into a single scalar.
    """
    delta = t1 - t2
    squared_total = np.sum(delta ** 2)
    return [np.sqrt(squared_total)]
    
import heapq

import collections
def _majority_label(neighbours):
    """Pick the winning category from (distance, docID) pairs sorted nearest-first."""
    if not neighbours:
        return ""
    # The category is the part of the docID before the first '/'.
    votes = collections.Counter(doc_id.split('/')[0] for _, doc_id in neighbours)
    top_votes = max(votes.values())
    tied = {label for label, n in votes.items() if n == top_votes}
    # Neighbours are ordered nearest-first, so the first one whose category is
    # among the top-voted ones decides a tie (and trivially the no-tie case).
    for _, doc_id in neighbours:
        label = doc_id.split('/')[0]
        if label in tied:
            return label
    return ""


def kNN(k, tfidf):
    """Classify a TF-IDF vector by majority vote of its k nearest documents.

    Args:
        k: number of nearest neighbours to consider.
        tfidf: RDD holding the query's TF-IDF vector; it is collected to the driver.

    Returns:
        The category (docID prefix before '/') that occurs most often among
        the k documents closest to the query by ``distance``. If several
        categories tie, the one owning the closest neighbour wins. Returns ""
        when there are no neighbours (empty corpus or k <= 0).
    """
    # Build the query array once instead of once per corpus document.
    target = np.array(tfidf.collect())
    dic = computeTFIDF_all()
    dist = dic.map(lambda x: (distance(target, np.array(x[1]))[0], x[0]))
    # takeOrdered returns only the k smallest (distance, docID) pairs, already
    # sorted, so the full distance list never has to reach the driver.
    neighbours = dist.takeOrdered(k) if k > 0 else []
    return _majority_label(neighbours)
    
#     return dic.most_common(1)
    
    
def predictLabel(k, s):
    """Predict the category of a raw text string with ``kNN``.

    Args:
        k: number of nearest neighbours passed on to ``kNN``.
        s: the text to classify.

    Returns:
        Whatever ``kNN`` returns for the text's TF-IDF vector.
    """
    idf_values = listidfs.collect()

    # Single-element RDD: the text is tokenised with the corpus regex
    # (non-letters become spaces, then lower-case and split).
    words = sc.parallelize([s]).map(
        lambda text: regex.sub(' ', text).lower().split()
    )
    term_freqs = words.map(lambda tokens: tf_all(tokens))
    tfidf = term_freqs.map(lambda freqs: muls(idf_values, freqs))

    return kNN(k, tfidf)
    



In [244]:
s = 'Simulation technology for health care professional skills training and assessment.  Changes in medical practice that limit instruction time and patient availability, the expanding options for diagnosis and management, and advances in technology are contributing to greater use of simulation technology in medical education. Four areas of high-technology simulations currently being used are laparoscopic techniques, which provide surgeons with an opportunity to enhance their motor skills without risk to patients; a cardiovascular disease simulator, which can be used to simulate cardiac conditions; multimedia computer systems, which includes patient-centered, case-based programs that constitute a generalist curriculum in cardiology; and anesthesia simulators, which have controlled responses that vary according to numerous possible scenarios. Some benefits of simulation technology include improvements in certain surgical technical skills, in cardiovascular examination skills, and in acquisition and retention of knowledge compared with traditional lectures. These systems help to address the problem of poor skills training and proficiency and may provide a method for physicians to become self-directed lifelong learners.'
print(predictLabel(10, s))

Asthma/26315994
QA/12474426
QA/22269037
PalliativeCare/28603881
HeartFailure/26415691
QA/9081907
MedicalEducation/23574731
Homeopath/11795090
PalliativeCare/16276811
Asthma/10867386
QA


In [248]:
print(predictLabel (10, 'Propofol inhibits T-helper cell type-2 differentiation by inducing apoptosis via activating gamma-aminobutyric acid receptor.  Propofol has been shown to attenuate airway hyperresponsiveness in asthma patients. Our previous study showed that it may alleviate lung inflammation in a mouse model of asthma. Given the critical role of T-helper cell type-2 (Th2) differentiation in asthma pathology and the immunomodulatory role of the gamma-aminobutyric acid type A (GABAFor in vivo testing, chicken ovalbumin-sensitized and challenged asthmatic mice were used to determine the effect of propofol on Th2-type asthma inflammation. For in vitro testing, Th2-type cytokines as well as the cell proliferation and apoptosis were measured to assess the effects of propofol on Th2 cell differentiation and determine the underlying mechanisms.We found that propofol significantly decreased inflammatory cell counts and interleukin-4 and inflammation score in vivo. Propofol, but not intralipid, significantly reduced the Th2-type cytokine interleukin-5 secretion and caused Th2 cell apoptosis without obvious inhibition of proliferation in vitro. A GABA receptor agonist simulated the effect of propofol, whereas pretreatment with an antagonist reversed this effect.This study demonstrates that the antiinflammatory effects of propofol on Th2-type asthma inflammation in mice are mediated by inducing apoptosis without compromising proliferation during Th2 cell differentiation via activation of the GABA receptor.Copyright Â© 2016 Elsevier Inc. All rights reserved.'))

PalliativeCare/28603881
Asthma/12688617
Asthma/19891606
Asthma/26315994
ParasiticDisease/1354651
Asthma/15270735
Asthma/10867386
Asthma/24026573
BrainInjuries/19397790
RxInteractions/20452333
Asthma


In [250]:
print(predictLabel (10, 'Evaluation of isopathic treatment of Salmonella enteritidis in poultry.  Salmonellosis is a common problem worldwide in commercially reared poultry. It is associated with human Salmonellosis. No fully satisfactory method of control is available.Nosodes to an antibiotic-resistant strain of Salmonella enterica serovar Enteritidis in D30 (30X) potency were prepared. One day old chicks (N = 180) were divided into four groups: two control and two different preparations of the nosode. Treatments were administered in drinking water for 10 days. The birds were challenged by a broth culture of the same Salmonella, by mouth, on day 17. Cloacal swabs were taken twice weekly for Salmonella enterica serovar Enteritidis.Birds receiving active treatment were less likely to grow the strain of Salmonella from cloacal swabs compared to control.Isopathy is low cost and non-toxic. It may have a role to play in the widespread problem of Salmonella in poultry. Further research should be conducted.'))

PalliativeCare


In [127]:
print(predictLabel (10, 'Management of the neck after chemoradiotherapy for head and neck cancers in Asia: consensus statement from the Asian Oncology Summit 2009.  The addition of a planned neck dissection after radiotherapy has traditionally been considered standard of care for patients with positive neck-nodal disease. With the acceptance of chemoradiotherapy as the new primary treatment for patients with locally advanced squamous-cell head and neck cancers, and the increasing numbers of patients who achieve a complete response, the role of planned neck dissection is now being questioned. The accuracy and availability of a physical examination or of different imaging modalities to identify true complete responses adds controversy to this issue. This consensus statement will address some of the controversies surrounding the role of neck dissection following chemoradiotherapy for squamous-cell carcinomas of the head and neck, with particular reference to patients in Asia.'))

SquamousCellCancer/22903756
SquamousCellCancer/11891955
SquamousCellCancer/11801768
PalliativeCare/27841133
SquamousCellCancer/15987995
PalliativeCare/9156507
SquamousCellCancer/12525191
SquamousCellCancer/9874426
SquamousCellCancer/9081382
SquamousCellCancer/9484945
defaultdict(<class 'int'>, {'SquamousCellCancer': 8, 'PalliativeCare': 2})


In [129]:
print(predictLabel (10, '[Quality management in the hospital with special reference to the medical departments].  Present German health legislation requires standardisation of systems and procedures not only for medical departments but for entire clinics as well. Changes in organisation and treatment possibilities should enable clinic administrations to better manage the quality of the services provided. This system of quality control and quality management is based on the introduction of ISO 9004, part 2. Medical and non-medical procedures are standardised and streamlined through flow-diagrams, the details of which are summarised in a series of quality handbooks with an emphasis on guidelines and standards. These handbooks are distributed clinic-wide, and although acceptance is not yet determined, objections are probable. Future analysis will show the effects due to quality management in the clinical setting.'))

QA/12512551
Homeopath/11795090
PalliativeCare/16499673
QA/11824034
PalliativeCare/16276811
QA/9081907
PalliativeCare/28603881
QA/12474426
Asthma/10867386
Asthma/26315994
[('QA', 4)]


In [247]:
print(predictLabel (10, 'Suicide attempts involving power drills.  A 61-year-old man was found dead next to a power drill soiled with blood and bone dust. A 5 mm circular wound of the forehead corresponded to the size of the drill bit. Subarachnoid haemorrhage was present over the anterior pole of the left frontal lobe with a penetrating injury extending 75 mm into the frontal lobe white matter towards, but not involving, the basal ganglia. No major intracranial vessels had been injured and there was no significant intraparenchymal haemorrhage. Death was due to haemorrhage from self-inflicted stab wounds to the abdomen with an associated penetrating intracranial wound from a power drill. Deaths due to power drills are rare and are either accidents or suicides. Wounds caused by power drills may be mistaken for bullet entrance wounds, and the marks around a wound from the drill chuck as muzzle imprints. A lack of internal bevelling helps to distinguish the entrance wound from that due to a projectile. Significant penetration of the brain may occur without lethal injury. Copyright © 2013 Elsevier Ltd and Faculty of Forensic and Legal Medicine. All rights reserved.'))

PalliativeCare/6168434
Wounds/25639178
Asthma/26315994
Asthma/12688617
PalliativeCare/16276811
Wounds/8664147
QA/9081907
Asthma/10867386
Homeopath/11795090
PalliativeCare/28603881
Asthma


In [233]:
print(predictLabel (10, 'Neurobehavioral recovery.  This review discusses recent programs in early and late neurobehavioral recovery from closed head injury (CHI). The research on early recovery has encompassed the relationship of localized brain lesions to the duration of impaired consciousness and features of posttraumatic amnesia. Of the research on late neurobehavioral outcome of CHI, studies emanating from the Traumatic Coma Data Bank are reviewed in detail, including analysis of acute neurologic indices in relation to recovery of memory, information processing speed, and other cognitive measures. Recent studies concerning the neurobehavioral outcome of CHI in children are discussed as are investigations of behavioral disturbance, psychosocial outcome, and family variables. The review concludes with an assessment of recent studies concerning the efficacy of rehabilitation directed toward the cognitive sequelae of CHI and preliminary trials to evaluate the potential use of psychoactive drugs in the postacute management of head injured patients.'))

Wounds/12900110
Asthma/10867386
BrainInjuries/16394900
BrainInjuries/18289430
QA/9081907
Homeopath/11795090
Asthma/26315994
BrainInjuries/12434929
BrainInjuries/11721739
PalliativeCare/28603881
Counter({'BrainInjuries': 4, 'Asthma': 2, 'Wounds': 1, 'QA': 1, 'Homeopath': 1, 'PalliativeCare': 1})


In [134]:
print(predictLabel (10, 'Ethical issues arising from the requirement to sign a consent form in palliative care.  French healthcare networks aim to help healthcare workers to take care of patients by improving cooperation, coordination and the continuity of care. When applied to palliative care in the home, they facilitate overall care including medical, social and psychological aspects. French legislation in 2002 required that an information document explaining the functioning of the network should be given to patients when they enter a healthcare network. The law requires that this document be signed. Ethical issues arise from this legislation with regard to the validity of the signature of dying patients. Signature of the consent form by a guardian or trustee, a designated person--the Person of Trust--transforms the doctor-patient relationship into a triangular doctor-patient-third-party relationship.'))

PalliativeCare/17969829
Homeopath/11795090
PalliativeCare/19825893
QA/9081907
PalliativeCare/16276811
PalliativeCare/21952569
PalliativeCare/28603881
PalliativeCare/27911489
PalliativeCare/21313865
Asthma/10867386
[('PalliativeCare', 7)]


In [147]:
print(wordToIdx)

