In [1]:
import pyspark
import math

In [2]:
sc = pyspark.SparkContext(appName = "teste")

In [3]:
sc

In [161]:
rdd = sc.sequenceFile("part-00000")

In [162]:
N_docs = rdd.count()
N_docs

36133

In [347]:
rdd_ip = rdd.filter(lambda x: "iphone" in x[1]) 
rdd_an = rdd.filter(lambda x: "android" in x[1])

In [348]:
rdd_in = rdd_ip.intersection(rdd_an)

In [327]:
def conta_palavras(item):
    url, conteudo = item 
    palavras = conteudo.strip().split()
    return [(palavra.lower(), 1) for palavra in palavras]

def conta_documentos(item):
    url, conteudo = item 
    palavras = conteudo.strip().split()
    return [(palavra.lower(), 1) for palavra in set(palavras)]

def junta_contagens(nova_contagem, contagem_atual):
    return nova_contagem + contagem_atual

In [328]:
rdd_iphone = rdd_ip.flatMap(conta_palavras).reduceByKey(junta_contagens)
rdd_android = rdd_an.flatMap(conta_palavras).reduceByKey(junta_contagens)
rdd_inter = rdd_in.flatMap(conta_palavras).reduceByKey(junta_contagens)

In [329]:
rdd_inter.count()

22559

In [330]:
rdd_iphone_docs = rdd_ip.flatMap(conta_documentos).reduceByKey(junta_contagens)
rdd_android_docs = rdd_an.flatMap(conta_documentos).reduceByKey(junta_contagens)
rdd_inter_docs = rdd_in.flatMap(conta_documentos).reduceByKey(junta_contagens)

In [331]:
doc_min = 10
doc_max = 0.7* N_docs
def filtra_doc_freq(item):
    contagem = item[1]
    return (contagem < doc_max) and (contagem >= doc_min)

In [332]:
rdd_iphone_docs_filtrado = rdd_iphone_docs.filter(filtra_doc_freq)
rdd_android_docs_filtrado = rdd_android_docs.filter(filtra_doc_freq)
rdd_inter_docs_filtrado = rdd_inter_docs.filter(filtra_doc_freq)

In [333]:
def computa_idf(item):
    palavra, contagem = item
    idf = math.log10(N / contagem)
    return (palavra,idf)

def computa_freq(item):
    palavra, contagem = item
    freq = math.log10(1 + contagem)
    return (palavra,freq)

In [334]:
rdd_iphone_idf = rdd_iphone_docs_filtrado.map(computa_idf)
rdd_iphone_freq = rdd_iphone.map(computa_freq)

rdd_android_idf = rdd_android_docs_filtrado.map(computa_idf)
rdd_android_freq = rdd_android.map(computa_freq)

rdd_inter_idf = rdd_inter_docs_filtrado.map(computa_idf)
rdd_inter_freq = rdd_inter.map(computa_freq)

In [335]:
def computa_rel(item):
    palavra, contagem = item
    freq, idf = contagem
    relevancia = freq*idf
    return (palavra, relevancia)

In [336]:
rdd_inter_join = rdd_inter_freq.join(rdd_inter_idf)
rdd_inter_rel = rdd_inter_join.map(computa_rel)

list_inter_rel = rdd_inter_rel.takeOrdered(100, key=lambda x: -x[1])
list_inter_rel

[('celular', 10.918929897720963),
 ('de', 10.65226020572554),
 ('gratis', 10.187427917608622),
 ('rastrear', 10.099324307691498),
 ('for', 9.743732509039093),
 ('i', 9.685260126683293),
 ('espião', 9.627203382368709),
 ('online', 9.606827906253104),
 ('to', 9.367426026465914),
 ('in', 9.256673011156547),
 ('pelo', 9.130897202133154),
 ('a', 9.111594893926162),
 ('para', 9.087596802319817),
 ('-', 9.028483087349832),
 ('e', 9.005251086580257),
 ('your', 8.883836035177545),
 ('que', 8.87850930431145),
 ('com', 8.875611785506019),
 ('and', 8.758118909897044),
 ('is', 8.685157116840974),
 ('como', 8.584007505857327),
 ('the', 8.565080621558074),
 ('em', 8.45919112679308),
 ('of', 8.421448145475372),
 ('android', 8.408239152291245),
 ('o', 8.276317554492827),
 ('do', 8.26762520600069),
 ('sites', 8.158209278209814),
 ('2020', 8.132449320022237),
 ('um', 8.120040911779983),
 ('aplicativo', 8.067211582341322),
 ('no', 7.99630614162871),
 ('blog', 7.922892022557706),
 ('site', 7.91077121442457

In [341]:
rdd_iphone_join = rdd_iphone_freq.join(rdd_iphone_idf)
rdd_iphone_rel = rdd_iphone_join.map(computa_rel)

list_iphone_rel = rdd_iphone_rel.takeOrdered(100, key=lambda x: -x[1])
list_iphone_rel

[('(1)', 13.171358817662837),
 ('responder', 11.23270080026728),
 ('partir', 10.439828386208955),
 ('vc', 10.310292078914369),
 ('loans', 10.292629807799113),
 ('às', 10.063822292513974),
 ('i', 10.013767888288601),
 ('de', 9.996157747553783),
 ('brides', 9.986804889042837),
 ('celular', 9.984559747362555),
 ('*', 9.937219397872994),
 ('dating', 9.936477251250722),
 ('rastrear', 9.923924312490264),
 ('you', 9.913718553739209),
 ('gratis', 9.867305400545431),
 ('to', 9.865663621325218),
 ('at', 9.748279077399857),
 ('for', 9.731716588731594),
 ('this', 9.715955004477694),
 ('payday', 9.65360680181539),
 ('espião', 9.627203382368709),
 ('bride', 9.608221080400568),
 ('the', 9.600961447736143),
 ('+', 9.471633105909886),
 ('your', 9.368620922972084),
 ('e', 9.353389343991836),
 ('que', 9.337890123396962),
 ('like', 9.335971872943439),
 ('russian', 9.315497045044866),
 ('and', 9.306761451262243),
 ('am', 9.296034189977442),
 ('in', 9.294793222836091),
 ('abril', 9.27448606262811),
 ('a', 9

In [None]:
#Nao funciona no cluster
result = rdd_idf.collect()
result1 = rdd_freq.collect()

In [None]:
test = sorted(result, key = lambda x: x[1])
test1 = sorted(result1, key = lambda x: x[1])