In [1]:
import pyspark
import math

In [2]:
sc = pyspark.SparkContext(appName = 'hey mundo')

In [3]:
sc

In [4]:
rdd = sc.sequenceFile('part-00000')

In [5]:
result = rdd.take(10)

In [6]:
len(result)

10

In [7]:
def conta_docs_ford_honda(item):
    url, conteudo = item
    palavras = conteudo.strip().split()
    if 'ford' in palavras and 'honda' in palavras:
        return [(palavra.lower(),1) for palavra in (palavras) if palavra.isalpha()]
    else:
        return []

In [8]:
def conta_docs_ford(item):
    url, conteudo = item
    palavras = conteudo.strip().split()
    if 'ford' in palavras and 'honda' not in palavras:
        return [(palavra.lower(),1) for palavra in (palavras) if palavra.isalpha()]
    else:
        return []

In [9]:
def conta_docs_honda(item):
    url, conteudo = item
    palavras = conteudo.strip().split()
    if 'honda' in palavras and 'ford' not in palavras:
        return [(palavra.lower(),1) for palavra in (palavras) if palavra.isalpha()]
    else:
        return []

In [10]:
def conta_palavras(item):
    url, conteudo = item
    palavras = conteudo.strip().split()
    return [(palavra.lower(),1) for palavra in palavras if palavra.isalpha() and palavra == 'honda' or palavra =='ford'] 

In [11]:
def junta_contagens(nova_contagem, contagem_atual):
    return (nova_contagem + contagem_atual)

In [12]:
rdd_doc_freq_ford_honda = rdd.flatMap(conta_docs_ford_honda).reduceByKey(junta_contagens)

In [13]:
rdd_doc_freq_ford = rdd.flatMap(conta_docs_ford).reduceByKey(junta_contagens)

In [14]:
rdd_doc_freq_honda = rdd.flatMap(conta_docs_honda).reduceByKey(junta_contagens)

In [15]:
rdd_palavra_freq = rdd.flatMap(conta_palavras).reduceByKey(junta_contagens)

In [16]:
rdd_palavra_freq.take(10)

[('ford', 85), ('honda', 67)]

In [17]:
rdd_doc_freq_ford_honda.take(10)

[('últimas', 16),
 ('das', 20),
 ('todas', 21),
 ('verificação', 1),
 ('possível', 3),
 ('no', 77),
 ('agosto', 17),
 ('março', 15),
 ('foi', 13),
 ('apontada', 1)]

In [18]:
rdd_doc_freq_ford.take(10)

[('carpoint', 36),
 ('perfil', 12),
 ('nissan', 20),
 ('fernando', 13),
 ('julho', 86),
 ('mário', 6),
 ('agosto', 43),
 ('tem', 38),
 ('modelos', 6),
 ('no', 101)]

In [19]:
rdd_doc_freq_honda.take(10)

[('automóvel', 14),
 ('veicular', 9),
 ('no', 515),
 ('garantir', 6),
 ('saída', 5),
 ('peças', 35),
 ('diminuir', 1),
 ('precisa', 15),
 ('todas', 26),
 ('foi', 115)]

In [20]:
rdd.count()

36133

In [21]:
rdd_palavra_freq.count()

2

In [22]:
rdd_doc_freq_ford_honda.count()

4208

In [23]:
rdd_doc_freq_ford.count()

3123

In [24]:
rdd_doc_freq_honda.count()

6710

In [25]:
result_ford_honda = rdd_doc_freq_ford_honda.collect()
sorted(result_ford_honda, key=lambda x : -x[1])[:10]

[('gasolina', 1834),
 ('serie', 1744),
 ('mecanico', 1491),
 ('automatico', 1092),
 ('de', 1040),
 ('trator', 786),
 ('pa', 651),
 ('e', 557),
 ('carregadeira', 456),
 ('turbo', 451)]

In [26]:
result_ford = rdd_doc_freq_ford.collect()
sorted(result_ford, key=lambda x : -x[1])[:10]

[('de', 780),
 ('do', 296),
 ('o', 281),
 ('e', 254),
 ('caminhão', 234),
 ('a', 220),
 ('cavalo', 184),
 ('mecânico', 179),
 ('que', 165),
 ('respostas', 139)]

In [27]:
result_honda = rdd_doc_freq_honda.collect()
sorted(result_honda, key=lambda x : -x[1])[:10]

[('de', 3223),
 ('a', 1716),
 ('o', 1676),
 ('e', 1295),
 ('que', 1228),
 ('em', 931),
 ('do', 849),
 ('um', 738),
 ('é', 713),
 ('da', 682)]

In [28]:
N = rdd.count()

In [29]:
DOC_COUNT_MIN = 5
DOC_COUNT_MAX = 0.7*N
def filtra_doc_freq(item):
    contagem = item[1]
    return (contagem < DOC_COUNT_MAX) & (contagem >= DOC_COUNT_MIN)

rdd_palavra_freq_filter_filtrado = rdd_palavra_freq.filter(filtra_doc_freq)
# rdd_doc_freq_filter_filtrado = rdd_doc_freq.filter(filtra_doc_freq)

In [30]:
rdd_doc_freq_filter_filtrado_ford_honda = rdd_doc_freq_ford_honda.filter(filtra_doc_freq)

In [31]:
rdd_doc_freq_filter_filtrado_ford = rdd_doc_freq_ford.filter(filtra_doc_freq)

In [32]:
rdd_doc_freq_filter_filtrado_honda = rdd_doc_freq_honda.filter(filtra_doc_freq)

In [33]:
rdd_doc_freq_filter_filtrado_ford_honda.take(10)

[('últimas', 16),
 ('das', 20),
 ('todas', 21),
 ('no', 77),
 ('agosto', 17),
 ('março', 15),
 ('foi', 13),
 ('outros', 6),
 ('nissan', 61),
 ('elétrica', 5)]

In [34]:
rdd_doc_freq_filter_filtrado_ford.take(10)

[('carpoint', 36),
 ('perfil', 12),
 ('nissan', 20),
 ('fernando', 13),
 ('julho', 86),
 ('mário', 6),
 ('agosto', 43),
 ('tem', 38),
 ('modelos', 6),
 ('no', 101)]

In [35]:
rdd_doc_freq_filter_filtrado_honda.take(10)

[('automóvel', 14),
 ('veicular', 9),
 ('no', 515),
 ('garantir', 6),
 ('saída', 5),
 ('peças', 35),
 ('precisa', 15),
 ('todas', 26),
 ('foi', 115),
 ('boa', 62)]

In [36]:
rdd_doc_freq_ford_honda.count()

4208

In [37]:
rdd_doc_freq_ford.count()

3123

In [38]:
rdd_doc_freq_honda.count()

6710

In [39]:
def computa_idf(item):
    palavra,contagem = item
    idf = math.log10(N/contagem)
    return (palavra,idf)

In [40]:
rdd_idf = rdd_palavra_freq_filter_filtrado.map(computa_idf)

In [41]:
rdd_idf_doc_ford_honda = rdd_doc_freq_filter_filtrado_ford_honda.map(computa_idf)

In [42]:
rdd_idf_doc_ford = rdd_doc_freq_filter_filtrado_ford.map(computa_idf)

In [43]:
rdd_idf_doc_honda = rdd_doc_freq_filter_filtrado_honda.map(computa_idf)

In [44]:
result_ford_honda = rdd_idf_doc_ford_honda.collect()

In [45]:
result_ford = rdd_idf_doc_ford.collect()

In [46]:
result_honda = rdd_idf_doc_honda.collect()

In [47]:
sorted(result_ford_honda, key= lambda x: x[1])[-15:]

[('gerais', 3.8589340167215473),
 ('disco', 3.8589340167215473),
 ('arrizo', 3.8589340167215473),
 ('minha', 3.8589340167215473),
 ('cep', 3.8589340167215473),
 ('empresa', 3.8589340167215473),
 ('fluídos', 3.8589340167215473),
 ('edc', 3.8589340167215473),
 ('od', 3.8589340167215473),
 ('dakar', 3.8589340167215473),
 ('rd', 3.8589340167215473),
 ('twingo', 3.8589340167215473),
 ('wrangler', 3.8589340167215473),
 ('marchas', 3.8589340167215473),
 ('yaris', 3.8589340167215473)]

In [48]:
sorted(result_ford, key= lambda x: x[1])[-15:]

[('design', 3.8589340167215473),
 ('sync', 3.8589340167215473),
 ('vitória', 3.8589340167215473),
 ('espaço', 3.8589340167215473),
 ('g', 3.8589340167215473),
 ('livre', 3.8589340167215473),
 ('ônix', 3.8589340167215473),
 ('ganha', 3.8589340167215473),
 ('sabe', 3.8589340167215473),
 ('enviar', 3.8589340167215473),
 ('with', 3.8589340167215473),
 ('pinheiros', 3.8589340167215473),
 ('yaris', 3.8589340167215473),
 ('branco', 3.8589340167215473),
 ('preto', 3.8589340167215473)]

In [49]:
sorted(result_honda, key= lambda x: x[1])[-15:]

[('infelizmente', 3.8589340167215473),
 ('levei', 3.8589340167215473),
 ('prefiro', 3.8589340167215473),
 ('oficial', 3.8589340167215473),
 ('josé', 3.8589340167215473),
 ('intervalo', 3.8589340167215473),
 ('fluence', 3.8589340167215473),
 ('certidão', 3.8589340167215473),
 ('militar', 3.8589340167215473),
 ('confundir', 3.8589340167215473),
 ('fatores', 3.8589340167215473),
 ('matéria', 3.8589340167215473),
 ('azera', 3.8589340167215473),
 ('considero', 3.8589340167215473),
 ('franco', 3.8589340167215473)]

In [50]:
len(result)

10

In [51]:
len(result_ford_honda)

1160

In [52]:
len(result_ford)

518

In [53]:
len(result_honda)

1360

In [54]:
type(result)

list

In [55]:
def freq_normalized(item):
    palavra,contagem = item
    freq = math.log10(1 + contagem)
    return(palavra,freq)
    

In [56]:
rdd_freq = rdd_palavra_freq_filter_filtrado.map(freq_normalized)

In [57]:
rdd_freq_ford_honda = rdd_doc_freq_filter_filtrado_ford_honda.map(freq_normalized)

In [58]:
rdd_freq_ford =rdd_doc_freq_filter_filtrado_ford.map(freq_normalized)

In [59]:
rdd_freq_honda = rdd_doc_freq_filter_filtrado_honda.map(freq_normalized)

In [60]:
sorted(rdd_freq.collect(), key= lambda x: x[1])[-15:]

[('honda', 1.8325089127062364), ('ford', 1.9344984512435677)]

In [61]:
sorted(rdd_freq_ford_honda.collect(), key= lambda x: x[1])[-15:]

[('a', 2.432969290874406),
 ('escavadeira', 2.437750562820388),
 ('flex', 2.499687082618404),
 ('alcool', 2.534026106056135),
 ('diesel', 2.53655844257153),
 ('turbo', 2.655138434811382),
 ('carregadeira', 2.6599162000698504),
 ('e', 2.7466341989375787),
 ('pa', 2.81424759573192),
 ('trator', 2.8959747323590648),
 ('de', 3.017450729510536),
 ('automatico', 3.038620161949703),
 ('mecanico', 3.17376882313665),
 ('serie', 3.241795431295199),
 ('gasolina', 3.263636068588108)]

In [62]:
sorted(rdd_freq_ford.collect(), key= lambda x: x[1])[-15:]

[('mais', 2.0681858617461617),
 ('janeiro', 2.0791812460476247),
 ('com', 2.089905111439398),
 ('é', 2.113943352306837),
 ('respostas', 2.146128035678238),
 ('responder', 2.146128035678238),
 ('que', 2.220108088040055),
 ('mecânico', 2.255272505103306),
 ('cavalo', 2.2671717284030137),
 ('a', 2.3443922736851106),
 ('caminhão', 2.3710678622717363),
 ('e', 2.406540180433955),
 ('o', 2.450249108319361),
 ('do', 2.4727564493172123),
 ('de', 2.8926510338773004)]

In [63]:
sorted(rdd_freq_honda.collect(), key= lambda x: x[1])[-15:]

[('se', 2.677606952720493),
 ('no', 2.7126497016272113),
 ('não', 2.798650645445269),
 ('para', 2.8048206787211623),
 ('com', 2.82865989653532),
 ('da', 2.8344207036815323),
 ('é', 2.8536982117761744),
 ('um', 2.8686444383948255),
 ('do', 2.929418925714293),
 ('em', 2.9694159123539814),
 ('que', 3.089551882886454),
 ('e', 3.1126050015345745),
 ('o', 3.2245330626060857),
 ('a', 3.2347702951609163),
 ('de', 3.508395033133053)]

In [64]:
rdd_freq \
    .join(rdd_idf)\
    .map(lambda x: (x[0],x[1][0]*x[1][1]))\
    .takeOrdered(10,key = lambda x: -x[1])

[('ford', 5.084800346058365), ('honda', 5.006101390630037)]

In [65]:
rdd_freq_ford_honda\
    .join(rdd_idf_doc_ford_honda)\
    .map(lambda x: (x[0],x[1][0]*x[1][1]))\
    .takeOrdered(10,key = lambda x: -x[1])

[('mi', 5.198862060391179),
 ('sport', 5.198844667090954),
 ('s', 5.198820199005647),
 ('do', 5.198818157235582),
 ('ford', 5.198735037154214),
 ('golf', 5.198193943457401),
 ('com', 5.197755914137743),
 ('chassis', 5.195985542011936),
 ('o', 5.193396077626297),
 ('volkswagen', 5.192507604494547)]

In [66]:
rdd_freq_ford\
    .join(rdd_idf_doc_ford)\
    .map(lambda x: (x[0],x[1][0]*x[1][1]))\
    .takeOrdered(10,key = lambda x: -x[1])

[('cavalo', 5.198820199005647),
 ('mecânico', 5.19851812078333),
 ('que', 5.195985542011936),
 ('a', 5.193957336541311),
 ('caminhão', 5.189528165358948),
 ('respostas', 5.182661459823505),
 ('responder', 5.182661459823505),
 ('e', 5.18145019892832),
 ('é', 5.173483818232933),
 ('o', 5.168059786518126)]

In [67]:
rdd_freq_honda\
    .join(rdd_idf_doc_honda)\
    .map(lambda x: (x[0],x[1][0]*x[1][1]))\
    .takeOrdered(10,key = lambda x: -x[1])

[('blindado', 5.198452700961411),
 ('como', 5.1984223655756105),
 ('sua', 5.197914927639032),
 ('tem', 5.197455761843707),
 ('ao', 5.197397770732923),
 ('ser', 5.19569831397644),
 ('outubro', 5.195077050530865),
 ('respostas', 5.194228378137632),
 ('são', 5.193237644545963),
 ('as', 5.193106042424439)]

### TO DO
Ver pq o filtro nao fununcia  
passar pro zepelin

