In [1]:
import ast
import gzip
import os
import re

import pandas as pd

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser, PhrasePlugin, SequencePlugin
from whoosh import scoring, columns, fields, index, sorting

from IPython.core.display import display, HTML

## Base de dados utilizada

A base de dados utilizada está disponível em: http://jmcauley.ucsd.edu/data/amazon/ e contém o texto e notas de reviews da seção de video games da Amazon.

In [2]:
def parse(path):
    """Yield one record per non-blank line of a gzip-compressed text file.

    Each line is expected to hold a single Python/JSON-style literal
    (typically a dict). Lines are decoded as UTF-8.

    Args:
        path: Path to the ``.gz`` file.

    Yields:
        The object described by each line.

    Raises:
        ValueError, SyntaxError: if a line is not a plain literal.
    """
    # The file is external data, so it is parsed with ast.literal_eval rather
    # than eval: literal_eval only accepts literals and never executes code.
    # The context manager closes the handle once the generator finishes.
    with gzip.open(path, 'rt', encoding = 'utf-8') as g:
        for l in g:
            l = l.strip()
            if l:  # tolerate blank lines (e.g. a trailing newline at EOF)
                yield ast.literal_eval(l)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Each record becomes one row; rows are labelled 0, 1, 2, ... in the
    order in which the records are read.
    """
    # Key each record by its position, then let pandas build one row per key.
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient = 'index')

# Load the reviews archive into a DataFrame (the path is relative to the working directory).
df = getDF('reviews_Video_Games_5.json.gz')

In [3]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012"
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,1372550400,"06 30, 2013"
2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,1.0,Wrong key,1403913600,"06 28, 2014"
3,A1DLMTOTHQ4AST,700099867,ampgreen,"[7, 10]","I got this version instead of the PS3 version,...",3.0,"awesome game, if it did not crash frequently !!",1315958400,"09 14, 2011"
4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,1308009600,"06 14, 2011"


Substituindo os valores ausentes (missings) por "-"

In [4]:
# Count the missing values in each column.
df.isnull().sum()

reviewerID           0
asin                 0
reviewerName      2813
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
dtype: int64

In [5]:
# Replace every missing value with "-" (per the counts above, only reviewerName has any).
df = df.fillna("-")

Mudando o formato da data para facilitar ordenação

In [6]:
# Rewrite 'MM D, YYYY' strings (e.g. '07 9, 2012') as zero-padded 'YYYY-MM-DD'
# so that plain string ordering matches chronological ordering.
# Vectorised string operations replace the per-row Python loop and, unlike
# df['reviewTime'][row], do not require the index to be exactly 0..n-1.
date_parts = df['reviewTime'].str.replace(',', '', regex = False).str.split(expand = True)

# After the split, column 0 is the month, 1 the day and 2 the year.
df['reviewTime'] = date_parts[2] + '-' + date_parts[0].str.zfill(2) + '-' + date_parts[1].str.zfill(2)

In [7]:
# Display the frame to check the reformatted reviewTime column.
df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,0700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,1341792000,2012-07-09
1,A3UR8NLLY1ZHCX,0700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,1372550400,2013-06-30
2,A1INA0F5CWW3J4,0700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,1.0,Wrong key,1403913600,2014-06-28
3,A1DLMTOTHQ4AST,0700099867,ampgreen,"[7, 10]","I got this version instead of the PS3 version,...",3.0,"awesome game, if it did not crash frequently !!",1315958400,2011-09-14
4,A361M14PU2GUEG,0700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,1308009600,2011-06-14
...,...,...,...,...,...,...,...,...,...
231775,A1ICREREXO9J81,B00KHECZXO,Frustrated gamer,"[0, 1]",Funny people on here are rating sellers that a...,5.0,this is for rating the system not the seller,1405814400,2014-07-20
231776,A3VVMIMMTYQV5F,B00KHECZXO,Johnny Saigon,"[8, 11]",All this is is the Deluxe 32GB Wii U with Mari...,1.0,Get the Other Bundle Which Includes Extra Whee...,1403308800,2014-06-21
231777,A1DD4B97M4DUC5,B00KHECZXO,migit,"[62, 66]",The package should have more red on it and sho...,1.0,Fake bundle,1401321600,2014-05-29
231778,A2Q9CNJ4T6ZK99,B00KHECZXO,"Philip Brown ""Philip & Chana""","[33, 36]",Can get this at Newegg for $329.00 and the pac...,1.0,Looks Like We Have Gougers Again.,1401667200,2014-06-02


<hr>

## Indexação

1. Utilizando o Whoosh, indexar o seu corpus e implementar uma função que apresente, para os top 10 documentos da resposta a uma consulta, trechos dos documentos contendo os termos da consulta.

Indexando o corpus

In [8]:
# Index schema: one entry per review column that gets indexed.
# `stored` fields come back with each hit; `sortable` fields can be used for sorting.
schema = Schema(
    reviewerID = TEXT(sortable = True),
    asin = TEXT(sortable = True, stored = True),
    reviewerName = TEXT(stored = True),
    reviewText = TEXT(stored = True),  # field that the search functions query and highlight
    overall = NUMERIC(sortable = True, stored = True),
    summary = TEXT(stored = True),
    unixReviewTime = NUMERIC(sortable = True, stored = True),
    reviewTime = TEXT(sortable = True, stored = True),
)

In [9]:
# create_in expects the target directory to exist, so create it first;
# exist_ok keeps re-runs of this cell from failing.
os.makedirs("indexdir", exist_ok = True)
myindex = create_in("indexdir", schema)
writer = myindex.writer()

In [10]:
# Add one document per review. Columns are read by name rather than by position,
# so the loop does not depend on the DataFrame's column order or on the deprecated
# integer-position lookup on a label-indexed Series. `helpful` is not in the schema
# and is therefore not indexed.
for review in df.itertuples(index = False):
    writer.add_document(reviewerID = review.reviewerID, asin = review.asin, reviewerName = review.reviewerName,
                        reviewText = review.reviewText, overall = review.overall, summary = review.summary,
                        unixReviewTime = review.unixReviewTime, reviewTime = review.reviewTime)

In [11]:
# Commit the writer so the documents added above are saved to the index.
writer.commit()

Realizando consultas

In [12]:
def busca(consulta, limit = 10):
    """Search the ``reviewText`` field of ``myindex`` using TF-IDF weighting.

    Args:
        consulta: Query string, parsed against the ``reviewText`` field.
        limit: Maximum number of hits to return.

    Returns:
        A list of ``(stored_fields, snippet)`` tuples, where ``stored_fields``
        is a dict built from the hit and ``snippet`` is the highlighted
        ``reviewText`` excerpt returned by ``hit.highlights``.
    """
    parser = QueryParser('reviewText', myindex.schema)
    parsed_query = parser.parse(consulta)

    found = []
    # Hits must be read while the searcher is still open.
    with myindex.searcher(weighting = scoring.TF_IDF()) as searcher:
        for hit in searcher.search(parsed_query, limit = limit):
            found.append((dict(hit), hit.highlights('reviewText')))
    return found

In [13]:
# Run a sample query; busca returns at most 10 hits by default.
res = busca('smash bros melee')

In [14]:
# Number of hits returned for the query.
len(res)

10

In [15]:
# Render the highlighted excerpt of up to the first 10 hits. Slicing instead of
# indexing 0..9 avoids an IndexError when the query returns fewer than 10 hits.
for _, snippet in res[:10]:
    display(HTML(snippet))

<hr>

2. Construir uma função de busca que aceite consultas frasais e facetação dos resultados.

In [16]:
def busca2(consulta, limit = 10, frasal = False, sort_by = None, reverse = None):
    """Search ``reviewText`` with optional phrase matching and sorted results.

    Args:
        consulta: Query string, parsed against the ``reviewText`` field.
        limit: Maximum number of hits to return.
        frasal: When True, the query is wrapped in double quotes before
            parsing so that it is treated as a phrase.
        sort_by: Field name, or list of field names, to sort by. The special
            name ``'score'`` sorts by the relevance score. None or an empty
            list keeps the searcher's default ordering.
        reverse: Boolean, or list of booleans parallel to ``sort_by``; True
            reverses the order for the corresponding field. The flag is not
            used for ``'score'`` entries.

    Returns:
        A list of ``(stored_fields, snippet)`` tuples, where ``stored_fields``
        is a dict built from the hit and ``snippet`` is the highlighted
        ``reviewText`` excerpt returned by ``hit.highlights``.

    Raises:
        ValueError: if ``sort_by`` and ``reverse`` do not have the same length.
    """
    # Normalise scalars to lists BEFORE comparing lengths; otherwise a call such as
    # sort_by = 'overall', reverse = True fails on len(True) instead of working.
    if sort_by is None:
        sort_by = []
    elif isinstance(sort_by, str):
        sort_by = [sort_by]
    else:
        sort_by = list(sort_by)

    if reverse is None:
        reverse = []
    elif isinstance(reverse, bool):
        reverse = [reverse]
    else:
        reverse = list(reverse)

    if len(sort_by) != len(reverse):
        # ValueError is a subclass of Exception, so existing handlers keep working.
        raise ValueError('Tamanho do sort_by deve ser igual ao tamanho do reverse')

    if frasal:
        consulta = '"' + consulta + '"'

    query = QueryParser('reviewText', myindex.schema).parse(consulta)

    facets = [
        sorting.ScoreFacet() if field == 'score' else sorting.FieldFacet(field, reverse = flag)
        for field, flag in zip(sort_by, reverse)
    ]

    # sortedby = None is the searcher's default, so a single code path covers
    # both the sorted and the unsorted case.
    with myindex.searcher(weighting = scoring.TF_IDF()) as searcher:
        hits = searcher.search(query, limit = limit, sortedby = facets or None)
        return [(dict(hit), hit.highlights('reviewText')) for hit in hits]

In [17]:
# Phrase search: with frasal = True, busca2 wraps the query in double quotes.
res = busca2('the legend of zelda: twilight princess', limit = 9999, frasal = True) 
len(res)

86

In [18]:
# Same query without phrase matching, for comparison with the previous cell.
res2 = busca2('the legend of zelda: twilight princess', limit = 9999, frasal = False) 
len(res2)

162

Podemos ver que, quando utilizamos **frasal = True**, a função retorna menos resultados, como esperado.

No entanto, o highlight acaba marcando todos os tokens, e não apenas a frase. Há uma issue sobre esse comportamento em:
https://github.com/whoosh-community/whoosh/issues/486.

In [19]:
# Render the highlighted excerpt of up to the first 10 hits. Slicing instead of
# indexing 0..9 avoids an IndexError when the query returns fewer than 10 hits.
for _, snippet in res[:10]:
    display(HTML(snippet))

Ordenando os resultados pela nota (maior pro menor) e data (mais recente para mais antigo)

In [20]:
# Sort by rating first and by review date second, both with reverse = True.
res = busca2('the legend of zelda twilight princess', limit = 9999, frasal = False, sort_by = ['overall', 'reviewTime'], 
             reverse = [True, True])

In [21]:
# Inspect the sorted hits.
res

[({'asin': 'B0009UBR3A',
   'overall': 5.0,
   'reviewText': "the legend of zelda twilight princess is the greatest video game in the universe a true classic the best of the best the legend of zelda twilight princess rocks the legend of zelda twilight princess storyline is the greatest storyline in the universe a true classic the best of the best the legend of zelda twilight princess storyline rocks metroid other m is the greatest video game in the universe true classic the best of the best metroid other m rocks nintendo is the greatest video game company in the universe there true genius's the best of the best nintendo rocks all the games that nintendo made over the years are the greatest games in the universe there true classics the best of the best all of nintendo games rocks",
   'reviewTime': '2014-04-25',
   'reviewerName': 'Ivan Orozco',
   'summary': 'the legend of zelda twilight princess is classic',
   'unixReviewTime': 1398384000},
  'the <b class="match term0">legend</b> of