In [1]:
import time
import pathlib
import sys
import os
import json
import math
import webbrowser
from operator import itemgetter
from tqdm import tqdm_notebook as tqdm
from IPython.display import clear_output
from warp import *

# Global configuration shared by the rest of the notebook.
# Words removed from a keyword query before it is scored (used by search_by_keywords).
stop_words = ["a", "o", "ao", "de", "da", "do", "um", "uma", "uns", "pra", "para", "por", "com", "gb"]
# JSON file loaded as the inverted index for the keyword search (path relative to the working directory).
INVERTED_INDEX_PATH = "../../data/index_data/inverted_index.json"
# JSON file loaded by search_by_attributes, which splits each key on "." into a field and a value.
INVERTED_INDEX_ATT_PATH = "../../data/index_data/inverted_index_att.json"
# JSON file whose entries are read through their "idx" and "url" keys by get_pages_urls.
FULL_INFO_PATH = "../../data/index_data/full_info.json"
# JSON file whose length is taken as the number of documents in the collection.
STORE_PATH = "../../data/index_data/store_path.json"
# Number of top-ranked pages whose URLs get_pages_urls returns.
K = 10

## Auxiliary Functions

In [44]:
def load_json_file(path_to_file):
    """
    Loads a json file and returns its deserialized content.

    Parameters:
    path_to_file(str): Path to the json file

    Returns:
    dictionary: The information stored in the json file (whatever top-level
        value the file holds; the callers in this notebook index it or take
        its length)

    Raises:
    OSError: If the file cannot be opened
    json.JSONDecodeError: If the file does not contain valid JSON
    """
    # JSON is UTF-8 text; without an explicit encoding open() falls back to the
    # platform default, which breaks on accented characters on some systems.
    with open(path_to_file, encoding="utf-8") as json_file:
        return json.load(json_file)

def cossine_score(query, inverted_index, weight_bool=False):
    """
    Scores every document that contains at least one query word and ranks
    the documents by descending score (term-at-a-time accumulation).

    Parameters:
    query(list): Words of the query
    inverted_index(dict): Maps a word to its list of postings; each posting is
        read as posting[0] (document id) and posting[1] (term frequency)
    weight_bool(bool): When False the raw term frequency is multiplied by the
        word's idf. When True the frequency is first dampened to 1 + log(tf)
        and multiplied by the idf twice (document weight times query weight).

    Returns:
    tuple: (urls, ranked) where ranked is the list of (document id, score)
        pairs sorted by descending score and urls is the result of
        get_pages_urls(ranked). When no query word is present in the index a
        message is printed and ([], []) is returned, so callers can always
        unpack two values.
    """
    # accumulated score of each document id
    docs_scores = {}
    words_in_index = 0

    for word in query:
        if word not in inverted_index:
            continue
        words_in_index += 1
        postings = inverted_index[word]
        # NUMBER_OF_DOCUMENTS is a notebook-level global defined in a later cell
        w_tq = idf(postings, NUMBER_OF_DOCUMENTS)
        for doc_sc in postings:
            if weight_bool:
                wf_td = (1 + math.log(doc_sc[1])) * w_tq
            else:
                wf_td = doc_sc[1]
            docs_scores[doc_sc[0]] = docs_scores.get(doc_sc[0], 0) + wf_td * w_tq

    if words_in_index < 1:
        print("Não foi possível encontrar nenhum item com essa cosulta, tente novamente.")
        # Same shape as the normal return value; a bare [] cannot be unpacked
        # into (urls, ranked) by the caller.
        return ([], [])

    ranked = sorted(docs_scores.items(), key=itemgetter(1), reverse=True)
    urls = get_pages_urls(ranked)
    return (urls, ranked)

def spearman(rank1, rank2):
    """
    Computes the Spearman coefficient between two rankings as
    1 - 6 * sum(d^2) / (k * (k^2 - 1)), where k is len(rank1) and d is the
    difference between the positions of the same item in both rankings.

    Parameters:
    rank1(list): Ranking whose entries carry the item id at index 0
    rank2(list): Ranking in the same format; items of rank1 that are missing
        from rank2 contribute nothing to the sum. Item ids are assumed to be
        unique within rank2 (they are when the ranking comes from dict items).

    Returns:
    float: The coefficient. Rankings with fewer than two items return 1.0,
        because the formula's denominator is zero for k < 2.
    """
    k = len(rank1)
    if k < 2:
        # k * (k^2 - 1) == 0 here; a ranking this short cannot disagree
        return 1.0

    # position of every item in rank2, so each lookup is O(1) instead of a scan
    position_in_rank2 = {elem[0]: j for j, elem in enumerate(rank2)}

    diff = 0
    for i, elem in enumerate(rank1):
        j = position_in_rank2.get(elem[0])
        if j is not None:
            diff += (i - j) ** 2

    return 1 - ((6 * diff) / (k * (k**2 - 1)))

def rank_by_attributes(query, inverted_index, product):
    """
    Placeholder for the attribute-based ranking: it walks the postings of the
    query words but does not compute or return any score yet.

    Parameters:
    query(list): Words to look up in the index
    inverted_index(dict): Maps a word to an iterable of postings
    product(dict): Indexed by query word; the stored value is compared against
        the attribute names 'price', 'model', 'ram', 'hd' and 'screen'

    Returns:
    None

    Raises:
    KeyError: If a query word is in inverted_index (with at least one posting)
        but is not a key of product
    """
    # Per-attribute weights; declared here but not used by the loop below yet.
    weight_model = 3
    weight_price = 1
    weight_ram = 2
    weight_hd = 2
    weight_screen = 1

    # Intended score accumulator; nothing is written to it yet.
    docs_scores = {}

    attribute_names = ('price', 'model', 'ram', 'hd', 'screen')
    for term in query:
        if term not in inverted_index:
            continue
        for _posting in inverted_index[term]:
            for attribute in attribute_names:
                if product[term] == attribute:
                    # TODO: scoring for this attribute is not implemented
                    pass
            
    
def get_pages_urls(ranked):
    """
    Returns the shortened URLs of the top K ranked documents.

    Parameters:
    ranked(list): Ranking whose entries carry the document id at index 0,
        best document first

    Returns:
    list: For each of the first K entries (or fewer, when the ranking is
        shorter than K), the last two "/"-separated segments of the URL of
        every full-info record whose "idx" equals the document id
    """
    full_info = load_json_file(FULL_INFO_PATH)
    urls = []

    # Slicing instead of range(0, K) keeps a ranking with fewer than K
    # documents from raising IndexError.
    for entry in ranked[:K]:
        doc = entry[0]
        for info in full_info:
            if info["idx"] == doc:
                page = info["url"].split("/")
                urls.append(page[-2] + "/" + page[-1])
    return urls

def idf(doc_list, N):
    """
    Inverse document frequency of a term.

    Parameters:
    doc_list(list): Postings of the term; its length is the document frequency
    N(int): Number of documents in the collection

    Returns:
    float: log(1 + N / len(doc_list))
    """
    document_frequency = len(doc_list)
    ratio = N / document_frequency
    return math.log(1 + ratio)

def show_in_browser(pages_list):
    """
    Interactive prompt that opens ranked pages in the web browser.

    The user types a position (1 to len(pages_list)) to open that page in a
    new tab, 'a' to open every page, or 's' to leave the prompt.

    Parameters:
    pages_list(list): Page paths relative to the "data/pages/" directory,
        best page first

    Returns:
    None
    """
    pages_path = os.getcwd()
    # NOTE(review): this assumes the working directory path contains "query/";
    # when it does not, find() returns -1 and the slice below only drops the
    # last character of the path. Confirm the expected directory layout.
    idx_cut    = pages_path.find("query/")
    pages_path = pages_path[:idx_cut] + "data/pages/"

    print("\ns - Para sair")
    print("a - Abre todas as páginas do ranking\n")

    while True:
        cmd = input("Qual página do ranking você deseja abrir (1 - {})? ".format(len(pages_list))).strip()
        option = cmd.lower()

        if option == 's':
            break

        if option == 'a':
            for page in pages_list:
                webbrowser.open_new_tab('file:///' + pages_path + page)
            continue

        # Only a failed number parse counts as invalid input; a bare except
        # would also swallow KeyboardInterrupt and make the loop unstoppable.
        try:
            position = int(cmd)
        except ValueError:
            print("Página inválida")
            continue

        # Reject 0 and negative numbers explicitly: as list indices they would
        # silently open a page counted from the end of the ranking.
        if not 1 <= position <= len(pages_list):
            print("Página inválida")
            continue

        webbrowser.open_new_tab('file:///' + pages_path + pages_list[position - 1])


In [45]:
# Number of documents in the collection: the length of the JSON content stored at STORE_PATH.
# cossine_score reads this global when it computes idf, so this cell has to run before any search.
NUMBER_OF_DOCUMENTS = len(load_json_file(STORE_PATH))

## Search using a single string

In [50]:
def search_by_keywords():
    """
    Prompts for a free-text query and prints the top pages ranked without and
    with weighting, plus the Spearman coefficient between both rankings.

    The query is lower-cased, split on whitespace and stripped of stop words
    before it is scored against the inverted index loaded from
    INVERTED_INDEX_PATH.

    Returns:
    None
    """
    query = input("Digite o smartphone ou características do smartphone que deseja pesquisar: ")

    # split() without an argument also discards the empty strings that
    # repeated or trailing spaces would otherwise produce
    query = [word for word in query.lower().split() if word not in stop_words]
    inverted_index = load_json_file(INVERTED_INDEX_PATH)

    # cossine_score signals "no query word found" with an empty result;
    # `or ([], [])` keeps the unpacking valid even when that result is a bare list.
    list_no_weight, rank_no_weight = cossine_score(query, inverted_index, False) or ([], [])
    if not rank_no_weight:
        # nothing matched; cossine_score already told the user
        return
    list_weight, rank_weight = cossine_score(query, inverted_index, True) or ([], [])

    print("Coeficiente de Spearman: {}".format(spearman(rank_no_weight, rank_weight)))

    # each section is guarded by its own list
    if len(list_no_weight) > 0:
        print("-------No weighting-------")
        print(*list_no_weight, sep = "\n")

    if len(list_weight) > 0:
        print("\n-------With weighting-------")
        print(*list_weight, sep = "\n")

In [None]:
# Runs one interactive keyword search; the function asks for the query through input().
search_by_keywords()

## Search using attributes and values

In [51]:
def _parse_number(text):
    """Returns text converted to float, or None when it is blank or not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def search_by_attributes():
    """
    Prompts for the desired price, model, RAM, storage and screen size, selects
    the matching entries of the attribute index and hands them to
    rank_by_attributes.

    Keys of the attribute index are read as "<field>.<value>":
    - for 'model', the entry with the smallest nomr_l(typed model, value)
      below 1 is kept;
    - for 'price', 'ram', 'hd' and 'screen', the first entry (in the index's
      iteration order) whose value is greater than the typed number is kept.
    Numeric fields left blank or filled with non-numeric text are skipped.

    Returns:
    The value returned by rank_by_attributes (currently None, since that
    function is still a placeholder).
    """
    product = {
        "price" : input("Insira a faixa de preço do smartphone: ").strip(),
        "model" : input("Insira o modelo do smartphone: ").strip(),
        "ram"   : input("Informe a faixa de RAM do smartphone: ").strip(),
        "hd"    : input("Insira a faixa de capacidade de armazenamento desejada: ").strip(),
        "screen": input("Informe o tamanho de tela: ").strip(),
    }

    inverted_att_index = load_json_file(INVERTED_INDEX_ATT_PATH)

    # float instead of int so that decimal values such as "5.5" are accepted
    wanted = {field: _parse_number(product[field]) for field in ("price", "ram", "hd", "screen")}
    # numeric fields that already received their index entry
    resolved = set()

    # NOTE(review): entries are keyed by the typed value, so two fields filled
    # with the same text overwrite each other. Confirm whether that is intended.
    inverted_index = {}
    min_loss = 1

    for key, postings in inverted_att_index.items():
        # everything after the first "." is the value, which may contain dots
        field, separator, value = key.partition(".")
        if not separator:
            continue

        if field == 'model':
            # NOTE(review): nomr_l is not defined in this notebook; presumably it
            # comes from `from warp import *` and returns a normalized distance
            # where smaller means closer. TODO confirm.
            loss = nomr_l(product[field], value)
            if loss < min_loss:
                min_loss = loss
                inverted_index[product[field]] = postings

        elif field in wanted and field not in resolved:
            typed = wanted[field]
            limit = _parse_number(value)
            if typed is not None and limit is not None and typed < limit:
                inverted_index[product[field]] = postings
                resolved.add(field)

    # rank_by_attributes needs its three arguments and only makes sense once the
    # whole index has been filtered, so it is called once, after the loop.
    # It indexes its third argument by query word and compares the result with
    # a field name, hence the value -> field mapping.
    query = list(inverted_index)
    field_by_value = {value: field for field, value in product.items()}
    return rank_by_attributes(query, inverted_index, field_by_value)

In [None]:
# Runs one interactive attribute search; the function asks for each attribute through input().
search_by_attributes()

# User's Menu

In [54]:
# Interactive menu: keeps asking for a command until the user enters 0.
CMD = -1
while CMD != 0:
    print("1 - Para fazer uma busca através de palavras chaves")
    print("2 - Para uma busca através de atributos específicos")
    print("3 - Para limpar a tela")
    print("0 - Para sair")

    try:
        CMD = int(input())
    except ValueError:
        # Non-numeric input is reported as an invalid command instead of
        # ending the menu with a traceback.
        CMD = -1
        print("Comando inválido")
        continue

    if CMD == 1:
        search_by_keywords()
    elif CMD == 2:
        search_by_attributes()
    elif CMD == 3:
        clear_output()
    elif CMD != 0:
        print("Comando inválido")

1 - Para fazer uma busca através de palavras chaves
2 - Para uma busca através de atributos específicos
3 - Para limpar a tela
0 - Para sair
1
Digite o smartphone ou características do smartphone que deseja pesquisar: iphone 8
idf_t iphone: 0.9360933591703348
idf_t iphone: 0.9360933591703348
Coeficiente de Spearman: 1.0
-------No weighting-------
ibyte/2.html
taqi/10.html
taqi/11.html
taqi/12.html
taqi/13.html
taqi/14.html
taqi/9.html
taqi/0.html
taqi/1.html
taqi/6.html

-------With weighting-------
ibyte/2.html
taqi/10.html
taqi/11.html
taqi/12.html
taqi/13.html
taqi/14.html
taqi/9.html
taqi/0.html
taqi/1.html
taqi/6.html
1 - Para fazer uma busca através de palavras chaves
2 - Para uma busca através de atributos específicos
3 - Para limpar a tela
0 - Para sair
1
Digite o smartphone ou características do smartphone que deseja pesquisar: samsung S9
idf_t samsung: 0.9560789819325678
idf_t samsung: 0.9560789819325678
Coeficiente de Spearman: 1.0
-------No weighting-------
magazineluiza/23