In [284]:
import time
import pathlib
import sys
import os
import json
import math
import webbrowser
from operator import itemgetter
from tqdm import tqdm_notebook as tqdm
from IPython.display import clear_output
from warp import *

# Some global variables to use all the way in the code
stop_words = ["a", "o", "ao", "de", "da", "do", "um", "uma", "uns", "pra", "para", "por", "com", "gb"]
INVERTED_INDEX_PATH = "../../data/index_data/inverted_index.json"
INVERTED_INDEX_ATT_PATH = "../../data/index_data/inverted_index_att.json"
FULL_INFO_PATH = "../../data/index_data/full_info.json"
STORE_PATH = "../../data/index_data/store_path.json"
K = 5

## Auxiliar Functions

In [313]:
def load_json_file(path_to_file):
    """
    Loads a json file and return the dictionary.
    
    Parameters:
    path_to_file(str): Path to the json file
    
    Returns:
    dictionary: Contains the information that is stored in the json file as a dictionary
    """
    dictionary = {}
    with open(path_to_file) as json_file:
        dictionary = json.load(json_file)
    return dictionary

def cossine_score(query, inverted_index, weight_bool=False):
    # dictionary to the save the score of each page
    docs_scores = {}
    
    # adds the scores to each page ID (term-at-a-time)
    for word in query:
        if word in inverted_index:
            w_tq = idf(inverted_index[word], NUMBER_OF_DOCUMENTS)
            print("{}: {}".format(word, w_tq))
            for doc_sc in inverted_index[word]:                    
                tf = (1 + math.log(doc_sc[1]) if weight_bool else doc_sc[1])
                wf_td = (tf * w_tq if weight_bool else tf)
                if doc_sc[0] in docs_scores:
                    docs_scores[doc_sc[0]] += wf_td * w_tq
                else:                    
                    docs_scores[doc_sc[0]]  = wf_td * w_tq
    
    ranked = sorted(docs_scores.items(), key=itemgetter(1), reverse=True)
    print(ranked)

    urls = []
    full_info = load_json_file(FULL_INFO_PATH)
    
    for i in range(0,K):
        doc = ranked[i][0]
        for info in full_info:
            if(info["idx"] == doc):
                page = info["url"].split("/")
                page = page[-2] + "/" + page[-1]
                urls.append(page)
    
    return urls

def idf(doc_list, N):
    return math.log(1 + N/len(doc_list))

def show_in_browser(pages_list):
    pages_path = os.getcwd()
    idx_cut    = pages_path.find("query/")
    pages_path = pages_path[:idx_cut] + "data/pages/"
    
    print("\ns - Para sair")
    print("a - Abre todas as páginas do ranking\n")
    cmd = ''
    while(cmd.lower() != 's'):
        cmd = input("Qual página do ranking você deseja abrir (1 - {})? ".format(len(pages_list)))
        try:
            filename = 'file:///' + pages_path + pages_list[int(cmd)-1]
            webbrowser.open_new_tab(filename)
        except:
            if(cmd.lower() == 's'):
                break
            elif(cmd.lower() == 'a'):
                for page in pages_list:
                    filename = 'file:///' + pages_path + page
                    webbrowser.open_new_tab(filename)
            else:
                print("Página inválida")

In [314]:
# Number of documents in collection
NUMBER_OF_DOCUMENTS = len(load_json_file(STORE_PATH))

## Search using a single string

In [318]:
def search_by_keywords():
    query = input("Digite o smartphone ou características do smartphone que deseja pesquisar: ")
    query = query.lower().split(" ")

    # filter stop words and loads json file
    query = [word for word in query if (word not in stop_words)]
    inverted_index = load_json_file(INVERTED_INDEX_PATH)
    
    list_no_weight = cossine_score(query, inverted_index, False)
    list_weight = cossine_score(query, inverted_index, True)
    
    print("-------No weighting-------")
    print(*list_no_weight, sep = "\n")
    show_in_browser(list_no_weight)
    
    print("\n-------With weighting-------")
    print(*list_weight, sep = "\n")
    show_in_browser(list_weight)
    
    # print(rank_docs(docs_scores))

In [319]:
search_by_keywords()

Digite o smartphone ou características do smartphone que deseja pesquisar: iphone
iphone: 0.9360933591703348
[(183, 30.891080852621048), (404, 20.594053901747365), (405, 20.594053901747365), (406, 20.594053901747365), (407, 20.594053901747365), (408, 20.594053901747365), (433, 20.594053901747365), (402, 18.721867183406694), (403, 18.721867183406694), (430, 18.721867183406694), (431, 18.721867183406694), (432, 18.721867183406694), (181, 16.849680465066026), (186, 16.849680465066026), (187, 16.849680465066026), (192, 16.849680465066026), (411, 16.849680465066026), (412, 16.849680465066026), (413, 16.849680465066026), (418, 16.849680465066026), (419, 16.849680465066026), (420, 16.849680465066026), (421, 16.849680465066026), (178, 15.91358710589569), (184, 15.91358710589569), (185, 15.91358710589569), (415, 15.91358710589569), (416, 15.91358710589569), (417, 15.91358710589569), (206, 14.977493746725356), (207, 14.977493746725356), (208, 14.977493746725356), (409, 14.041400387555022), (410,

Qual página do ranking você deseja abrir (1 - 5)? s

-------With weighting-------
ibyte/2.html
taqi/10.html
taqi/11.html
taqi/12.html
taqi/13.html

s - Para sair
a - Abre todas as páginas do ranking

Qual página do ranking você deseja abrir (1 - 5)? s


## Search using attributes and values

In [300]:
def search_by_attributes():
    product = {
        "price" :"",
        "model" :"",
        "ram"   :"", 
        "hd"    :"", 
        "screen":""
    } 

    product["price"]  = input("Insira a faixa de preço do smartphone (Ex: R$1000,00 - R$2000,00): ")
    product["model"]  = input("Insira o modelo do smartphone: ")
    product["ram"]    = input("Informe a faixa de RAM do smartphone (Ex: 4GB - 8GB: ")
    product["hd"]     = input("Insira a faixa de capacidade de armazenamento desejada (Ex: 32GB - 64GB): ")
    product["screen"] = input("""Informe o menor e maior tamanho de tela (Ex: 4,7" - 5,2": """)

## User's Menu

In [301]:
CMD = -1;
while(CMD != 0):
    print("1 - Para fazer uma busca através de palavras chaves")
    print("2 - Para uma busca através de atributos específicos")
    print("3 - Para limpar a tela")
    print("0 - Para sair")
    CMD = int(input())
    
    if(CMD == 0):
        pass
    elif(CMD == 1):
        search_by_keywords()
    elif(CMD == 2):
        search_by_attributes()
    elif(CMD == 3):
        clear_output()
    else:
        print("Comando inválido")

1 - Para fazer uma busca através de palavras chaves
2 - Para uma busca através de atributos específicos
3 - Para limpar a tela
0 - Para sair
1
Digite o smartphone ou características do smartphone que deseja pesquisar: samsung 32gb
-------No weighting-------
magazineluiza/23.html
kabum/23.html
avenida/0.html
colombo/19.html
kabum/25.html

s - Para sair
a - Abre todas as páginas do ranking

Qual página do ranking você deseja abrir (1 - 5)? a
Qual página do ranking você deseja abrir (1 - 5)? s

-------With weighting-------
magazineluiza/23.html
kabum/23.html
avenida/0.html
colombo/19.html
kabum/25.html

s - Para sair
a - Abre todas as páginas do ranking



KeyboardInterrupt: 

In [51]:
math.log2(0)

ValueError: math domain error

In [235]:
pages_path = os.getcwd()
idx_cut    = pages_path.find("query/")
pages_path = pages_path[:idx_cut]


'/Users/Matheus/Documents/Git/Faculdade/IF962 - Recuperação da informação/RI_smartphone/Parte2/'