In [155]:
%load_ext autoreload
%autoreload 2

import re
import sys
import nltk   
import time
import glob
import json, bson
import pathlib
import requests
import unicodedata
import numpy as np
import pandas as pd
from collections import defaultdict

#from utils import *
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from tqdm import tqdm_notebook as tqdm

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

agent = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
          'Accept-Language': 'pt-BR'}

files_path = r"../../../Parte2/data/*/*" 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Most frequent Attributes

In [2]:
# Low case only, remenber toremove non-ascii
most_frequents = {
    'price':['preço', 'preco'],
    'model':['modelo'],
    'ram':['memoria ram', 'ram', 'memória RAM'],
    'hd':['armazenamento interno', 'memória interna', 'memoria interna', 'interna'],
    'screen':['tamanho da tela', 'tela', 'tamanho do display', 'display', 'tamanho']
}



## Read Files and Extract Informations

In [3]:
def check_string(text):
    if len(text) <= 2:
        return False
    if text[:3] == 'var':
        return False
    if text[0] == u'\xa0':
        return False
    return True

In [97]:
stores_path = glob.glob(files_path)
dictionary  = []
pages_dict  = []


for page in tqdm(stores_path):
    with open(page, "r", encoding='utf-8') as f:
        doc= f.read()

    s = BeautifulSoup(doc, "html.parser")
    for script in s(["script", "style"]):
        script.decompose()    # rip it out
    
    all_text = s.body.find_all(text=True)

    html_page = [x for x in all_text if check_string(x)]
    html_page = ' '.join(html_page)

    clean_text = re.sub(r'[^\w\s$]','', html_page.lower())
    clean_text = re.sub(r'[\n\t]','', clean_text)
    #to lower case

    clean_text = [text for text in clean_text.split(' ') if len(text) > 2]
    
    word_dict = defaultdict(lambda: 0)
    words, counts = np.unique(clean_text, return_counts=True)
    for word, count in zip(words, counts):
        word_dict[word] = count

    pages_dict.append(word_dict)

    dictionary += list(words)
dictionary = np.unique(dictionary)

HBox(children=(IntProgress(value=0, max=434), HTML(value='')))




## Create Inverted Index

In [189]:
inverted_index = {}
for key in tqdm(dictionary):
    
    inverted_index[key] = []
    for page_idx, page_dict in enumerate(pages_dict):
        count = page_dict[key]
        if count > 0 :
            inverted_index[key].append((int(page_idx), int(count)))

HBox(children=(IntProgress(value=0, max=20453), HTML(value='')))




## Save Inverted Index

In [190]:
save_path = '../../data/index_data/'
with open(save_path + 'inverted_index.json', 'w') as f:
    json.dump(inverted_index, f)

In [191]:
with open(save_path + 'inverted_index.bson', 'wb') as fp:
    fp.write(bson.dumps(inverted_index))

In [176]:
full_path = [os.path.abspath(store_path) for store_path in stores_path]
with open(save_path + 'store_path.json', 'w') as f:
    json.dump(full_path, f)
    
#with open('store_path.bson', 'wb') as fp:
#    fp.write(bson.dumps(dict(full_path)))

# Load Test

In [177]:
with open(save_path + 'inverted_index.bson', 'rb') as fp:
    inverted_index_bson = bson.loads(fp.read())


In [178]:
with open(save_path +'inverted_index.json') as json_file:
    inverted_index_json = json.load(json_file)

## Create Inverted Index With Low Small Data Compression

In [192]:
inverted_index_c = {}
for key in tqdm(dictionary):
    idx_count = 0
    inverted_index_c[key] = []
    for page_idx, page_dict in enumerate(pages_dict):
        count = page_dict[key]
        if count > 0 :
            inverted_index_c[key].append((int(idx_count), int(count)))
            idx_count = 0
        idx_count += 1

HBox(children=(IntProgress(value=0, max=20453), HTML(value='')))




In [193]:
save_path = '../../data/index_data/'
with open(save_path + 'inverted_index_sc.json', 'w') as f:
    json.dump(inverted_index_c, f)

In [194]:
with open(save_path + 'inverted_index_sc.bson', 'wb') as fp:
    fp.write(bson.dumps(inverted_index_c))