In [81]:
%load_ext autoreload
%autoreload 2

import os
import re
import sys
import nltk   
import time
import glob
import re
import json, bson
import pathlib
import requests
import unicodedata
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import warp

agent = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
          'Accept-Language': 'pt-BR'}

stores_path = r"../../../Parte2/data/pages/*"
files_path  = stores_path + r"/*" 
stores      = [pathlib.Path(store).stem for store in glob.glob(stores_path)]
file_list   = sorted(glob.glob(files_path))


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read Files and Extract Informations

In [125]:
def to_float(value):
    s = re.sub("[^0-9,]", "", value);
    s = s.replace(',','.')
    if len(s):
        return float(s)
    else:
        return -1
    
def check_string(text):
    if len(text) <= 2:
        return False
    if text[:3] == 'var':
        return False
    if text[0] == u'\xa0':
        return False
    return True

In [53]:
dictionary  = []
pages_dict  = []


for page in tqdm(file_list):
    with open(page, "r", encoding='utf-8') as f:
        doc= f.read()

    s = BeautifulSoup(doc, "html.parser")
    for script in s(["script", "style"]):
        script.decompose()    # rip it out
    
    all_text = s.body.find_all(text=True)

    html_page = [x for x in all_text if check_string(x)]
    html_page = ' '.join(html_page)

    clean_text = re.sub(r'[^\w\s$]','', html_page.lower())
    clean_text = re.sub(r'[\n\t]','', clean_text)
    #to lower case

    clean_text = [text for text in clean_text.split(' ') if len(text) > 2]
    
    word_dict = defaultdict(lambda: 0)
    words, counts = np.unique(clean_text, return_counts=True)
    for word, count in zip(words, counts):
        word_dict[word] = count

    pages_dict.append(word_dict)

    dictionary += list(words)
dictionary = np.unique(dictionary)

HBox(children=(IntProgress(value=0, max=434), HTML(value='')))




## Create Inverted Index

In [54]:
inverted_index = {}
for key in tqdm(dictionary):
    
    inverted_index[key] = []
    for page_idx, page_dict in enumerate(pages_dict):
        count = page_dict[key]
        if count > 0 :
            inverted_index[key].append((int(page_idx), int(count)))

HBox(children=(IntProgress(value=0, max=20453), HTML(value='')))




## Save Inverted Index

In [55]:
save_path = '../../data/index_data/'
with open(save_path + 'inverted_index.json', 'w') as f:
    json.dump(inverted_index, f)

In [56]:
with open(save_path + 'inverted_index.bson', 'wb') as fp:
    fp.write(bson.dumps(inverted_index))

In [59]:
full_path = [os.path.abspath(store_path) for store_path in stores_path]
with open(save_path + 'store_path.json', 'w') as f:
    json.dump(full_path, f)
    
#with open('store_path.bson', 'wb') as fp:
#    fp.write(bson.dumps(dict(full_path)))

# Load Test

In [60]:
with open(save_path + 'inverted_index.bson', 'rb') as fp:
    inverted_index_bson = bson.loads(fp.read())


In [121]:
with open(save_path +'inverted_index.json') as json_file:
    inverted_index_json = json.load(json_file)

## Create Inverted Index With Low Small Data Compression

In [None]:
inverted_index_c = {}
for key in tqdm(dictionary):
    idx_count = 0
    inverted_index_c[key] = []
    for page_idx, page_dict in enumerate(pages_dict):
        count = page_dict[key]
        if count > 0 :
            inverted_index_c[key].append((int(idx_count), int(count)))
            idx_count = 0
        idx_count += 1

In [None]:
save_path = '../../data/index_data/'
with open(save_path + 'inverted_index_sc.json', 'w') as f:
    json.dump(inverted_index_c, f)

In [None]:
with open(save_path + 'inverted_index_sc.bson', 'wb') as fp:
    fp.write(bson.dumps(inverted_index_c))

## Most frequent Attributes

In [6]:
# Low case only, remenber toremove non-ascii

most_frequents = warp.most_frequents
ranges = {
    'price':[500, 1000, 2000, 5000, 999999],
    'ram':[2, 4, 8, 16],
    'hd':[16, 32, 64, 128, 256],
}


dict_keys(['price', 'model', 'ram', 'hd', 'screen'])

In [62]:
dict_list = []
for idx, url in enumerate(tqdm(file_list)):
    for store in stores:
        if store in url:
            function = getattr(warp,f'get_fields_{store}')
            dict_url = function(url)
            dict_url['store'] = store
            dict_url['idx']   = idx
            dict_list.append(dict_url)
            #print(dict_url)
            #print(idx, url, dict_url['Preço'])

HBox(children=(IntProgress(value=0, max=434), HTML(value='')))




## Create Inverted Index for Most Frequent Attributes

In [123]:
inverted_index = {}
last_value = 0
for enum_value in ranges.keys():
    for act_idx, target in enumerate(tqdm(ranges[enum_value])):
        if act_idx == 0:
            last_value = 0
        else:
            last_value = ranges[enum_value][act_idx - 1]
        index = f'{enum_value}.{target}'
        inverted_index[index] = []
        for page_idx, page_dict in enumerate(dict_list):
            price = to_float(page_dict[enum_value])

            if last_value < price < target :
                inverted_index[index].append((int(page_idx), (price)))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


