In [1]:
import numpy as np
import pandas as pd
import random

from functions.hyperloglog import HashLogLog, HyperLogLog
from functions.kmeans import KMeans
from functions.data_preprocess import remove_html_tags, clean_data

[nltk_data] Downloading package wordnet to /home/michele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/michele/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/michele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Hashing

## Sequentially adding data to the HLL

In [2]:
# log2m: number of hash bits used to pick a bucket (2**6 = 64 registers);
# bits: total number of hash bits handed to the sketch.
log2m = 6
bits = 32

# Build the sketch object, then stream 'hash.txt' through it to obtain the
# per-bucket register values.
HLL_6 = HyperLogLog(log2m=log2m, bits=bits)
hll_6 = HLL_6.structure('hash.txt')

print(hll_6)

[23, 21, 23, 21, 23, 25, 22, 23, 23, 20, 21, 23, 21, 20, 22, 22, 21, 26, 20, 21, 26, 22, 21, 22, 22, 22, 20, 22, 25, 23, 26, 22, 24, 21, 21, 21, 21, 20, 20, 21, 24, 21, 24, 25, 22, 21, 25, 25, 25, 22, 20, 24, 22, 26, 20, 24, 24, 26, 21, 26, 20, 25, 22, 21]


## Cardinality and (relative) error of the filter

In [3]:
# Cardinality estimate derived from the 64 registers, and the error figure
# reported by the sketch itself.
estimate_6 = HLL_6.cardinality(hll_6)
error_6 = HLL_6.error()

# One print call; sep='\n' keeps the two reports on separate lines.
print(
    'Total bits: {}\nBits for the buckets: {}'.format(bits, log2m),
    'Estimate: {}\nError: {}'.format(estimate_6, error_6),
    sep='\n',
)

Total bits: 32
Bits for the buckets: 6
Estimate: 139604896
Error: 0.13


### Observation
Maybe we can do a little better by increasing the length of the root (the bits used to select the bucket) to 11 bits. This seems to be a good trade-off between error and efficiency!

In [4]:
# Repeat the experiment with an 11-bit bucket index; the hash width is unchanged.
# NOTE: this rebinds the notebook-level `log2m` and `bits` set by the 6-bit cell.
log2m = 11
bits = 32

HLL_11 = HyperLogLog(log2m=log2m, bits=bits)
hll_11 = HLL_11.structure('hash.txt')

estimate_11 = HLL_11.cardinality(hll_11)
error_11 = HLL_11.error()

# One print call; sep='\n' keeps the two reports on separate lines.
print(
    'Total bits: {}\nBits for the buckets: {}'.format(bits, log2m),
    'Estimate: {}\nError: {:.3f}'.format(estimate_11, error_11),
    sep='\n',
)

Total bits: 32
Bits for the buckets: 11
Estimate: 123189077
Error: 0.023


# Real error
To compute the real error of the algorithm we used, we have to count the exact number of unique elements that appear in the `hash.txt` file. There are several ways to do that; one of them is to use shell commands, and that's what we've chosen to do.

In [5]:
!sort hash.txt | uniq | wc -l

125000000


# 2. Clustering

## 2.0 Preprocess data

In [2]:
# Location of the raw reviews file, relative to the notebook's working directory.
data_path = './data/Reviews.csv'

# Load the whole CSV into memory with pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Converting the reviews to lowercase
# (only the 'Text' column is touched; HTML tags are stripped in a later cell)

data['Text'] = data['Text'].str.lower()

In [10]:
# Removing html tags
# Applies remove_html_tags (from functions.data_preprocess) to every entry of
# the already-lowercased 'Text' column and overwrites the column with the result.
# NOTE(review): the helper's handling of missing (NaN) reviews is not visible here — confirm.

data['Text'] = data['Text'].apply(remove_html_tags)

In [12]:
# Removing punctuation, special characters, digits
# Tokenizing, lemmatizing the data
# (all of this is delegated to clean_data from functions.data_preprocess;
#  whether it returns a string or a list of tokens is not visible here — TODO confirm)
# NOTE: the column is overwritten with its own cleaned version, so re-running
# this cell feeds already-cleaned values back into clean_data.

data['Text'] = data['Text'].apply(clean_data)

In [13]:
# Persist the cleaned frame. index=True is the pandas default, spelled out here
# because it means the row index is written as an extra, unnamed first column.
data.to_csv('./data/clean_data.csv', index=True)