# Load Packages

In [1]:
import io
import json
import numpy as np
import unidecode
import re
import sys

# Fix seed
np.random.seed(100)

# Preprocessing

In [2]:
# Regex for removing all non-alphabet letters or spaces
regex = re.compile('[^a-zA-Z ]')

def preprocessing(documents):
    for i in range(len(documents)):
        # Remove all accents
        documents[i] = unidecode.unidecode(documents[i])
        
        # Remove all non-alphabet letters or spaces
        documents[i] = regex.sub(' ', documents[i])
        
        # Remove extra spaces
        documents[i] = ' '.join([token for token in documents[i].split(' ') if token])
        
        # To lower
        documents[i] = documents[i].lower()
    return documents

# Finding Similar Items

## Utils

### Universal Hashing

In [3]:
def universal_hash(prime, size):
    # Check values
    if prime < size:
        raise ValueError("Prime number should be greater than size")
    
    # Generate random values
    a = np.random.randint(1, prime)
    b = np.random.randint(1, prime)
    
    # Return hash function
    return lambda x: ((a * x + b) % prime) % size

## Shingling

In [4]:
def to_shingles(documents, k, padding='#'):
    # Map keeping the "Hash" of the shingles
    shingle_map = {}
    idx = 0
    
    # New document structure
    doc_shingles = []
    for i, document in enumerate(documents):
        shingles = set()
        # Split each document in k-shingles
        for j in range(0, len(document), k):
            # Get shingle
            shingle = document[j:j+k]            
            
            # For the last Shingle, if necessary add padding
            if j + k > len(document):
                shingle += padding * (k - len(shingle))
            
            # For efficience purposes, apply hash
            if shingle in shingle_map:
                hashed_shingle = shingle_map[shingle]
            else:
                shingle_map[shingle] = idx
                hashed_shingle = idx
                idx += 1
            
            # Append to set of shingles
            shingles.add(hashed_shingle)
            
        # Attribute to document
        doc_shingles.append(shingles)
        
    return doc_shingles, shingle_map

## Min-Hashing

In [5]:
def compute_min_hashing(documents, shingles_size, k, prime=2**61-1):
    # Instantiate hash methods to be used as permutations
    hash_methods = [universal_hash(prime, shingles_size)
                    for i in range(k)]
    
    # Signature of each document
    signatures = [[sys.maxsize
                   for j in range(k)]
                  for i in range(len(documents))]
    
    # Each shingle for each document just need to be computed once
    computed = [set() for i in range(len(documents))]
    
    for i, document in enumerate(documents):
        for shingle in document:
            # Shingle already computed
            if shingle in computed[i]:
                continue
            
            # Compute hash for shingle
            computed[i].add(shingle)
            for j, hash_method in enumerate(hash_methods):
                hash_value = hash_method(shingle)
                
                # Check if "permutation position" is lower
                if hash_value < signatures[i][j]:
                    signatures[i][j] = hash_value
    
    # Return signature of all documents
    return signatures

## Locality-Sensitive Hashing

In [6]:
def compute_lsh(signatures, rows, bands, threshold, prime=2**61-1):
    # Make n_buckets as large as possible
    # For now, we will use "1GB"
    n_buckets = int(10**9 / bands)
    
    # Instantiate hash methods
    hash_methods = [universal_hash(prime, n_buckets)
                    for i in range(bands)]
    
    # Buckets for all hashes
    hash_buckets = [{} for i in range(bands)]
    
    for i, signature in enumerate(signatures):
        for j in range(0, len(signature), rows):
            # Get mini signature
            mini_signature = signature[j:j+rows]
            
            # "Merge" entries of vector
            value = 1
            for item in mini_signature:
                value *= item
                
            # Compute hash/bucket for the band
            for k, hash_method in enumerate(hash_methods):
                hash_value = hash_method(value)
                
                if hash_value in hash_buckets[k]:
                    hash_buckets[k][hash_value].append(i)
                else:
                    hash_buckets[k][hash_value] = [i]
    
    # Find all candidates
    candidates = set()
    for hash_bucket in hash_buckets:
        for bucket, values in hash_bucket.items():
            # Only interested in pairs
            if len(values) < 2:
                continue
            
            # Check if items are candidates (> threshold)
            for i in range(0, len(values)):
                for j in range(i + 1, len(values)):
                    equal_values = 0
                    for k in range(rows * bands):
                        if signatures[values[i]][k] == signatures[values[j]][k]:
                            equal_values += 1
                    if equal_values >= threshold * rows * bands:
                        # Keep order, so we can eliminate duplicates
                        if values[i] > values[j]:
                            candidates.add((values[j], values[i]))
                        else:
                            candidates.add((values[i], values[j]))
    
    return list(candidates)

# Experiments

## Parameters

In [7]:
data_path = "../datasets/development.json"

## Hyperparameters

In [8]:
k = 5
n_rows = 20
n_bands = 5
n_hashes = n_rows * n_bands
threshold = 0.999

## Load dataset

In [9]:
data_reader = io.open(data_path, mode="r", encoding="utf-8")

# Go to beginning
data_reader.seek(0)

# Parse all text from json
documents = [document['description']
            for document in json.loads(data_reader.readline())]

data_reader.close()

## Run

In [10]:
parsed_documents = preprocessing(documents)

In [11]:
documents_shingles, map_shingles = to_shingles(parsed_documents, k)

In [12]:
signatures = compute_min_hashing(documents_shingles, len(map_shingles), n_hashes)

In [13]:
candidates = compute_lsh(signatures, n_rows, n_bands, threshold)

## Analysis

In [14]:
len(map_shingles)

45678

In [15]:
len(candidates)

3389

In [26]:
for i in range(2):
    idx = np.random.randint(0, len(candidates))
    item_a = candidates[idx][0]
    item_b = candidates[idx][1]
    print('%s:\t%s' % (item_a, documents[item_a]))
    print('%s:\t%s' % (item_b, documents[item_b]))
    print()

535:	atividades profissionais elaborar e emitir desenhos relativos aos projetos de produtos maquinas e ferramentas desenvolver desenhos em autocad solidworks e outras ferramentas elaborar fluxogramas layouts pecas para fabricacao e documentacao tecnica
5923:	atividades profissionais elaborar e emitir desenhos relativos aos projetos de produtos maquinas e ferramentas desenvolver desenhos em autocad solidworks e outras ferramentas elaborar fluxogramas layouts pecas para fabricacao e documentacao tecnica

6010:	atividades apoio a area comercial atender central de telefones e celulares da empresa atualizacao e confeccao de planilhas de controle apoio ao faturamento quando necessario suporte area comercial como liberacao de pedidos pesquisa no cadastro de clientes liberacao de agenda para vendedores e todas as demais funcoes de suporte horario das as empresa empresa comercial e distribuidora localizada na zona norte de porto alegre requisitos conhecimento solido na funcao disponibilidade pa