# Load Packages

In [2]:
import io
import json
import multiprocessing
import numpy as np
import pickle
import unidecode
import re
import sys

# Fix seed
# NOTE: compute_min_hashing() reseeds NumPy's global generator inside every
# call, so this seed alone does not make the min-hash signatures reproducible.
np.random.seed(100)

# Preprocessing

In [3]:
# Matches every character that is not an ASCII letter, a space or a colon
# (tabs, line breaks and the other whitespace characters included)
regex = re.compile(r'([^a-zA-Z :]|[\t\n\r\f\v])')

def preprocessing(documents):
    """Normalise every document in place and return the same container.

    Each entry goes through unidecode (accent removal), has every character
    other than ASCII letters, spaces and colons replaced by a space, gets its
    runs of spaces squeezed into one, and is finally lower-cased.
    """
    def _normalise(text):
        # Accents go first so accented letters are not wiped by the regex
        without_accents = unidecode.unidecode(text)
        # Everything except letters, spaces and colons turns into a space
        letters_only = regex.sub(' ', without_accents)
        # Plain spaces are the only whitespace left, so split() drops the
        # empty tokens produced by consecutive spaces
        return ' '.join(letters_only.split()).lower()

    for position, raw_text in enumerate(documents):
        documents[position] = _normalise(raw_text)
    return documents

# Finding Similar Items

## Utils

### Universal Hashing

In [4]:
def universal_hash(prime, size):
    """Draw a random hash function of the form ((a*x + b) mod prime) mod size.

    Parameters
    ----------
    prime : int
        Modulus applied before reducing to the bucket range; must not be
        smaller than `size`.
    size : int
        Number of buckets; every hashed value falls in ``[0, size)``.

    Returns
    -------
    callable
        Function taking an integer (Python int or NumPy integer scalar) and
        returning its bucket as a Python int.

    Raises
    ------
    ValueError
        If `prime` is smaller than `size`.
    """
    # Check values
    if prime < size:
        raise ValueError("Prime number should be greater than size")

    # Coefficients come from NumPy's global generator. The explicit 64-bit
    # dtype keeps large primes usable where the default integer is 32 bits,
    # and int() guarantees arbitrary-precision arithmetic in the closure.
    a = int(np.random.randint(1, prime, dtype=np.int64))
    b = int(np.random.randint(1, prime, dtype=np.int64))

    def hash_function(x):
        # NumPy scalars are fixed-width: multiplying one by a ~2**61
        # coefficient wraps around silently, so convert to a Python int first
        return ((a * int(x) + b) % prime) % size

    return hash_function

### Local Parser

In [5]:
# Regex fragments that introduce a location in a job description.
# Raw strings are required: in a normal string literal `\s` is an invalid
# escape sequence and triggers a warning on recent Python versions.
lookup_local = [r"local\s*:\s*", r"local\s*de\s*trabalho\s*:\s*",
                r"local\s*da\s*vaga\s*:\s*", r"cidade\s*:\s*",
                r"localizada\s*em\s*", r"local\s*do\s*trabalho\s*:\s*",
                r"localizacao\s*:\s*"]
local = re.compile("(" + "|".join(lookup_local) + ")", re.IGNORECASE)

# Length of the fixed-size chunk taken right after the matched pattern
fix_length = 10

def parse_local(document):
    """Return the text that follows the first location marker in `document`.

    The result is the `fix_length` characters (fewer at the end of the
    string) found right after the leftmost match of `local`. `fix_length`
    is read from the module namespace when the function is called.
    Returns None when no marker is present.
    """
    match = local.search(document)
    if match is None:
        return None
    start = match.end()
    return document[start:start + fix_length]

### Formation Parser

In [6]:
# Regex fragments that introduce the required education in a job description.
# Raw strings are required: in a normal string literal `\s` is an invalid
# escape sequence and triggers a warning on recent Python versions.
lookup_formation = [r"formacao\s*requerida\s*:\s*", r"formacao\s*desejada\s*:\s*",
                    r"escolaridade\s*requerida\s*:\s*", r"escolaridade\s*:\s*",
                    r"escolaridade\s*desejada\s*:\s*", r"formacao\s*:\s*"]
formation = re.compile("(" + "|".join(lookup_formation) + ")", re.IGNORECASE)

# Length of the fixed-size chunk taken right after the matched pattern
fix_length = 10

def parse_formation(document):
    """Return the text that follows the first formation marker in `document`.

    The result is the `fix_length` characters (fewer at the end of the
    string) found right after the leftmost match of `formation`.
    `fix_length` is read from the module namespace when the function is
    called. Returns None when no marker is present.
    """
    match = formation.search(document)
    if match is None:
        return None
    start = match.end()
    return document[start:start + fix_length]

## Shingling

In [7]:
def to_shingles(documents, k):
    """Represent every document as the set of ids of its k-character shingles.

    Ids are handed out in order of first appearance across all documents,
    starting at 0. Returns the list of per-document id sets together with the
    shingle -> id dictionary. A document shorter than `k` gets an empty set.
    """
    shingle_map = {}
    doc_shingles = []

    for document in documents:
        last_start = len(document) - k
        # setdefault returns the existing id or registers the next free one;
        # len(shingle_map) is evaluated before the insertion takes place
        doc_shingles.append({
            shingle_map.setdefault(document[start:start + k], len(shingle_map))
            for start in range(last_start + 1)
        })

    return doc_shingles, shingle_map

## Min-Hashing

In [8]:
def compute_min_hashing(documents, shingles_size, k, prime=2**61-1, seed=None):
    """Compute a k-value min-hash signature for every document.

    Parameters
    ----------
    documents : iterable of iterables of int
        Shingle ids of each document.
    shingles_size : int
        Size of the hash range handed to universal_hash.
    k : int
        Number of hash functions, i.e. signature length.
    prime : int
        Prime handed to universal_hash.
    seed : int or None
        Seed for NumPy's global generator. None (the default) reseeds from
        fresh entropy, which is the historical behaviour; pass an integer to
        get the same hash functions, and so the same signatures, every time.

    Returns
    -------
    numpy.ndarray of uint64
        One row per document and one column per hash function. Positions of
        a document without shingles keep the initial value sys.maxsize.
    """
    # Due to multiprocessing: the generator is reseeded on every call so that
    # calls running in different worker processes do not share one state
    np.random.seed(seed)

    # Instantiate hash methods to be used as permutations
    hash_methods = [universal_hash(prime, shingles_size) for _ in range(k)]

    signatures = []
    for document in documents:
        # Each distinct shingle only needs to be hashed once per function
        shingles = set(document)
        # sys.maxsize stays in the candidates so an empty document keeps it
        signatures.append([min([sys.maxsize, *map(hash_method, shingles)])
                           for hash_method in hash_methods])

    # Return signature of all documents
    return np.array(signatures, dtype=np.uint64)

In [9]:
def compute_min_hashing_multiprocessing(documents, shingles_size, k, prime=2**61-1):
    """Compute min-hash signatures with the k hash functions split over processes.

    Every worker runs compute_min_hashing on all documents for its share of
    the k hash functions; the partial signatures are then stacked side by
    side, giving one row per document and k columns in total.
    """
    # Leave one core free, but never go below a single worker: on a
    # single-core machine cpu_count() - 1 is 0, which Pool and divmod reject
    workers = max(1, multiprocessing.cpu_count() - 1)

    # Spread the k hash functions as evenly as possible over the workers
    base, extra = divmod(k, workers)
    batches = [base + (1 if i < extra else 0) for i in range(workers)]
    # Skip workers with nothing to do; keep one job so that k == 0 still
    # yields an (empty) result of the usual type
    batches = [batch for batch in batches if batch > 0] or [0]

    # The context managers shut the manager and the pool down even when a
    # worker raises, so no helper process is left behind
    with multiprocessing.Manager() as manager:
        # Shared variable
        shared_documents = manager.list(documents)

        with multiprocessing.Pool(processes=len(batches)) as pool:
            jobs = [pool.apply_async(compute_min_hashing,
                                     args=(shared_documents, shingles_size, batch, prime))
                    for batch in batches]

            pool.close()
            pool.join()

            # Join all results; get() re-raises any exception from a worker
            results = [job.get() for job in jobs]

    return np.hstack(results)

## Locality-Sensitive Hashing

In [10]:
def compute_lsh(signatures, rows, bands, prime=2**61-1, n_buckets=10**9):
    """Bucket documents band by band (locality-sensitive hashing).

    Parameters
    ----------
    signatures : sequence of integer sequences
        One min-hash signature per document, each holding at least
        ``rows * bands`` values.
    rows : int
        Number of signature values per band.
    bands : int
        Number of bands.
    prime : int
        Prime handed to universal_hash.
    n_buckets : int
        Size of the bucket range of every band.

    Returns
    -------
    list of dict
        One dictionary per band mapping a bucket id to the list of document
        indices (in increasing order) whose band landed in that bucket.

    Raises
    ------
    ValueError
        If a signature is shorter than ``rows * bands``: the missing bands
        would be empty and would put every document in the same bucket.
    """
    # Instantiate hash methods, one per band
    hash_methods = [universal_hash(prime, n_buckets) for _ in range(bands)]

    # Buckets for all hashes
    hash_buckets = [{} for _ in range(bands)]

    needed = rows * bands
    for i, signature in enumerate(signatures):
        if len(signature) < needed:
            raise ValueError("Signature %d has %d values, but rows * bands = %d"
                             % (i, len(signature), needed))

        for j in range(bands):
            # The band is hashed as an ordered tuple of Python ints: the key
            # depends on every value and its position, and the arithmetic in
            # the hash function is not subject to fixed-width overflow
            band = tuple(int(value) for value in signature[j * rows:(j + 1) * rows])

            # Compute hash/bucket for the band
            hash_value = hash_methods[j](hash(band))
            hash_buckets[j].setdefault(hash_value, []).append(i)

    return hash_buckets

## Find Candidates

In [11]:
def find_candidates(hash_buckets, signatures, threshold):
    """Return the bucket-sharing pairs whose signatures agree often enough.

    Every two documents found together in a bucket of any band form a pair,
    stored once as (smaller index, larger index). A pair is kept when the
    number of equal signature positions reaches `threshold` times the
    signature length.
    """
    # Signature length - second axis of the signatures array
    signature_size = signatures.shape[1]

    # Every unordered pair of bucket mates; the set removes pairs that show
    # up in more than one band
    pairs = {
        (first, second) if first <= second else (second, first)
        for hash_bucket in hash_buckets
        for members in hash_bucket.values()
        for position, first in enumerate(members)
        for second in members[position + 1:]
    }

    required = threshold * signature_size

    def _agreement(pair):
        idx, idy = pair
        return np.sum(signatures[idx] == signatures[idy])

    return [pair for pair in pairs if _agreement(pair) >= required]

In [25]:
def check_false_positives(documents, pairs):
    """Keep only the pairs whose documents share location and formation.

    Two documents agree when parse_local returns equal values for both and
    parse_formation does too. The formation is only parsed once the
    locations have been found equal.
    """
    def _same_posting(doc_x, doc_y):
        return (parse_local(doc_x) == parse_local(doc_y)
                and parse_formation(doc_x) == parse_formation(doc_y))

    confirmed = []
    for pair in pairs:
        idx, idy = pair
        if _same_posting(documents[idx], documents[idy]):
            confirmed.append(pair)
    return confirmed

## Convert documents to clusters of duplicates

In [13]:
def set_clusters(candidates, n_items):
    """Group items into clusters by merging along every candidate pair.

    Returns `clusters`, mapping each surviving cluster id to its set of
    items, and `cids`, mapping every item in range(n_items) to the id of the
    cluster it belongs to.
    """
    # Every item starts alone in a cluster named after itself
    cids = {item: item for item in range(n_items)}
    clusters = {item: {item} for item in range(n_items)}

    for item_a, item_b in candidates:
        cid_a, cid_b = cids[item_a], cids[item_b]

        # Earlier merges already put both items together
        if cid_a == cid_b:
            continue

        # The larger cluster survives; on a tie item_a's cluster wins
        if len(clusters[cid_a]) >= len(clusters[cid_b]):
            kept_cid, dropped_cid = cid_a, cid_b
        else:
            kept_cid, dropped_cid = cid_b, cid_a

        # Move the members of the dropped cluster over
        absorbed = clusters.pop(dropped_cid)
        for member in absorbed:
            cids[member] = kept_cid
        clusters[kept_cid] |= absorbed

    return clusters, cids

# Experiments

## Parameters

In [14]:
# Input: JSON file the job descriptions are loaded from
data_path = "../datasets/development.json"
# Output: TSV file the document -> cluster id mapping is written to
dump_path = "../datasets/cids.tsv"

## Hyperparameters

In [18]:
# Shingle length in characters (passed to to_shingles)
k = 5
# Signature values per LSH band
n_rows = 20
# Number of LSH bands
n_bands = 5
# Min-hash signature length: one value for every row of every band
n_hashes = n_rows * n_bands
# Minimum fraction of equal signature positions for find_candidates to keep a pair
threshold = 0.999

## Load dataset

In [19]:
# Read the dataset; the context manager closes the file even when the JSON
# cannot be parsed or an entry has no 'description' key
with io.open(data_path, mode="r", encoding="utf-8") as data_reader:
    # The whole JSON array is expected on the first line of the file.
    # Entries with an empty description are skipped.
    documents = [document['description']
                 for document in json.loads(data_reader.readline())
                 if document['description']]

## Run

In [20]:
# Normalise the descriptions. preprocessing() rewrites `documents` in place
# and returns that same list, so `parsed_documents` and `documents` are one
# object from here on.
%time parsed_documents = preprocessing(documents)

CPU times: user 2.26 s, sys: 16.8 ms, total: 2.27 s
Wall time: 2.28 s


In [21]:
# Turn every document into the set of ids of its k-character shingles
%time documents_shingles, map_shingles = to_shingles(parsed_documents, k)

CPU times: user 3.07 s, sys: 110 ms, total: 3.18 s
Wall time: 3.2 s


In [22]:
# Build n_hashes min-hash values per document across worker processes;
# len(map_shingles) is the number of distinct shingles
%time signatures = compute_min_hashing_multiprocessing(documents_shingles, len(map_shingles), n_hashes)

CPU times: user 769 ms, sys: 128 ms, total: 896 ms
Wall time: 1min 45s


In [23]:
# Bucket the documents band by band: n_bands bands of n_rows signature values
%time hash_buckets = compute_lsh(signatures, n_rows, n_bands)

CPU times: user 642 ms, sys: 13 ms, total: 655 ms
Wall time: 654 ms


In [24]:
# Keep the bucket-sharing pairs whose signatures agree on at least
# `threshold` of their positions
%time candidates = find_candidates(hash_buckets, signatures, threshold)

CPU times: user 99.3 ms, sys: 2.96 ms, total: 102 ms
Wall time: 101 ms


In [26]:
# Drop the pairs whose parsed location or formation snippets differ.
# `documents` already holds the normalised text at this point, because
# preprocessing() modified the list in place.
%time candidates = check_false_positives(documents, candidates)

CPU times: user 1.01 s, sys: 19.5 ms, total: 1.03 s
Wall time: 1.04 s


In [27]:
# Merge the remaining pairs into clusters; `cids` maps every document index
# to the id of its cluster
%time clusters, cids = set_clusters(candidates, len(documents))

CPU times: user 11 ms, sys: 2.04 ms, total: 13 ms
Wall time: 12.6 ms


## Dump cluster ids

In [28]:
# One "<document index>\t<cluster id>" row per document
with io.open(dump_path, 'w') as fp:
    for item, cluster_id in cids.items():
        fp.write('%s\t%s\n' % (item, cluster_id))

## Analysis

In [29]:
len(map_shingles)

81668

In [30]:
len(candidates)

3269

In [49]:
# Eyeball two randomly drawn candidate pairs: print both documents of each
# pair, prefixed by their index, with a blank line between pairs.
# The draw uses NumPy's global generator, so the pairs shown change from run
# to run unless the generator state is the same.
for i in range(2):
    idx = np.random.randint(0, len(candidates))
    item_a = candidates[idx][0]
    item_b = candidates[idx][1]
    print('%s:\t%s' % (item_a, documents[item_a]))
    print('%s:\t%s' % (item_b, documents[item_b]))
    print()

6119:	operador de estacionamentos simopark estacionamentos contrata: operador de estacionamento manobrista caixa requisitos: nao precisa ter experiencia deve ter cnh valida salario: r beneficios: cesta basica vale transporte uniforme e entre outros os curriculos podem ser entregues diretamente na rua barao de jaguara no informando que visualizou a vaga no vagas urgentes centro campinas sp ou
7515:	operador de estacionamentos simopark estacionamentos contrata: operador de estacionamento manobrista caixa requisitos: nao precisa ter experiencia deve ter cnh valida salario: r beneficios: cesta basica vale transporte uniforme e entre outros os curriculos podem ser entregues diretamente na rua barao de jaguara no informando que visualizou a vaga no vagas urgentes centro campinas sp ou

911:	sobre a vaga descricao area e especializacao profissional: informatica ti telecomunicacoes montagem e manutencao de micros nivel hierarquico: encarregado local de trabalho: itauna mg trabalhar na manutenc