## Load Packages

In [9]:
import pandas as pd
import numpy as np
import string
import sys
import unidecode
from math import sqrt
import itertools
import sklearn
from nltk.corpus import stopwords as _stopwords
from nltk.stem import *

# Seed NumPy's global random generator with a fixed value.
np.random.seed(1234)



## Defining Utils - Pre Processing

In [10]:
def tokenizer(sentences, stopwords=None, stemmer=None):
    '''
    Normalize a text into one space-separated string of stemmed tokens.

    INPUT
        sentences: text to process; it is coerced with str()
        stopwords: iterable of tokens to discard
            default: get_stopwords()
        stemmer: object exposing .stem(token)
            default: get_stemmer()
    OUTPUT
        string with the surviving stems joined by a single space
    '''
    if stopwords is None:
        stopwords = get_stopwords()

    if stemmer is None:
        stemmer = get_stemmer()

    # Tokens are transliterated to ASCII before the stopword filter runs, so
    # a stopword that contains accents has to be matched in its
    # transliterated form as well, otherwise it is never removed.
    stopwords = set(stopwords) | {unidecode.unidecode(w) for w in stopwords}

    stems = []
    for raw in str(sentences).lstrip().split(' '):
        if raw == '':
            continue
        # closest ASCII form -> lowercase -> no punctuation / line breaks
        token = remove_puctuation(unidecode.unidecode(raw).lower())
        # A token made only of punctuation is empty at this point; keeping it
        # would inject empty entries (double spaces) into the output.
        if token == '' or token in stopwords:
            continue
        stem = stemmer.stem(token)
        if stem:
            stems.append(stem)

    return ' '.join(stems)


def bow2dist(bow, verbose=True):
    '''
    Euclidean distance between every pair of columns of a bag-of-words matrix.

    INPUT
        bow: bag-of-words VxD numpy matrix
        verbose: when True, each computed pair is written to stdout
    OUTPUT
        dist: DxD float32 matrix; only the entries below the diagonal are
              filled, everything else stays at zero
    '''
    n_docs = bow.shape[1]
    dist = np.zeros((n_docs, n_docs), dtype=np.float32)
    for row in range(n_docs):
        for col in range(row):
            delta = bow[:, row] - bow[:, col]
            dist[row, col] = np.sqrt(np.dot(delta, delta))
            if not verbose:
                continue
            # '\r' rewinds the cursor so the progress stays on one line
            sys.stdout.write('%05d,%05d:\t%0.2f\r' % (row, col, dist[row, col]))
            sys.stdout.flush()
    print('')
    return dist


def matrix2txt(mtrx, filename='mtrx.txt'):
    '''
    Write a matrix as space-separated text into the datasets folder.

    INPUT
        mtrx: a generic numpy matrix ex: bow or dist
              (values are cast to int32 before being written)
        filename: file created under
              ../../locality-sensitive-hashing/datasets/
    OUTPUT
        nothing is returned; the file contains
        header: nrows ncols, followed by empty column names so the header
                has one entry per matrix column
        body: matrix
    '''
    target = '../../locality-sensitive-hashing/datasets/' + filename
    shape_header = list(mtrx.shape)
    padding = [''] * (mtrx.shape[1] - 2)
    frame = pd.DataFrame(
        data=mtrx.astype(np.int32),
        columns=shape_header + padding,
        index=None,
    )
    frame.to_csv(target, sep=' ', index=False, index_label=False)


def word2idx2txt(word2idx, filename='word2idx.txt'):
    '''
    Write the inverted vocabulary (idx -> token) as space-separated text.

    INPUT
        word2idx: dict
            keys:token
            value:idx
        filename: file created under
            ../../locality-sensitive-hashing/datasets/
    OUTPUT
        -
    '''
    inverted = dict((idx, token) for token, idx in word2idx.items())
    frame = pd.DataFrame.from_dict(inverted, orient='index')
    target = '../../locality-sensitive-hashing/datasets/' + filename
    frame.to_csv(target, sep=' ', index=True, index_label=False, header=None)


def data2bow(data, word2idx):
    '''
        Build the bag-of-words count matrix from the indexed descriptions.

        INPUT
            data: a pandas.DataFrame
                            processes column idx_description, whose cells are
                            strings of whitespace-separated integer indexes
            word2idx: dict; only its size is used (number of vocabulary rows)
        OUTPUT
            bow: bag-of-words VxD numpy matrix (int32)
                    D: documents (idx_description)
                    V: Vocabulary
            example: if word w<=>idx appears 10 times on document d then
                    bow[idx,d]=10
    '''
    # Rows are walked by position, so the frame may carry any index
    # (e.g. after filtering or sampling), not only 0..n-1.
    descriptions = data["idx_description"].tolist()
    bow = np.zeros((len(descriptions), len(word2idx)), dtype=np.int32)
    for doc, idx_desc in enumerate(descriptions):
        # split() without argument ignores repeated blanks and yields nothing
        # for an empty description
        for token_idx in idx_desc.split():
            bow[doc, int(token_idx)] += 1

    return bow.T

def string2shingles(description, shingle_length):
    '''
    Split a text into overlapping shingles of consecutive tokens.

    INPUT
        description: string; tokens are separated by whitespace
        shingle_length: number of tokens per shingle
    OUTPUT
        list of strings, one per window of shingle_length tokens;
        a text shorter than shingle_length yields a single shingle with all
        of its tokens, and a text without tokens yields an empty list
    '''
    tokens = description.split()
    if not tokens:
        return []
    if len(tokens) < shingle_length:
        # Always hand back a list: a bare string would be iterated character
        # by character by callers that loop over the shingles.
        return [" ".join(tokens)]

    n_windows = len(tokens) - shingle_length + 1
    return [" ".join(tokens[i:i + shingle_length]) for i in range(n_windows)]

def data2idx(data, word2idx, bool_shingle=True, shingle_length=5):
    '''
        Replace every token description by the indexes of its tokens.

        INPUT
            data: pandas.DataFrame
                        column: token_description
                        (the column is overwritten in place)
            word2idx: dict
                            keys:tokens,
                            values:integer
                        (extended in place with every unseen token)
            bool_shingle: when True the description is indexed by shingles
                        of shingle_length tokens, otherwise by single tokens
            shingle_length: number of tokens per shingle
        OUTPUT
            data: pandas.DataFrame
                        column: token_description -> idx_description

    '''
    nrows = data.shape[0]
    token_count = 0
    idx_descriptions = []
    # Rows are walked by position, so the frame may carry any index
    # (e.g. after filtering or sampling), not only 0..n-1.
    for i, description in enumerate(data['token_description'].tolist()):
        if bool_shingle:
            tokens = string2shingles(description, shingle_length=shingle_length)
            # A bare string would be indexed character by character.
            if isinstance(tokens, str):
                tokens = [tokens] if tokens else []
        else:
            # split() without argument never produces empty tokens
            tokens = description.split()
        indexes = token2idx(tokens, word2idx)
        token_count += len(indexes)
        idx_descriptions.append(" ".join(str(idx) for idx in indexes))

        sys.stdout.write('document:%d of %d\tVOCAB:%d\tWORD COUNT:%d\t\r' % (i, nrows, len(word2idx), token_count))
        sys.stdout.flush()
    data['token_description'] = idx_descriptions
    data = data.rename(columns={'token_description': 'idx_description'})
    print('')
    return data


def token2idx(tokens, word2idx):
    '''
        Translate tokens into vocabulary indexes, growing the vocabulary.

        INPUT
            tokens: a list of strings

            word2idx: dict
                            keys:tokens,
                            values:integer
                      unseen tokens are added in place, numbered from
                      (largest stored value + 1), or from 0 when empty
        OUTPUT
            list of integers, one per input token, in the same order

    '''
    next_free = max(word2idx.values(), default=-1) + 1
    encoded = []
    for token in tokens:
        if token not in word2idx:
            word2idx[token] = next_free
            next_free += 1
        encoded.append(word2idx[token])
    return encoded


def get_stopwords(lang='portuguese'):
    '''Return the NLTK stopword list for `lang` as a set.'''
    words = _stopwords.words(lang)
    return set(words)


def get_stemmer(lang='portuguese'):
    '''Build a SnowballStemmer for `lang`.'''
    stemmer = SnowballStemmer(lang)
    return stemmer


def remove_puctuation(s):
    '''
    Delete ASCII punctuation plus newline, tab and carriage-return
    characters from the string s.

    Uses the 3-argument form of str.maketrans, whose third argument lists
    the characters that are mapped to None (i.e. removed).
    '''
    unwanted = string.punctuation + '\n' + '\t' + '\r'
    deletion_table = str.maketrans('', '', unwanted)
    return s.translate(deletion_table)

## Loading and Pre Processing

In [13]:
# NOTE(review): the recorded run of this cell ended with
# "ValueError: Expected object or value", presumably raised by read_json --
# confirm that the file exists at this relative path and holds valid JSON.
data_path = "../datasets/development.json"
df = pd.read_json(data_path, orient='records')

# Notebook preview of a random sample of rows.
df.sample(10).head(10)

# Kept as a set: tokenizer tests every single token for membership in it.
stopwords = get_stopwords()

stemmer = get_stemmer()


def tokenfy(x):
    '''Tokenize one text with the stopwords and stemmer loaded above.'''
    return tokenizer(x, stopwords=stopwords, stemmer=stemmer)


df['token_title'] = df['title'].transform(tokenfy)
df.sample(10).head(10)

df['token_description'] = df['description'].transform(tokenfy)

# Vocabulary; filled in place by data2idx.
word2idx = {}

df = data2idx(df, word2idx)

bow = data2bow(df, word2idx)


ValueError: Expected object or value

## Defining Utils - LSH

In [14]:
def isPrime(n):
    '''Return True when n is a prime number, using odd trial division.'''
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    # Only odd divisors up to the square root need to be tried.
    limit = int(sqrt(n))
    for divisor in range(3, limit + 1, 2):
        if n % divisor == 0:
            return False
    return True

def nextPrime(n):
    '''
    Return a prime close above n.

    A prime n is first bumped by one, so the result is strictly greater
    than n in that case; the search then only visits odd candidates.
    '''
    candidate = n + 1 if isPrime(n) else n
    if candidate % 2 == 0 and candidate != 2:
        candidate += 1
    while not isPrime(candidate):
        candidate += 2
    return candidate


def universalHashFunction(x, k, maxvalue, next_prime):
    '''
    Hash x with k freshly drawn linear functions (a*x + b) % next_prime.

    The coefficients a and b are sampled from [0, maxvalue) with NumPy's
    global random generator on every call; the result is an array of k values.
    '''
    multipliers = np.random.randint(0, maxvalue, k)
    offsets = np.random.randint(0, maxvalue, k)
    linear = multipliers * x + offsets
    return linear % next_prime

def getSignatureMatrix(input_bow, num_permutations=200):
    '''
    Compute a signature matrix holding, per document, the minimum hash value
    of the vocabulary rows present in it.

    INPUT
        input_bow: VxD numpy matrix (rows: vocabulary, columns: documents);
                   any entry greater than zero counts as "present"
        num_permutations: number of hash values drawn per vocabulary row,
                   i.e. number of rows of the signature matrix
    OUTPUT
        sigM: num_permutations x D float matrix with (minimum hash + 1);
              columns of documents without any entry stay at inf
    '''
    nrows, ncols = input_bow.shape
    sigM = np.full((num_permutations, ncols), np.inf)
    next_prime = nextPrime(nrows)
    for p in range(nrows):
        # Drawn for every row, including rows without entries, so the
        # sequence of random numbers does not depend on the data.
        hashes = universalHashFunction(p, num_permutations, nrows, next_prime)
        # "> 0" rather than "== 1": the bag of words stores counts, and an
        # entry that occurs more than once is still present in the document.
        present = np.asarray(input_bow[p]) > 0
        if present.any():
            sigM[:, present] = np.minimum(sigM[:, present], hashes[:, np.newaxis])
        if p % 10000 == 0:
            print(p, "de", nrows)

    return sigM + 1

def split(a, n):
    '''
    Lazily cut the sequence a into n consecutive slices whose lengths differ
    by at most one; the longer slices come first.
    '''
    base, extra = divmod(len(a), n)

    def start_of(part):
        # the first `extra` parts each carry one additional element
        return part * base + min(part, extra)

    return (a[start_of(part):start_of(part + 1)] for part in range(n))

def findCandidates(sigM, num_bands):
    '''
    Group documents whose signatures agree, using banding to pick candidates.

    INPUT
        sigM: signature matrix (rows: hash functions, columns: documents)
        num_bands: number of horizontal bands the rows are split into
    OUTPUT
        grupos_finais: dict column -> group id; columns that matched nothing
                       end up in a group of their own
        dist_matrix_candidates: square matrix with, for every compared pair
                       (a, b) with a < b, the fraction of signature rows on
                       which both columns agree; NaN for pairs never compared
        sigM: the signature matrix without the columns whose first row is
              inf; the column numbers used in the two outputs above refer to
              this filtered matrix
    '''
    num_bands = int(num_bands)
    sigM = np.asarray(sigM)
    # Columns still at inf in the first row are dropped.
    sigM = sigM[:, sigM[0] != np.inf]
    nrows, ncols = sigM.shape
    r = nrows / num_bands
    print(r, "linhas por banda, em media")

    grupos_finais = dict.fromkeys(range(ncols))
    dist_matrix_candidates = np.full((ncols, ncols), np.nan)
    counter_groups = 0

    for band in np.array_split(sigM, num_bands, axis=0):
        # Columns whose band rows add up to the same value share a bucket.
        buckets = {}
        for col, band_sum in enumerate(band.sum(axis=0)):
            buckets.setdefault(band_sum, []).append(col)

        for members in buckets.values():
            for pair in itertools.combinations(members, 2):
                if not np.isnan(dist_matrix_candidates[pair]):
                    continue  # already compared in a previous band
                # Fraction of equal signature rows; this is what the removed
                # sklearn jaccard_similarity_score returned for such inputs.
                similarity = np.mean(sigM[:, pair[0]] == sigM[:, pair[1]])
                dist_matrix_candidates[pair] = similarity
                if not similarity > 0.95:
                    continue
                first, second = pair
                # Compare with None explicitly: group id 0 is a valid group
                # and must not be mistaken for "not assigned yet".
                # NOTE(review): when both columns already belong to different
                # groups only `second` is moved; the groups are not merged.
                if grupos_finais[first] is not None:
                    grupos_finais[second] = grupos_finais[first]
                elif grupos_finais[second] is not None:
                    grupos_finais[first] = grupos_finais[second]
                else:
                    grupos_finais[first] = counter_groups
                    grupos_finais[second] = counter_groups
                    counter_groups += 1

    # Every column left without a group becomes a group of its own.
    for col in grupos_finais:
        if grupos_finais[col] is None:
            grupos_finais[col] = counter_groups
            counter_groups += 1

    return grupos_finais, dist_matrix_candidates, sigM

## Rodando o LSH

In [15]:
# Compute the signature matrix -- takes about 5 min
sigM = getSignatureMatrix(bow)

# Build the groups
# NOTE(review): num_bands is not assigned anywhere in the visible cells; it
# has to be defined before this call -- confirm where it is meant to come from.
grupos, dist_candidates, sigM = findCandidates(sigM, num_bands)

NameError: name 'bow' is not defined