# DATA SCIENCE CHALLENGE #3 - SIMILITUD ENTRE PRODUCTOS

+ Un desafío constante en MELI es el de poder agrupar productos similares utilizando algunos atributos de estos como pueden ser el título, la descripción o su imagen;
+ Para este desafío tenemos un dataset `items_titles.csv` que tiene títulos de 30 mil productos de 3 categorías diferentes de Mercado Libre Brasil;
+ El objetivo del desafío es poder generar una Jupyter notebook que determine cuán similares son dos títulos del dataset `items_titles_test.csv`, generando como output un listado de la forma:

|    ITE_ITEM_TITLE   |   ITE_ITEM_TITLE    |  Score Similitud (0,1)  |
|:-------------------:|:-------------------:|:-----------------------:|
|   Zapatillas Nike   |  Zapatillas Adidas  |           0.5           |
|   Zapatillas Nike   |  Zapatillas Nike    |            1            |

donde ordenando por score de similitud podamos encontrar los pares de productos más similares en nuestro dataset de test.

# Loading modules and functions

In [1]:
import pandas as pd # dataframes
import numpy as np
from unidecode import unidecode # special characters
from collections import defaultdict # dictionary
from string import punctuation
import re
import nltk
from nltk.tokenize import word_tokenize 

## Loading the data

In [2]:
# development set: the item titles that the algorithm is developed on
df_train = pd.read_csv('../data/items_titles.csv')
# test set: the item titles that the algorithm is tested on
df_test = pd.read_csv('../data/items_titles_test.csv')

## Pre-processing the text

Creating a function to pre-process each sentence into a list of words.

In [3]:
def pre_process_text(sentence):
    """
    Normalizes a sentence and splits it into tokens.

    The sentence is passed through `unidecode` to replace special characters by their standard counterparts
    (e.g., 'ê' -> 'e'), lower cased, has every character that is neither a word character nor whitespace replaced
    by a blank, and is finally tokenized with NLTK's `word_tokenize` using the Portuguese language setting.

    Arguments
    ---------
    sentence: string
        A string containing the sentence that will be pre-processed.

    Returns
    -------
        A list of strings where each element is a token found in the normalized sentence
    """
    # stripping special characters and lower casing the entire sentence
    decoded_sentence = unidecode(sentence).lower()
    # replacing punctuation by blanks; the pattern is a raw string so that the backslashes reach the regex engine
    # instead of being read as (invalid) string escapes. `\w` already matches digits, so no separate `\d` is
    # needed, and it also matches '_', which means underscores are kept
    decoded_sentence = re.sub(r'[^\w\s]', ' ', decoded_sentence)
    # returning the tokenized sentence
    return word_tokenize(text = decoded_sentence, language = "portuguese")

In [4]:
## tokenizing every title in the training data - one list of tokens per title
train_sentences = [pre_process_text(title) for title in df_train.ITE_ITEM_TITLE]
## collecting the distinct tokens seen across all of the tokenized titles
vocabulary = {word for sentence in train_sentences for word in sentence}

In [5]:
def encode_vocabulary(vocabulary, unknown_idx = 0, digit_idx = 1):
    """
    Integer-encode a vocabulary.

    Every token that does not contain a digit receives its own integer, starting right after the largest of the
    two reserved indexes. Tokens are visited in sorted order so that the same vocabulary always produces the same
    encoding, regardless of the iteration order of the input container.

    Arguments
    ---------
    vocabulary: iterable of strings
        Tokens found in a corpus (e.g., a set of words). Repeated tokens are encoded only once.
    unknown_idx: int, defaults to 0
        Integer reserved for the '__UNK__' placeholder.
    digit_idx: int, defaults to 1
        Integer used to encode any token that contains a digit (the '__DGT__' placeholder).

    Returns
    -------
    word_to_idx: dict
        Dictionary used to encode each token to a unique integer. Tokens containing a digit are not stored here.
    idx_to_word: dict
        Dictionary used to decode each integer back to a token, including the two reserved placeholders.

    Notes
    -----
    Both dictionaries are `defaultdict(int)`: indexing them with a missing key returns 0 and stores that key.
    Use `.get` for look-ups that must not modify the dictionaries.
    """
    # creating default dictionaries to store the encoded values
    word_to_idx = defaultdict(int)
    idx_to_word = defaultdict(int)

    # initializing the idx_to_word dictionary with the values for the unknown and digit ids
    idx_to_word[unknown_idx] = '__UNK__'
    idx_to_word[digit_idx] = '__DGT__'

    # first index that cannot collide with either of the reserved indexes
    counter = max(unknown_idx, digit_idx) + 1

    # iterating across the unique tokens, in sorted order, to assign an index to the ones without any digit
    for token in sorted(set(vocabulary)):
        # tokens with at least one digit are all represented by digit_idx, so they get no index of their own
        if any(character.isdigit() for character in token):
            continue
        word_to_idx[token] = counter
        idx_to_word[counter] = token
        counter += 1
    return word_to_idx, idx_to_word

In [6]:
# reserved indexes: 0 stands for unknown tokens and 1 for tokens that contain a digit
idx_for_unknown, idx_for_digits = 0, 1
# building the token -> index and index -> token dictionaries from the vocabulary
word2idx, idx2word = encode_vocabulary(vocabulary, idx_for_unknown, idx_for_digits)

In [7]:
def encode_sentence(sentence, word_to_idx_dict, unknown_idx = 0, digit_idx = 1):
    """
    Integer-encode an input sentence based on the indexes given to each token in a vocabulary, while also taking
    care to give specific encoding values to unknown words and to any token where a digit is found.

    Arguments
    ---------
    sentence: string
        Raw sentence to encode; it is pre-processed and tokenized with `pre_process_text`.
    word_to_idx_dict: dict
        Dictionary mapping each known token to its integer index.
    unknown_idx: int, defaults to 0
        Index assigned to any token that is not a key of word_to_idx_dict.
    digit_idx: int, defaults to 1
        Index assigned to any token that contains at least one digit.

    Returns
    -------
        A list of integers with one index per token, in the order the tokens appear in the sentence
    """
    # pre-processing and tokenizing the sentence
    tokenized_sentence = pre_process_text(sentence)
    # tokens containing a digit share a single index; every other token is looked up with the get method so that
    # a missing token falls back to the unknown index without adding a new key to the dictionary
    return [
        digit_idx if any(character.isdigit() for character in token)
        else word_to_idx_dict.get(token, unknown_idx)
        for token in tokenized_sentence
    ]

## Defining a baseline for the test data 

In [201]:
def one_hot_array(indexes, vocab_size):
    """
    Creates a vector representation of an input sentence that has already been integer-encoded.

    The result is the average of the one-hot vectors of the tokens in the sentence, i.e. position `i` holds the
    fraction of the tokens whose index is `i`.

    Arguments
    ---------
    indexes: list of integers
        List of integers representing the indexes from each of the tokens found in a given sentence
    vocab_size: int
        Length of the output vector; every index must be smaller than this value

    Returns
    -------
        A numpy array of shape (vocab_size,) containing the vector representation for a sentence represented by
        a sequence of tokens. A sentence with no tokens is represented by a vector of zeroes.

    Raises
    ------
    IndexError
        If any of the indexes does not fit in a vector of length vocab_size.
    """
    token_indexes = np.asarray(indexes, dtype = np.intp)
    sentence_vector = np.zeros(vocab_size)
    # a sentence without tokens has nothing to average - returning zeroes rather than dividing by zero
    if token_indexes.size == 0:
        return sentence_vector
    # counting how many times each index appears; np.add.at accumulates repeated indexes, which a plain fancy
    # indexed `+=` would not do
    np.add.at(sentence_vector, token_indexes, 1.0)
    # dividing the counts by the number of tokens gives the average of the per-token one-hot vectors without
    # having to build the full (number of tokens, vocab_size) matrix
    return sentence_vector / token_indexes.size

In [196]:
def score_similitud(target_sentence, possible_sentences, sentence_encoder, word_to_idx_dict, unknown_idx, digit_idx, ohe_function, n_closest = 5):
    """
    Ranks a collection of sentences by their cosine similarity to a target sentence.

    Each sentence is integer-encoded with `sentence_encoder`, turned into a vector with `ohe_function`, and compared
    to the vector of the target sentence. The first component (index 0) of every vector is left out of the
    comparison - presumably to ignore unknown tokens, which use index 0 in the default setup (to be confirmed).

    Arguments
    ---------
    target_sentence: string
        Sentence that every other sentence is compared to.
    possible_sentences: iterable of strings
        Sentences to be scored against the target sentence.
    sentence_encoder: callable
        Called as sentence_encoder(sentence, word_to_idx_dict, unknown_idx, digit_idx); must return the list of
        integer indexes of the tokens in the sentence.
    word_to_idx_dict: dict
        Dictionary mapping each known token to its integer index.
    unknown_idx: int
        Index used for tokens that are not in word_to_idx_dict.
    digit_idx: int
        Index used for tokens that contain a digit.
    ohe_function: callable
        Called as ohe_function(indexes = ..., vocab_size = ...); must return a numpy array of shape (vocab_size,).
    n_closest: int, defaults to 5
        Number of best matches to return.

    Returns
    -------
        A list with at most n_closest tuples (sentence, cosine similarity), sorted from the most to the least
        similar sentence. The similarity is 0.0 whenever one of the two vectors has (close to) zero norm.
    """
    # the vectors must have one position per index, so their length is the largest index in use plus one; the
    # reserved indexes are taken into account so that an empty dictionary still yields a valid size
    vocab_size = max(max(word_to_idx_dict.values(), default = -1), unknown_idx, digit_idx) + 1

    # encoding the target sentence and building its vector representation
    target_tokens = sentence_encoder(target_sentence, word_to_idx_dict, unknown_idx, digit_idx)
    ohe_target = ohe_function(indexes = target_tokens, vocab_size = vocab_size)
    # the target does not change between comparisons, so its slice and norm are computed only once
    target_vector = ohe_target[1:]
    target_norm = np.linalg.norm(target_vector)

    # list of tuples containing each proposed sentence and its cosine similarity to the target sentence
    sentence_similarity = []

    # iterating across each of the other possible sentences
    for proposal in possible_sentences:
        # encoding the proposed sentence and building its vector representation
        proposal_tokens = sentence_encoder(proposal, word_to_idx_dict, unknown_idx, digit_idx)
        proposal_vector = ohe_function(indexes = proposal_tokens, vocab_size = vocab_size)[1:]

        # calculating the cosine similarity between the two vectors, guarding against a division by zero
        numerator = np.dot(target_vector, proposal_vector)
        denominator = target_norm * np.linalg.norm(proposal_vector)
        cosine = numerator / denominator if not np.isclose(0.0, denominator) else 0.0

        sentence_similarity.append((proposal, cosine))

    # reordering the list so that tuples with higher cosine similarity come first, and keeping only the
    # n_closest matches to the target sentence
    return sorted(sentence_similarity, key = lambda pair: pair[-1], reverse = True)[:n_closest]

In [204]:
# scoring every other test title against the first one and keeping the five closest matches
sample_scoring = score_similitud(
    target_sentence = df_test.iloc[0, 0],
    possible_sentences = df_test.iloc[1:, 0],
    sentence_encoder = encode_sentence,
    word_to_idx_dict = word2idx,
    unknown_idx = idx_for_unknown,
    digit_idx = idx_for_digits,
    ohe_function = one_hot_array,
    n_closest = 5,
)

In [211]:
# laying the scores out as one row per (target item, similar item, score) triple
sample_output = (
    pd.DataFrame(sample_scoring, columns = ['Similar Items', 'Score Similitud'])
    .assign(**{'Target Item': df_test.iloc[0, 0]})
    [['Target Item', 'Similar Items', 'Score Similitud']]
)