In [1]:
import sys
sys.path.append("C:\\Users\\lucas\\AppData\\Local\\Programs\\Python\\Python36\\Lib\\site-packages")

import os
import gc
import re
import sys
import math
import nltk
import pickle
import unidecode
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

### Load Data

In [3]:
df_titles = pd.read_csv('items_titles.csv')
df_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ITE_ITEM_TITLE  30000 non-null  object
dtypes: object(1)
memory usage: 234.5+ KB


In [4]:
df_titles.head()

Unnamed: 0,ITE_ITEM_TITLE
0,Tênis Ascension Posh Masculino - Preto E Vermelho
1,Tenis Para Caminhada Super Levinho Spider Corrida
2,Tênis Feminino Le Parc Hocks Black/ice Original Envio Já
3,Tênis Olympikus Esportivo Academia Nova Tendência Triunfo
4,Inteligente Led Bicicleta Tauda Luz Usb Bicicleta Carregáve


In [5]:
#Original size of titles
df_titles['length'] = df_titles['ITE_ITEM_TITLE'].str.len()

### Numbers

#### Hypothesis -> numbers are not important for the comparison

In [6]:
def detect_numbers(text):
    if not re.search('\d+', text):
        return 0
    else:
        return 1
    
df_titles['numbers_'] = df_titles['ITE_ITEM_TITLE'].apply(lambda x: detect_numbers(x))

In [7]:
#Rate of titles containing numeric values
df_titles[df_titles['numbers_']==1].shape[0]/df_titles.shape[0]

0.48313333333333336

##### Due to almost half of data containing numbers, i've decided not to filter it. And check the results.

### Cleaning dataset

In [8]:
def regex_all(text):
    regex_chat1 = re.compile(r'([^\w|:|\|]+|[\ ]{2,})')
    new_text = regex_chat1.sub(' ',unidecode.unidecode(text).lower()).strip()
    return new_text

df_titles['ITE_ITEM_TITLE_'] = df_titles['ITE_ITEM_TITLE'].apply(lambda x: regex_all(x))

#Calculate new size of titles
df_titles['length_'] = df_titles['ITE_ITEM_TITLE_'].str.len()

#### Stop words  will be considered on the sklearn Tf-Idf implementation

In [10]:
# download portuguese stopwords from the NLTK library
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
#Remove Stopwords
df_titles['ITE_ITEM_TITLE__'] = df_titles['ITE_ITEM_TITLE_'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
#Calculate new size of titles
df_titles['length__'] = df_titles['ITE_ITEM_TITLE__'].str.len()

### Statistics about data cleaning

In [18]:
df_titles['char_diff_regex'] = df_titles['length'] - df_titles['length_']
print(f"Regex affected {df_titles[df_titles['char_diff_regex']>0].shape[0]} titles")
print(f'Mean of characters removed after regex: ',np.mean(df_titles['char_diff_regex']))
print(f'Max of characters removed after regex: ',np.max(df_titles['char_diff_regex']))

Regex affected 10295 titles
Mean of characters removed after regex:  0.6731333333333334
Max of characters removed after regex:  27


In [19]:
df_titles['char_diff_stopwords'] = df_titles['length_'] - df_titles['length__']
print(f"Removing stopwords affected {df_titles[df_titles['char_diff_stopwords']>0].shape[0]} titles")
print(f'Mean of characters removed after stopwords: ',np.mean(df_titles['char_diff_stopwords']))
print(f'Max of characters removed after stopwords: ',np.max(df_titles['char_diff_stopwords']))

Removing stopwords affected 6801 titles
Mean of characters removed after stopwords:  0.9077333333333333
Max of characters removed after stopwords:  23


#### Load Test Set

In [21]:
df_titles_test = pd.read_csv('arquivos_teste/items_titles_test.csv')

In [22]:
df_titles_test.shape

(10000, 1)

In [23]:
#Clean it
df_titles_test['ITE_ITEM_TITLE_'] = df_titles_test['ITE_ITEM_TITLE'].apply(lambda x: regex_all(x))

In [24]:
df_titles_test.head()

Unnamed: 0,ITE_ITEM_TITLE,ITE_ITEM_TITLE_
0,Tênis Olympikus Esporte Valente - Masculino Kids,tenis olympikus esporte valente masculino kids
1,Bicicleta Barra Forte Samy C/ 6 Marchas Cubo C/ Rolamento,bicicleta barra forte samy c 6 marchas cubo c rolamento
2,Tênis Usthemp Slip-on Temático - Labrador 2,tenis usthemp slip on tematico labrador 2
3,Tênis Casual Feminino Moleca Tecido Tie Dye,tenis casual feminino moleca tecido tie dye
4,Tênis Star Baby Sapatinho Conforto + Brinde,tenis star baby sapatinho conforto brinde


#### Implement the Tf-Idf approach, using unigrams 

In [27]:
tfidf = TfidfVectorizer(stop_words=stopwords,analyzer='word',ngram_range=(1, 1))                                                                                                                                                                                                   
tfidf = tfidf.fit(df_titles['ITE_ITEM_TITLE_']) #Fit the original dataset, just "regex cleaned"
tfidf_test = tfidf.transform(df_titles_test['ITE_ITEM_TITLE_']) #Transform the test set

In [28]:
#Checking most relevant terms
importance = np.argsort(np.asarray(tfidf_test.sum(axis=0)).ravel())[::-1]
tfidf_feature_names = np.array(tfidf.get_feature_names())
top_100 = tfidf_feature_names[importance[:100]]
top_100

array(['tenis', 'feminino', 'masculino', 'casual', 'infantil', 'preto',
       'sapatenis', 'original', 'branco', 'bicicleta', 'olympikus',
       'couro', 'slip', 'promocao', 'on', 'confortavel', 'caminhada',
       'academia', 'kit', 'aro', 'leve', 'azul', 'corrida', 'esportivo',
       'rosa', 'sapatilha', 'conforto', 'lancamento', 'plataforma',
       'usthemp', 'cano', 'marinho', 'pares', 'cadarco', '29', 'barato',
       'bike', 'menina', 'led', 'mizuno', 'tv', 'star', 'meia', 'menino',
       'alto', 'adidas', 'kolosh', 'nike', 'sapato', 'moleca', 'chunky',
       'fila', 'vermelho', 'tematico', 'mtb', 'flatform', 'sola', 'facil',
       'via', 'black', 'macio', 'gel', 'sneaker', 'asics', 'cinza',
       'unissex', 'envio', 'calce', 'oferta', 'brinde', 'marte', 'all',
       '20', 'elastico', 'super', 'vizzano', 'dia', 'new', 'wave', 'polo',
       'molekinha', 'botinha', 'feminina', 'verde', '26', 'top',
       'confort', 'skate', 'actvitta', 'frete', 'ref', 'legitimo', 'rio',


#### Implement the Tf-Idf approach, using ngrams range as (1,2) to consider some "context"

In [29]:
tfidf = TfidfVectorizer(stop_words=stopwords,analyzer='word',ngram_range=(1, 2))                                                                                                                                                                                                   
tfidf = tfidf.fit(df_titles['ITE_ITEM_TITLE_'])  #Fit the original dataset, just "regex cleaned"
tfidf_test = tfidf.transform(df_titles_test['ITE_ITEM_TITLE_']) #Transform the test set

In [30]:
#Checking most relevant ngrams
importance = np.argsort(np.asarray(tfidf_test.sum(axis=0)).ravel())[::-1]
tfidf_feature_names = np.array(tfidf.get_feature_names())
top_100 = tfidf_feature_names[importance[:100]]
top_100

array(['tenis', 'feminino', 'masculino', 'casual', 'tenis feminino',
       'infantil', 'preto', 'sapatenis', 'original', 'tenis infantil',
       'bicicleta', 'tenis masculino', 'branco', 'olympikus', 'couro',
       'slip', 'on', 'promocao', 'slip on', 'aro', 'confortavel', 'kit',
       'tenis casual', 'caminhada', 'academia', 'tenis olympikus', 'azul',
       'sapatenis masculino', 'leve', 'sapatilha', 'corrida', 'rosa',
       'esportivo', 'usthemp', 'conforto', 'plataforma', 'lancamento',
       'feminino casual', 'cano', 'tenis usthemp', 'tv', '29', 'bike',
       'marinho', 'pares', 'cadarco', 'led', 'menina', 'mizuno', 'adidas',
       'nike', 'star', 'barato', 'casual feminino', 'alto', 'meia',
       'kolosh', 'menino', 'moleca', 'sapato', 'tematico', 'vermelho',
       'fila', 'bicicleta aro', 'chunky', 'infantil feminino', 'mtb',
       'aro 29', 'tenis adidas', 'cano alto', 'black', 'gel', 'flatform',
       'unissex', 'tenis slip', 'asics', 'sola', 'via', 'cinza',
      

##### Many relevant bigrams are in the top 100, lets keep 'em

In [31]:
#Calculate similarity
cosine_sim_matrix = cosine_similarity(tfidf_test, tfidf_test)

In [32]:
#Compose results dataframe with original titles
df_similarity = []
tq_ = tqdm(len(cosine_sim_matrix))

for i in range(0,len(cosine_sim_matrix)):
    tq_.update(1)
    for j in range(i,len(cosine_sim_matrix)):
        row = [df_titles_test['ITE_ITEM_TITLE'][i],df_titles_test['ITE_ITEM_TITLE'][j],cosine_sim_matrix[i][j]]
        df_similarity.append(row)
        
df_results = pd.DataFrame(df_similarity)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [44]:
#Format 
df_results.rename(columns={0:'ITE_ITEM_TITLE',1:'ITE_ITEM_TITLE',2:'Score Similitud (0,1)'},inplace=True)
df_results = df_results.round({'Score Similitud (0,1)': 4})
df_results = df_results[df_results['Score Similitud (0,1)']<1.0]
df_results.sort_values('Score Similitud (0,1)',ascending=False,inplace=True)
df_results.reset_index(drop=True,inplace=True)

In [45]:
#Persist 
df_results.to_csv('titles_test_results_.csv',index=False)

In [46]:
#Check top 50 matches
df_results.head(50)

Unnamed: 0,ITE_ITEM_TITLE,ITE_ITEM_TITLE.1,"Score Similitud (0,1)"
0,Tênis adidas Terrex Ax3 - Feminino,Tênis adidas Terrex Ax3,0.9947
1,Tênis Skechers M Go Walk Evolution Ultra,Tenis Skechers Go Walk Evolution Ultra Impeccable Masculino,0.9937
2,Tenis Skechers Go Walk Evolution Ultra Impeccable Masculino,Tênis Skechers Go Walk Evolution Ultra,0.9937
3,Tênis adidas Response Super Boost M Fy8746,Tenis adidas Response Super Boost Masculino,0.9928
4,Tênis Skechers M Go Walk Evolution Ultra,Tênis Skechers Go Walk Evolution Ultra - Original,0.9903
5,Tênis Skechers Go Walk Evolution Ultra,Tênis Skechers Go Walk Evolution Ultra - Original,0.9903
6,Tenis Skechers Go Walk Evolution Ultra Impeccable Masculino,Tênis Skechers Go Walk Evolution Ultra - Original,0.9841
7,Hd322hj Usado,Tenis Aldo Usado,0.9822
8,Bicicleta Aro 26 Ultra Bikes Feminina Bicolor,Bicicleta Aro 26 Ultra Bikes Feminina Bicolor Moutain Bike,0.9815
9,Tênis Asics Gel Resolution 8 Clay Masculino - Saibro- New,Tênis Asics Gel Resolution 8 Clay Masculino Saibro,0.9789
