In [1]:
import numpy as np
import pandas as pd
import json
import unidecode
from pyjarowinkler import distance as jarowinkler
import distance
import re
import arff
import time

def norm(string):
    """Normalizes pt-br string

    Arguments:
        string {str} -- a portuguese type string (ex: não, é muito pouco...)

    Returns:
        str -- a normalized string (ex: nao, e muito pouco)
    """
    if(isinstance(string, list)):
        string = " ".join(string)

    s = unidecode.unidecode(string)
    s = s.lower().strip()
    return s



def compare_strings(A, B):
    a = norm(A)
    b = norm(B)
    if(a == b):
        return 1

    try:
        jaro_score = (jarowinkler.get_jaro_distance(a, b, winkler=True, scaling=0.1))
    except:
        jaro_score = 0
        
    return jaro_score

In [2]:
def extract_features(text, models, nfeatures=None):
    # Obtendo indices ordenados do vetor de modelos
    idx = np.argsort(list(map(lambda x: len(x), models)))[::-1]

    # Organizando features
    max_features = 15
    features = []

    for i in range(max_features):
        features.append([])
    if(nfeatures == None):
        nfeatures = max_features
    
    # Pré-processando dados
    ntext = norm(text)
    splited_text = [t.strip() for t in ntext.split(' ') if t.strip() != '']
    code = re.compile(r'([0-9-./|a-z-./]*(?:[a-z]-?[0-9]|[0-9]-?[a-z]|[a-z]-[a-z]|[0-9]-?[0-9])[0-9-./|a-z-./]*)')
    threshold = 0.8
    textgb = re.findall(r'[0-9]+gb', ntext)
    ntext_codes = code.findall(ntext)

    # Extraindo as features para cada modelo
    for model in models:
        nmodel= norm(model)
        # split do modelo em tokens
        splited_model = [m.strip() for m in nmodel.split(' ') if m.strip() != '']

        # Número absoluto de tokens em comum (int)
        count = 0
        for token in splited_text:
            for m in splited_model:
                if (token == m):
                    count += 1
        features[0].append(count)

        # Número relativo de tokens em comum (double) relativo ao modelo da referencia
        features[1].append(count/len(splited_model))

        # As palavras da referência ocorrem no título na mesma ordem (binário)
        ordered = False
        m = 0
        for token in splited_text:
            if m >= len(splited_model):
                ordered = True
                break
            if (token == splited_model[m]):
                m += 1
        features[2].append(ordered)

        # As palavras da referência ocorrem no título na mesma ordem com jarowinkler > 0.8 (binário)
        ordered = False
        m = 0
        for token in splited_text:
            if m >= len(splited_model):
                ordered = True
                break
            if (compare_strings(token, splited_model[m]) > threshold):
                m += 1
        features[3].append(ordered)

        # Palavras em código
        count = 0
        count_jaro = 0
        for token in ntext_codes:
            for m in splited_model:
                if(code.match(m)):
                    if (token == m):
                        count += 1
                    if (compare_strings(token,m) > threshold):
                        count_jaro += 1

        # Número de palavras de código em comum (int)                        
        features[4].append(count)
        if(count == 0):
            count_total = 1
        # Número de palavras de código em comum relativo
        features[5].append(count/len(splited_model))
        # Número de palavras de código em comum com jaro
        features[6].append(count_jaro)
        # Número de palavras de código em comum com jaro relativo
        features[7].append(count_jaro/len(splited_model))
        
        # Utilizar janela deslisante com jaro e edit distance
        window = len(splited_text) - len(splited_model) + 1
        best_jaro = compare_strings(ntext, nmodel)
        best_edit = distance.levenshtein(ntext, nmodel)
        for d in range(window):
            cmptext = ' '.join(splited_text[d:d+len(splited_model)])
            cmp_jaro = compare_strings(cmptext, nmodel)
            if(cmp_jaro > best_jaro):
                best_jaro = cmp_jaro
            
            cmp_edit = distance.levenshtein(cmptext, nmodel)
            if(cmp_edit < best_edit):
                best_edit = cmp_edit
        
        
        features[8].append(best_jaro)
        features[9].append(1 - (best_edit/ max(len(ntext), len(nmodel))) )
        
        # Ordered contains feature
        features[10].append(False)

        # gb feature
        count = 0
        if('gb' in ntext and 'gb' in nmodel):
            modelgb = re.findall(r'[0-9]+gb', nmodel)
            for mgb in modelgb:
                if (mgb in textgb):
                        count += 1
        features[11].append(count)

        # dual feature
        if('dual' in ntext and ('dual' in nmodel or 'ds' in nmodel)):
            features[12].append(True)
        else:
            features[12].append(False)

        # plus or max feature
        if(('plus' in ntext and 'plus' in nmodel) or ('max' in ntext and 'max' in nmodel)
          or ('maxx' in ntext and 'maxx' in nmodel)):
            features[13].append(True)
        else:
            features[13].append(False)

    # ordered contains feature        
    for i in range(len(models)):
        nmodel = norm(models[idx[i]])
        valid = nmodel in ntext
        if(valid):
            features[10][idx[i]] = valid
            break

    # voting feature
    max_values = [np.max(f) for f in features[0:max_features-2]]
    for i in range(len(models)):
        row = [f[i] for f in features[0:max_features-2]]
        features[14].append(np.sum([r == m for r, m in zip(row, max_values) if m]))
            
    return np.array(features)

In [4]:
df = pd.read_csv('rotuled/0.tsv', sep='\t', index_col=False)
for i in range(1, 5):
    d = pd.read_csv('rotuled/' + str(i) + '.tsv', sep='\t', index_col=False)
    df = df.append(d)
df.shape

(3237, 6)

(2521, 6)

In [7]:
# df = pd.read_csv('rotuled/0.tsv', sep='\t', index_col=False)
df = df.dropna()
df = df.drop(df.columns[0], axis=1)
df['correct'][df['correct'] == 'T'] = True
df['correct'][df['correct'] == 'F'] = False
df['correct'][df['correct'] == 't'] = True
df['correct'][df['correct'] == 'f'] = False
df['correct'][df['correct'] == 'FF'] = False
df.describe()

Unnamed: 0,retailer,sku,title,features_model,correct
count,2521,2521,2521,2521,2521
unique,19,510,371,729,2
top,mercadolivre,#1035033215,Celular Samsung Galaxy J7 Prime 2 32gb Preto ...,MS45 3G,False
freq,1339,5,20,27,1935


In [8]:
df.drop_duplicates(subset=['title', 'features_model'], keep='first').shape

(1760, 5)

In [9]:
df.drop_duplicates(subset=['title', 'features_model', 'correct'], keep='first').shape

(1766, 5)

In [10]:
filtered_df = df.drop_duplicates(subset=['title', 'features_model', 'correct'], keep='first')
filtered_df.head()


Unnamed: 0,retailer,sku,title,features_model,correct
0,zoom,0039a94af8a537fd3c7137dfd7c8cbc8cb419e7dd1f49b...,Smartphone Multilaser MS80 32GB 4G Android,MS80 32GB,True
1,zoom,0039a94af8a537fd3c7137dfd7c8cbc8cb419e7dd1f49b...,Smartphone Multilaser MS80 32GB 4G Android,MS80 64GB,False
2,zoom,0039a94af8a537fd3c7137dfd7c8cbc8cb419e7dd1f49b...,Smartphone Multilaser MS80 32GB 4G Android,MS50 4G,False
3,zoom,0039a94af8a537fd3c7137dfd7c8cbc8cb419e7dd1f49b...,Smartphone Multilaser MS80 32GB 4G Android,MS70 4G,False
4,zoom,0039a94af8a537fd3c7137dfd7c8cbc8cb419e7dd1f49b...,Smartphone Multilaser MS80 32GB 4G Android,MS50R,False


In [11]:
X = []
Y = []

for title in np.unique(filtered_df['title']):
    space = filtered_df[filtered_df['title'] == title]
    models = space['features_model'].values
    labels = space['correct'].values
    features = extract_features(title, models).T
    for x, y in zip(features, labels):
        X.append(x)
        Y.append(y)
    
    
X = np.array(X)
Y = np.array(Y).astype('int')

In [48]:
idx = int(0.8*len(Y))
idx


In [33]:
idx = int(0.8*len(Y))
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(20), max_iter=2000)
clf = clf.fit(X[0:idx,:], Y[0:idx])
from sklearn.metrics import classification_report
y_true = Y[idx+1:]
y_pred = clf.predict(X[idx+1:,:])
target_names = ['False', 'True']
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       False       0.88      0.92      0.90       265
        True       0.73      0.61      0.67        88

   micro avg       0.85      0.85      0.85       353
   macro avg       0.80      0.77      0.78       353
weighted avg       0.84      0.85      0.84       353



In [None]:
clfs = []

In [113]:
scoring = 'f1'
# scoring = 'accuracy'
# scoring = 'precision'

In [105]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(20), max_iter=2000)
clfs.append(('MLP', clf))
scores = cross_val_score(clf, X, Y, cv=10, scoring=scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.86 (+/- 0.09)


In [106]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clfs.append(('RF', clf))
scores = cross_val_score(clf, X, Y, cv=10, scoring=scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.86 (+/- 0.08)


In [107]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', max_iter=2000)
clfs.append(('LR', clf))
scores = cross_val_score(clf, X, Y, cv=10, scoring=scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.85 (+/- 0.10)


In [108]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clfs.append(('DT', clf))
scores = cross_val_score(clf, X, Y, cv=10, scoring=scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.10)


In [109]:
from sklearn.naive_bayes import GaussianNB 
clf = GaussianNB()
clfs.append(('NB', clf))
scores = cross_val_score(clf, X, Y, cv=10, scoring=scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.82 (+/- 0.07)


In [110]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, X, Y, cv=10, scoring=scoring)
clfs.append(('ABC', clf))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.85 (+/- 0.09)


In [111]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
scores = cross_val_score(clf, X, Y, cv=10, scoring=scoring)
clfs.append(('KNN', clf))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.84 (+/- 0.12)


In [114]:
from sklearn.ensemble import VotingClassifier
eclf = VotingClassifier(estimators=clfs, voting='soft', weights=np.ones(len(clfs)))
scores = cross_val_score(eclf, X, Y, cv=10, scoring=scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.67 (+/- 0.26)
