In [14]:
file_with_candidate_predictions = './dafiti_catalog_nlpsolution.csv.gz'
file_with_test_set = './dafiti_catalog_challenge_test.csv.gz'

In [15]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from hashlib import sha1

In [16]:
CATEGORY_HASH_SALT = 'Dafiti_Group%'
CATEGORY_HASH_PEPPER = '@Data_Challenge'

categories = [
    'Camiseta',
    'Tênis',
    'Bermuda',
    'Boné',
    'Moletom',
    'Blusa',
    'Calça',
    'Polo',
    'Mochila',
    'Óculos',
    'Acessório de Vestuário',
    'Camisa',
    'Shorts',
    'Casacos e Jaquetas',
    'Meia',
    'Roupa de Banho',
    'Bota',
    'Bagagem Pequena',
    'Vestido',
    'Chinelo'
]

In [17]:
def find_category(predicted, product_id, category_hash, categories):
    for c in categories:
        s = CATEGORY_HASH_SALT + product_id + c + CATEGORY_HASH_PEPPER
        c_hash = sha1(s.encode()).hexdigest().upper()
        if c_hash==category_hash:
            return c

df_predictions = pd.read_csv(file_with_candidate_predictions, usecols=['product_id', 'category'])
df_test = pd.read_csv(file_with_test_set, usecols=['product_id','category_hash'])
df_join = df_predictions.set_index('product_id').join(df_test.set_index('product_id')).reset_index().rename({'category':'predicted'}, axis=1)

df_join['correct_category'] = df_join.apply(lambda x: find_category(x.predicted, x.product_id, x.category_hash, categories), axis=1)
df_join.head()

Unnamed: 0,product_id,predicted,category_hash,correct_category
0,631FB8F77A5A4F8ED6DE427768101834,Tênis,05BDF025F31085AAAF7CD5F4F05286719E19252F,Tênis
1,CFE6EFAED4FC64D02DC8A5B0117331D3,Bermuda,D81D40DD8DD1B6B4CB479C014555603C452B2A9A,Bermuda
2,4864C65B70BB0B874239A16E0F51F928,Calça,936A5C2FB335EDA008CF246CF9D0899AD8164774,Calça
3,FC116862B1FEB15E249BE2941AEB215D,Tênis,AA3F0705D66FFF1238409176A42C580E8B2F9F7C,Tênis
4,83D4B854506AACC841872A3FB309CAB6,Bagagem Pequena,234B38810B6AD54912C1F3D4DAFDD80D15EF583F,Bagagem Pequena


In [18]:
pd.DataFrame(confusion_matrix(df_join['correct_category'], df_join['predicted'], labels=categories), index=categories, columns=categories)

Unnamed: 0,Camiseta,Tênis,Bermuda,Boné,Moletom,Blusa,Calça,Polo,Mochila,Óculos,Acessório de Vestuário,Camisa,Shorts,Casacos e Jaquetas,Meia,Roupa de Banho,Bota,Bagagem Pequena,Vestido,Chinelo
Camiseta,9112,0,7,1,26,84,0,9,0,0,7,12,1,4,0,0,0,0,1,2
Tênis,6,3203,0,0,0,0,0,0,0,0,2,0,0,0,1,0,4,0,0,3
Bermuda,31,0,1577,0,0,0,171,0,1,0,1,2,9,0,0,1,0,1,1,0
Boné,20,2,0,1527,0,0,0,0,0,0,6,0,1,0,1,0,0,0,0,0
Moletom,56,0,2,1,1141,11,0,0,0,0,0,4,0,6,1,0,0,0,0,0
Blusa,383,0,2,1,35,440,0,5,0,0,8,7,0,4,0,2,0,0,14,0
Calça,14,1,186,1,4,0,586,0,2,0,4,1,4,0,0,2,1,3,0,0
Polo,145,0,1,0,1,7,0,469,0,0,0,16,0,0,0,3,0,0,0,0
Mochila,4,0,8,3,0,0,0,0,607,0,0,0,0,0,0,0,0,4,0,0
Óculos,7,4,0,2,0,0,0,0,3,555,0,0,0,0,0,0,0,0,0,0


In [19]:
pd.DataFrame(confusion_matrix(df_join['correct_category'], df_join['predicted'], labels=categories), index=categories, columns=categories).style.background_gradient(cmap = 'viridis')

Unnamed: 0,Camiseta,Tênis,Bermuda,Boné,Moletom,Blusa,Calça,Polo,Mochila,Óculos,Acessório de Vestuário,Camisa,Shorts,Casacos e Jaquetas,Meia,Roupa de Banho,Bota,Bagagem Pequena,Vestido,Chinelo
Camiseta,9112,0,7,1,26,84,0,9,0,0,7,12,1,4,0,0,0,0,1,2
Tênis,6,3203,0,0,0,0,0,0,0,0,2,0,0,0,1,0,4,0,0,3
Bermuda,31,0,1577,0,0,0,171,0,1,0,1,2,9,0,0,1,0,1,1,0
Boné,20,2,0,1527,0,0,0,0,0,0,6,0,1,0,1,0,0,0,0,0
Moletom,56,0,2,1,1141,11,0,0,0,0,0,4,0,6,1,0,0,0,0,0
Blusa,383,0,2,1,35,440,0,5,0,0,8,7,0,4,0,2,0,0,14,0
Calça,14,1,186,1,4,0,586,0,2,0,4,1,4,0,0,2,1,3,0,0
Polo,145,0,1,0,1,7,0,469,0,0,0,16,0,0,0,3,0,0,0,0
Mochila,4,0,8,3,0,0,0,0,607,0,0,0,0,0,0,0,0,4,0,0
Óculos,7,4,0,2,0,0,0,0,3,555,0,0,0,0,0,0,0,0,0,0


In [20]:
print(classification_report(df_join['correct_category'], df_join['predicted'], target_names=categories))

                        precision    recall  f1-score   support

              Camiseta       0.85      0.95      0.90       437
                 Tênis       0.89      0.85      0.87       247
               Bermuda       0.83      0.88      0.86      1795
                  Boné       0.71      0.49      0.58       901
               Moletom       0.99      0.98      0.98      1557
                 Blusa       0.94      0.30      0.45       243
                 Calça       0.74      0.72      0.73       809
                  Polo       0.86      0.67      0.75       418
               Mochila       0.91      0.98      0.95      9266
                Óculos       0.95      0.94      0.95       385
Acessório de Vestuário       0.96      0.83      0.89       223
                Camisa       0.98      0.93      0.95       336
                Shorts       0.96      0.97      0.97       626
    Casacos e Jaquetas       0.93      0.93      0.93      1222
                  Meia       0.94      