In [3]:
import numpy as np
import pandas as pd
import gc
import os
pd.options.display.max_columns = 100
pd.options.display.max_rows = 5000
pd.options.display.max_colwidth = 1000


In [6]:
def performaance_dashboard(df):
    for eval_set in ['train', 'val', 'test']:
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for i_id, groupby_df in df[df.eval_set == eval_set].groupby('item_id'):
            y_true = list(groupby_df.label)
            y_pred = list(groupby_df.predictions)
        #     print ('item_id', i_id)
        #     print ('y_true', y_true)
        #     print ('y_pred', y_pred)
            if all(v == 0 for v in y_true):
                pass
            else:
                total_correct += 1.0
                if y_true == y_pred:
                    correct_preds += 1.0
            if all(v == 0 for v in y_pred):
                pass
            else:
                total_preds += 1.0
        #----------
        # output
        #----------
        p   = correct_preds / total_preds if correct_preds > 0 else 0
        r   = correct_preds / total_correct if correct_preds > 0 else 0
        f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
        print('{}-f1: {}'.format(eval_set,f1))
        print('{}-precision: {}'.format(eval_set,p))
        print('{}-recall: {}'.format(eval_set,r))


In [7]:
os.listdir('/data/ner_task/result_for_brand_detection_model/')

['lips', 'dress', 'face', 'women_top', 'mobile']

In [14]:
for category in os.listdir('/data/ner_task/result_for_brand_detection_model/'):
    print ('=================category=================== : {}'.format(category))
    bath_path = "/data/ner_task/result_for_brand_detection_model/{}/predictions_wo_causal_cnn_char/".format(category)
    #-----------------------
    # lodaing output of Model
    #-----------------------
    item_id = np.load(os.path.join(bath_path,'item_id.npy'))
    word_id = np.load(os.path.join(bath_path,'word_id.npy'))
    final_predictions = np.load(os.path.join(bath_path,'final_predictions.npy'))
    final_states = np.load(os.path.join(bath_path,'final_states.npy'))
    history_length = np.load(os.path.join(bath_path,'history_length.npy'))
    #----------------
    # merge testing result from Neural Network
    #----------------
    i_ids = []
    w_ids = []
    predictions = []
    hidden_features = []
    for sent, hidden_feature, sent_length, i_id, w_id in zip(final_predictions, final_states, history_length, item_id, word_id):
        for w, y_pred, h_f in zip(w_id[:sent_length], sent[:sent_length], hidden_feature[:sent_length]):
            i_ids.append(i_id)
            w_ids.append(w)
            predictions.append(np.argmax(y_pred))
            hidden_features.append(h_f)
    hidden_features = np.array(hidden_features) # (history_lengh * num_samples, num_hidden_units)
    #----------------
    # Convert into DataFrame
    #----------------
    df1 = pd.DataFrame({
        'item_id':i_ids,
        'word_id':w_ids,
        'predictions':predictions,
    })
    df2 = pd.DataFrame(hidden_features, columns = ['simple_cnn_{}'.format(i+1) for i in range(hidden_features.shape[1])])
    test = pd.concat([df1,df2], axis = 1)
    del df1,df2
    gc.collect()
    #-------------
    # submit
    #-------------
    training_w_word_id = pd.read_csv('../data/processed/{}_w_word_id.csv'.format(category))
    submission = pd.merge(training_w_word_id, test[['item_id','word_id','predictions']], 
             on = ['item_id','word_id'], how = 'left')
    #save
    submission.to_csv(os.path.join(bath_path, 'submission.csv'), index = False)
    #-------------
    # result
    #-------------
    performaance_dashboard(submission)

train-f1: 0.9990050065543219
train-precision: 0.9990523422939634
train-recall: 0.9989576753000632
val-f1: 0.9954545454545455
val-precision: 0.9957374254049446
val-recall: 0.9951718261857427
test-f1: 0.9945188450019777
test-precision: 0.9946874646772917
test-recall: 0.9943502824858758
train-f1: 0.9993421052631579
train-precision: 0.9993421052631579
train-recall: 0.9993421052631579
val-f1: 0.9896907216494845
val-precision: 0.9940828402366864
val-recall: 0.9853372434017595
test-f1: 0.9859154929577465
test-precision: 0.9912854030501089
test-recall: 0.9806034482758621
train-f1: 0.9974827768945416
train-precision: 0.9977093745267303
train-recall: 0.9972562821677263
val-f1: 0.993621906624713
val-precision: 0.9937064126552134
val-recall: 0.9935374149659864
test-f1: 0.9919512437481369
test-precision: 0.9921155502550851
test-recall: 0.9917869916545238
train-f1: 0.9984583761562179
train-precision: 0.9984583761562179
train-recall: 0.9984583761562179
val-f1: 0.9891472868217054
val-precision: 0.9922

In [143]:
for category in os.listdir('/data/ner_task/result_for_brand_detection_model/')[:]:
    print ('=================category=================== : {}'.format(category))
    path = "/data/ner_task/result_for_brand_detection_model/{}/predictions_wo_causal_cnn_char/submission.csv".format(category)
    submission = pd.read_csv(path)
    # train
    train = submission[submission.eval_set == 'train']
    train = train[train.label != 0]
    seen_brand_set_in_training = set(train[train.label != 0].groupby('item_id').apply(lambda x: ' '.join(x.tokens.tolist()[:])).unique().tolist())
    train.reset_index(drop = True,inplace = True)
    # test
    test = submission[submission.eval_set == 'test']
    test = test[test.label != 0]
    seen_brand_set_in_testing = set(test[test.label != 0].groupby('item_id').apply(lambda x: ' '.join(x.tokens.tolist()[:])).unique().tolist())
    test.reset_index(drop = True,inplace = True)
    # val
    val = submission[submission.eval_set == 'val']
    val = val[val.label != 0]
    seen_brand_set_in_validating = set(val[val.label != 0].groupby('item_id').apply(lambda x: ' '.join(x.tokens.tolist()[:])).unique().tolist())
    val.reset_index(drop = True,inplace = True)
    # analysis
    unseen_brand_for_model = seen_brand_set_in_testing-seen_brand_set_in_training
    unseen_brand_for_model.update(seen_brand_set_in_validating-seen_brand_set_in_training)
    print ('unseen_brand_for_model : {}'.format(unseen_brand_for_model))
    print ('number of unseen brand for model : {}'.format(len(unseen_brand_for_model)))
    # save the case that our model classified wrongly
    wrong_test = submission[submission.eval_set == 'test']
    wrong_test = wrong_test[wrong_test.label!=wrong_test.predictions]
    print ('number of sku model classified wrongly : {}'.format(wrong_test.item_id.nunique()))
    print ('percentage of wrong classificaiton : {}'.format(wrong_test.item_id.nunique()/ submission.item_id.nunique()))
    path = "/data/ner_task/result_for_brand_detection_model/{}/predictions_wo_causal_cnn_char/case_our_model_classified_wrongly.csv".format(category)
    wrong_test.to_csv(path,index = False)
    
    
    
    

unseen_brand_for_model : {'etude'}
number of unseen brand for model : 1
number of sku model classified wrongly : 50
percentage of wrong classificaiton : 0.001135615162733653
unseen_brand_for_model : {'kafis shop', 'doublec fashion', 'batik alhadi'}
number of unseen brand for model : 3
number of sku model classified wrongly : 9
percentage of wrong classificaiton : 0.0023407022106631988
unseen_brand_for_model : set()
number of unseen brand for model : 0
number of sku model classified wrongly : 124
percentage of wrong classificaiton : 0.0016796705678370177
unseen_brand_for_model : {'kafis shop'}
number of unseen brand for model : 1
number of sku model classified wrongly : 8
percentage of wrong classificaiton : 0.0010247214038683233
unseen_brand_for_model : set()
number of unseen brand for model : 0
number of sku model classified wrongly : 167
percentage of wrong classificaiton : 0.0008961919890095737


In [79]:
unseen_brand_for_model.update(tmp)

# manually check the wrong result on testing set

In [144]:
wrong_test

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id,predictions
36072,advan s4t bukan oppo samsung vivo sony,advan,2,test,advan,3007,7186,0
129990,bagus nian oppo a3s a33 neo 7 a83 redmi 6a 6 samsung j8 2018,samsung,2,test,samsung,12466,16814,0
130059,bagus nian pubg case samsung s9 s8 a8 iphone x 6 7 oppo f7 f9 v9 a3s y71 mi6 dll,oppo,2,test,oppo,12472,8272,0
130164,bagus oppo a3s a33 neo 7 a83 redmi 6a 6 samsung j8 2018,oppo,0,test,oppo,12484,8272,2
130177,bagus oppo a3s a33 neo 7 a83 redmi 6a 6 samsung j8 2018,samsung,2,test,samsung,12484,16814,0
130295,bagus pubg case samsung s9 s8 a8 iphone x 6 7 oppo f7 f9 v9 a3s y71 mi6 dll,oppo,2,test,oppo,12494,8272,0
136369,baru brandcode b11 mate8 android 3g samsung iphone hp murah mito,samsung,2,test,samsung,12941,16814,0
141456,baru hp murah handphone meriah android 3g bs bbm merek brandcode mirip samsung j1 ace,samsung,2,test,samsung,13285,16814,0
162535,baterai handphone oppo f1s a59 blp601 batre hp battery lenovo original,oppo,2,test,oppo,14666,8272,0
174541,beli 2 gratis 1 huamei nova 3i ram 4 128gb garansi resmi huawei,huawei,2,test,huawei,15646,15883,0


In [140]:
submission[submission.item_id == 18557]

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id,predictions
215484,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,big,0,test,big,18557,18953,0
215485,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,promo,0,test,promo,18557,13374,0
215486,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,best,0,test,best,18557,22373,0
215487,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,seller,0,test,seller,18557,19736,0
215488,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,hp,0,test,hp,18557,6746,0
215489,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,handphone,0,test,handphone,18557,4498,0
215490,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,android,0,test,android,18557,8822,0
215491,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,vivo,2,test,vivo,18557,2360,0
215492,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,v5,0,test,v5,18557,13176,0
215493,big promo best seller hp handphone android vivo v5 20mp ram 432 mirip oppo f1s xiaomi samsung,20mp,0,test,20mp,18557,9921,0


In [None]:
# I go back to check this list