In [1]:
import numpy as np
import pandas as pd
import gc

pd.options.display.max_columns = 100
pd.options.display.max_rows = 5000
pd.options.display.max_colwidth = 1000


In [2]:
#-----------------------
# lodaing simple CNN features
#-----------------------
item_id = np.load('../models/tcns/predictions_wo_causal_cnn_char/item_id.npy')
word_id = np.load('../models/tcns/predictions_wo_causal_cnn_char/word_id.npy')
final_predictions = np.load('../models/tcns/predictions_wo_causal_cnn_char/final_predictions.npy')
final_states = np.load('../models/tcns/predictions_wo_causal_cnn_char/final_states.npy')
history_length = np.load('../models/tcns/predictions_wo_causal_cnn_char/history_length.npy')
#----------------
# merge testing result from Neural Network
#----------------
i_ids = []
w_ids = []
predictions = []
hidden_features = []
for sent, hidden_feature, sent_length, i_id, w_id in zip(final_predictions, final_states, history_length, item_id, word_id):
#     print ('i_id', i_id)
#     print ('sent_length',sent_length)
    for w, y_pred, h_f in zip(w_id[:sent_length], sent[:sent_length], hidden_feature[:sent_length]):
#         print ('w_id',w)
#         print ('y_pred_index',np.argmax(y_pred))
#         print ('h_f', len(h_f))
        i_ids.append(i_id)
        w_ids.append(w)
        predictions.append(np.argmax(y_pred))
        hidden_features.append(h_f)
hidden_features = np.array(hidden_features) # (history_lengh * num_samples, num_hidden_units)
#----------------
# output
#----------------
df1 = pd.DataFrame({
    'item_id':i_ids,
    'word_id':w_ids,
    'predictions':predictions,
})
df2 = pd.DataFrame(hidden_features, columns = ['simple_cnn_{}'.format(i+1) for i in range(hidden_features.shape[1])])
test = pd.concat([df1,df2], axis = 1)
del df1,df2
gc.collect()
#-------------
# submit
#-------------
mobile_training_w_word_id = pd.read_csv('../data/processed/mobile_training_w_word_id.csv')
submission = pd.merge(mobile_training_w_word_id, test[['item_id','word_id','predictions']], 
         on = ['item_id','word_id'], how = 'left')

# manually check the wrong result

In [3]:
val = submission[submission.eval_set == 'val']
wrong_val = val[val.label!=val.predictions]
wrong_val.shape

(218, 8)

In [4]:
val[val.item_name == 'cuci gudang charger hp lenovo handphone ori original 100']

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id,predictions
499325,cuci gudang charger hp lenovo handphone ori original 100,cuci,0,val,cuci,43059,7205,0
499326,cuci gudang charger hp lenovo handphone ori original 100,gudang,0,val,gudang,43059,23973,0
499327,cuci gudang charger hp lenovo handphone ori original 100,charger,0,val,charger,43059,8858,0
499328,cuci gudang charger hp lenovo handphone ori original 100,hp,0,val,hp,43059,10914,0
499329,cuci gudang charger hp lenovo handphone ori original 100,lenovo,2,val,lenovo,43059,17441,2
499330,cuci gudang charger hp lenovo handphone ori original 100,handphone,0,val,handphone,43059,19692,0
499331,cuci gudang charger hp lenovo handphone ori original 100,ori,0,val,ori,43059,8228,0
499332,cuci gudang charger hp lenovo handphone ori original 100,original,0,val,original,43059,20520,0
499333,cuci gudang charger hp lenovo handphone ori original 100,100,0,val,100,43059,2955,0


In [5]:
def performaance_dashboard(df):
    for eval_set in ['train', 'val', 'test']:
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for i_id, groupby_df in df[df.eval_set == eval_set].groupby('item_id'):
            y_true = list(groupby_df.label)
            y_pred = list(groupby_df.predictions)
        #     print ('item_id', i_id)
        #     print ('y_true', y_true)
        #     print ('y_pred', y_pred)
            if all(v == 0 for v in y_true):
                pass
            else:
                total_correct += 1.0
                if y_true == y_pred:
                    correct_preds += 1.0
            if all(v == 0 for v in y_pred):
                pass
            else:
                total_preds += 1.0
        #----------
        # output
        #----------
        p   = correct_preds / total_preds if correct_preds > 0 else 0
        r   = correct_preds / total_correct if correct_preds > 0 else 0
        f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
        print('{}-f1: {}'.format(eval_set,f1))
        print('{}-precision: {}'.format(eval_set,p))
        print('{}-recall: {}'.format(eval_set,r))


In [6]:
performaance_dashboard(submission)

train-f1: 0.993844901087321
train-precision: 0.9981778929707561
train-recall: 0.9895493647945345
val-f1: 0.9940461758129261
val-precision: 0.9979445015416238
val-recall: 0.9901781880635466
test-f1: 0.9777380565485863
test-precision: 0.980446472217696
test-recall: 0.9750445632798574


In [7]:
train_brand = set(submission[(submission.eval_set == 'test') & (submission.label == 2)].clean_tokens.unique())
train_brand

{'advan',
 'apple',
 'asus',
 'blackberry',
 'blackview',
 'brandcode',
 'doogee',
 'evercoss',
 'htc',
 'huawei',
 'icherry',
 'lenovo',
 'lg',
 'meizu',
 'mito',
 'motorola',
 'nokia',
 'oppo',
 'oukitel',
 'polytron',
 'samsung',
 'smartfren',
 'sony',
 'ulefone',
 'vivo',
 'xiaomi'}

In [8]:
test_brand = set(submission[(submission.eval_set == 'train') & (submission.label == 2)].clean_tokens.unique())
test_brand

{'advan',
 'apple',
 'asus',
 'blackberry',
 'blackview',
 'brandcode',
 'doogee',
 'evercoss',
 'htc',
 'huawei',
 'icherry',
 'lenovo',
 'lg',
 'meizu',
 'mito',
 'motorola',
 'nokia',
 'oem',
 'oppo',
 'oukitel',
 'polytron',
 'samsung',
 'smartfren',
 'sony',
 'ulefone',
 'vivo',
 'xiaomi'}

In [9]:
submission[submission.clean_tokens == 'oem']

Unnamed: 0,item_name,tokens,label,eval_set,clean_tokens,item_id,word_id,predictions
166871,ab tarvel charger handphone asus zenfone 2 4 5 padfone casan hp original oem,oem,0,train,oem,13662,22611,0
167487,acc hp cdt hosing housing samsung b3210 corby fullset oem,oem,0,train,oem,13719,22611,0
168781,accesoris hp book cover samsung galaxy tab a 80 sarung oem by,oem,0,val,oem,13843,22611,0
168795,accesoris hp book cover samsung galaxy tab a7 2016 j max t285 oem by,oem,0,train,oem,13844,22611,0
168841,accesoris hp oppo a37 spigen rugged armor extra black oem by rafanzacollection,oem,0,train,oem,13848,22611,0
169198,accesoris hp spigen slim armor ipad mini 123 oem by rafanzacollection,oem,2,train,oem,13874,22611,2
170551,adhesive lem 3m lcd atau depan sony xperia z3 z4 original oem,oem,0,train,oem,13988,22611,0
170567,adhesive lem 3m lcd depan dan backdoor belakang set sony xperia z c6602 c6603 original oem,oem,0,train,oem,13989,22611,0
170582,adhesive lem 3m lcddepan backdoorbelakang set sony xperia z ultra c6802 original oem,oem,0,train,oem,13990,22611,0
170598,adhesive lem 3m lcddepan backdoorbelakang set sony xperia z3 mini compact d5803 original oem,oem,0,train,oem,13991,22611,0


In [10]:
# Q2: 表現能0.99, 是因為所有的brand在train都看過了, model強記ＸＤ. 
# My opinion 很像也不意外, 因為我們tagging就是來自lazada data, 所以f1=0.99代表我們的model很好的學起pattern that which word
# will be the brand that popping out in b
