In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import json
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
from import_casa import casa
from casa import Cadet

In [4]:
# Load the Cadet detector from the "cadet/op20.2" directory under the casa data path.
cadet_model_dir = casa.get_data_path() / "cadet/op20.2"
cadet = Cadet.load(cadet_model_dir)
# Echo the object so its repr is shown as the cell output.
cadet

[INFO] 2021-07-08 11:29:24,448 gensim.utils: loading KeyedVectors object from E:\LangOn\casa\data\cadet\op20.2\ft-2020.kv
[INFO] 2021-07-08 11:29:25,338 gensim.utils: setting ignored attribute vectors_norm to None
[INFO] 2021-07-08 11:29:25,339 gensim.utils: setting ignored attribute vectors_vocab_norm to None
[INFO] 2021-07-08 11:29:25,340 gensim.utils: setting ignored attribute vectors_ngrams_norm to None
[INFO] 2021-07-08 11:29:25,341 gensim.utils: setting ignored attribute buckets_word to None
[INFO] 2021-07-08 11:29:25,466 gensim.utils: FastTextKeyedVectors lifecycle event {'fname': 'E:\\LangOn\\casa\\data\\cadet\\op20.2\\ft-2020.kv', 'datetime': '2021-07-08T11:29:25.466865', 'gensim': '4.0.0', 'python': '3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'loaded'}


<Cadet: FastText(9851, 200), Seeds(79)>

In [5]:
# Probe the detector with a dummy string.
# NOTE(review): summary=False presumably returns the untruncated candidate lists — confirm against Cadet.detect.
probe_text = "12345"
ss = cadet.detect(probe_text, summary=False)

In [8]:
# Number of service candidates returned for the probe string.
service_candidates = ss["service"]
len(service_candidates)

25

## Evaluate Cadet on the annotated aspect tuples

In [5]:
# Annotated aspect tuples read from the 20210628 backup folder of the annotation data.
tuples_csv = casa.get_data_path() / "annot_data/annotated_data_bkup/20210628/aspect_tuples_20210628.csv"
tuples = pd.read_csv(tuples_csv, index_col=None)

In [6]:
# Run Cadet on every non-context annotated tuple and collect, in parallel:
#   aspect_ans  - gold (entity, attribute) pairs
#   aspect_pred - predicted (entity, attribute) pairs
#   pred_items  - flat records combining gold labels, predictions and raw texts
aspect_ans, aspect_pred, pred_items = [], [], []

for row in tqdm(tuples.itertuples(), total=tuples.shape[0]):
    # Rows flagged as context are excluded from the evaluation.
    if row.is_context:
        continue

    gold_ent, gold_attr = row.ent_norm, row.attr_norm
    ent_text, attr_text = row.ent_rawtext, row.attr_rawtext

    # Skip the row unless all four fields are strings
    # (presumably filters out NaN from empty CSV cells).
    required_fields = (gold_ent, gold_attr, ent_text, attr_text)
    if any(not isinstance(field, str) for field in required_fields):
        continue

    # Replace the variant character "臺" with "台" in the gold entity label.
    gold_ent = gold_ent.replace("臺", "台")

    # Detect on the raw entity and attribute texts joined by a single space.
    detection = cadet.detect(" ".join((ent_text, attr_text)))

    # Keep only the first-listed entity and service candidates.
    top_entity = detection["entity"][0]
    top_service = detection["service"][0]
    # Build "[<first two characters of the category>]<service name>".
    service_label = f"[{top_service[0][:2]}]{top_service[1]}"

    aspect_ans.append((gold_ent, gold_attr))
    aspect_pred.append((top_entity, service_label))
    pred_items.append(
        (gold_ent, top_entity, ent_text, gold_attr, service_label, attr_text)
    )

HBox(children=(FloatProgress(value=0.0, max=3592.0), HTML(value='')))




In [7]:
# One row per evaluated tuple; column order matches the tuples appended to pred_items.
pred_columns = "ent,ent_pred,ent_text,attr,attr_pred,attr_text".split(",")
preds = pd.DataFrame.from_records(pred_items, columns=pred_columns)

In [8]:
# Restrict to rows whose raw attribute text is exactly "訊號", then
# cross-tabulate gold attribute (rows) against predicted attribute (columns).
is_signal_text = preds["attr_text"] == "訊號"
data5g = preds.loc[is_signal_text]
pd.crosstab(data5g["attr"], data5g["attr_pred"])

[INFO] 2021-07-06 17:26:18,272 numexpr.utils: NumExpr defaulting to 8 threads.


attr_pred,[通訊]網速
attr,Unnamed: 1_level_1
[通訊]國內電信漫遊,3
[通訊]涵蓋,101
[通訊]網速,69
[通訊]頻段,3


In [9]:
# Show the misclassified rows whose gold attribute is "[資費]續約攜碼".
gold_is_target = preds["attr"] == "[資費]續約攜碼"
is_mismatch = preds["attr"] != preds["attr_pred"]
preds.loc[gold_is_target & is_mismatch, :]

Unnamed: 0,ent,ent_pred,ent_text,attr,attr_pred,attr_text
74,遠傳電信,遠傳電信,遠傳,[資費]續約攜碼,[資費]低資費方案,月租999
213,遠傳電信,遠傳電信,遠傳,[資費]續約攜碼,[資費]方案活動,資費
214,遠傳電信,遠傳電信,遠傳,[資費]續約攜碼,[資費]方案活動,588退傭
281,台灣之星,台灣之星,台星,[資費]續約攜碼,[資費]方案活動,188續約
329,中華電信,中華電信,中華,[資費]續約攜碼,[通訊]網速,可連接性
511,中華電信,中華電信,中華,[資費]續約攜碼,[資費]方案活動,488或588
512,中華電信,中華電信,中華,[資費]續約攜碼,[資費]方案活動,488
513,中華電信,中華電信,中華,[資費]續約攜碼,[資費]方案活動,488
515,中華電信,中華電信,中華,[資費]續約攜碼,[資費]方案活動,588
932,台灣之星,台灣之星,台星,[資費]續約攜碼,[資費]方案活動,續約iphone之類的方案


In [10]:
# Entity-level report: compare the first element of each gold / predicted pair.
entity_gold = [gold_ent for gold_ent, _gold_attr in aspect_ans]
entity_guess = [guess_ent for guess_ent, _guess_attr in aspect_pred]
print(classification_report(entity_gold, entity_guess))

              precision    recall  f1-score   support

        中華電信       0.96      1.00      0.98      1060
        亞太電信       1.00      0.97      0.99       105
        台灣之星       0.99      0.97      0.98       154
       台灣大哥大       1.00      0.94      0.97       216
        遠傳電信       1.00      0.89      0.94       236

    accuracy                           0.97      1771
   macro avg       0.99      0.95      0.97      1771
weighted avg       0.97      0.97      0.97      1771



In [11]:
# Category-level report: the first four characters of an attribute label
# are its bracketed category prefix, e.g. "[通訊]".
category_gold = [gold_attr[:4] for _gold_ent, gold_attr in aspect_ans]
category_guess = [guess_attr[:4] for _guess_ent, guess_attr in aspect_pred]
print(classification_report(category_gold, category_guess))

              precision    recall  f1-score   support

        [其他]       0.79      0.57      0.66       102
        [加值]       0.52      0.84      0.64       118
        [資費]       0.83      0.85      0.84       482
        [通訊]       0.94      0.89      0.91      1069

    accuracy                           0.86      1771
   macro avg       0.77      0.79      0.77      1771
weighted avg       0.87      0.86      0.86      1771



In [12]:
# Attribute-level report over the full labels.
# zero_division=0 reports 0 for undefined metrics instead of emitting a warning.
srv_ans = [gold_attr for _gold_ent, gold_attr in aspect_ans]
srv_pred = [guess_attr for _guess_ent, guess_attr in aspect_pred]
srv_report = classification_report(srv_ans, srv_pred, zero_division=0)
print(srv_report)

              precision    recall  f1-score   support

      [其他]固網       0.57      0.80      0.67        10
      [其他]客服       0.93      0.59      0.72        44
    [其他]帳單繳費       1.00      0.08      0.14        13
      [其他]手機       0.73      0.69      0.71        32
      [其他]資安       0.00      0.00      0.00         3
    [加值]esim       0.17      0.20      0.18         5
  [加值]vowifi       0.47      0.54      0.50        13
    [加值]來電答鈴       1.00      0.67      0.80         3
    [加值]國際漫遊       0.52      0.78      0.62        18
    [加值]影音娛樂       0.63      0.65      0.64        48
    [加值]手機保險       0.00      0.00      0.00         2
    [加值]智慧音箱       0.00      0.00      0.00         0
     [加值]物聯網       0.00      0.00      0.00         0
    [加值]行動支付       0.23      0.33      0.27        18
   [加值]電信APP       0.04      0.18      0.06        11
   [資費]低資費方案       0.28      0.57      0.37        42
      [資費]升級       0.00      0.00      0.00        10
    [資費]方案活動       0.72    

In [13]:
# Merge two pairs of attribute labels (each key is folded into its value)
# and re-score with the merged label set.
srv_mapper = {
    "[通訊]涵蓋": "[通訊]網速",
    "[資費]低資費方案": "[資費]方案活動",
}


def _merge_labels(labels):
    """Return `labels` with every key of `srv_mapper` replaced by its mapped value."""
    return [srv_mapper.get(label, label) for label in labels]


srv_ans_2 = _merge_labels(srv_ans)
srv_pred_2 = _merge_labels(srv_pred)
print(classification_report(srv_ans_2, srv_pred_2, zero_division=0))

              precision    recall  f1-score   support

      [其他]固網       0.57      0.80      0.67        10
      [其他]客服       0.93      0.59      0.72        44
    [其他]帳單繳費       1.00      0.08      0.14        13
      [其他]手機       0.73      0.69      0.71        32
      [其他]資安       0.00      0.00      0.00         3
    [加值]esim       0.17      0.20      0.18         5
  [加值]vowifi       0.47      0.54      0.50        13
    [加值]來電答鈴       1.00      0.67      0.80         3
    [加值]國際漫遊       0.52      0.78      0.62        18
    [加值]影音娛樂       0.63      0.65      0.64        48
    [加值]手機保險       0.00      0.00      0.00         2
    [加值]智慧音箱       0.00      0.00      0.00         0
     [加值]物聯網       0.00      0.00      0.00         0
    [加值]行動支付       0.23      0.33      0.27        18
   [加值]電信APP       0.04      0.18      0.06        11
      [資費]升級       0.00      0.00      0.00        10
    [資費]方案活動       0.71      0.83      0.76       379
     [資費]月租費       0.41    

In [14]:
# Load the benchmark predictions stored as JSON under the casa data path.
benchmark_json = casa.get_data_path() / "models/bert_aspect_extraction/benchmark_predictions.json"
with open(benchmark_json, 'r', encoding="UTF-8") as fin:
    bdata = json.load(fin)

In [30]:
# Run the detector on one free-text sample and display the full result dict.
sample_post = "瑞普·特爾傑 - 我是不是這個月載太多遊戲包被中華電信限速了"
cadet.detect(sample_post)

{'entity': ['中華電信', '無框行動', '台灣大哥大', '遠傳電信', '亞太電信', '台灣之星'],
 'entity_probs': array([0.93761204, 0.01317898, 0.01302011, 0.01284649, 0.01182457,
        0.01151781]),
 'service': [('通訊品質', '限速'),
  ('加值服務', '電信APP'),
  ('資費方案', '月租費'),
  ('資費方案', '方案活動'),
  ('通訊品質', '網速')],
 'service_probs': array([0.06958488, 0.05686023, 0.05511549, 0.05511549, 0.05157847]),
 'seeds': ['100G限速', 'CARPLAY', '月租費', '綁約', '尖峰'],
 'seed_probs': array([0.0253839 , 0.02074207, 0.0201056 , 0.0201056 , 0.01881533])}