In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from import_casa import casa
from casa import Cadet

In [3]:
import pickle
import numpy as np
import pandas as pd
import ast
from collections import Counter

## Sample statistics

In [7]:
# Load the pickled thread collection for Jan-May 2021 from the project data directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project.
threads_path = casa.get_data_path() / "threads/cht2021-JanMay.pkl"
with threads_path.open("rb") as fin:
    op21 = pickle.load(fin)

In [10]:
# Number of threads, and total number of opinions across all threads.
thread_count = len(op21)
opinion_count = sum(1 for thread in op21 for _op in thread.opinions())
thread_count, opinion_count

(23886, 105236)

In [12]:
# Total length of all opinion texts, summed over every thread.
sum(
    len(opinion.text)
    for thread in op21
    for opinion in thread.opinions()
)

7238676

## Sample opinions (every 20)

In [4]:
# Load the pickled "every20" opinion sample from the project data directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project.
sample_path = casa.get_data_path() / "threads/cht2021-JanMay-op-every20.pkl"
with sample_path.open("rb") as fin:
    op_sample = pickle.load(fin)

In [5]:
# Load the pickled BERT aspect-extraction logits computed for the sampled opinions.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project.
logits_path = (
    casa.get_data_path()
    / "models/bert_aspect_extraction/0629/every20_bert_logits.pkl"
)
with logits_path.open("rb") as fin:
    samp_logits = pickle.load(fin)

In [6]:
# Read the evaluated-span table; the annotation loop later parses its
# `pred_eval_spans` column, one row per processed opinion.
spans_path = (
    casa.get_data_path()
    / "models/bert_aspect_extraction/0629/every20_evalspans.csv"
)
spans = pd.read_csv(spans_path, index_col=None)

In [7]:
# (rows, columns) of the evaluated-span table.
spans.shape

(5387, 5)

In [8]:
# Count the sampled opinions whose text is non-empty.
sum(1 for op in op_sample if op.text)

5387

In [9]:
# Number of logit arrays; this should equal the number of span rows and of
# non-empty opinions, because the annotation loop indexes all three in lockstep.
len(samp_logits)

5387

In [10]:
# Compare the first opinion's text length with the shape of the first logit array.
# NOTE(review): presumably axis 0 of the logits is the padded token sequence and
# axis 1 the tag classes - confirm against the model code.
len(op_sample[0].text), samp_logits[0].shape

(275, (512, 5))

In [11]:
# Load the Cadet model stored under the project data directory and show its summary repr.
cadet_dir = casa.get_data_path() / "cadet/op20.2"
cadet = Cadet.load(cadet_dir)
cadet

[INFO] 2021-07-07 16:01:52,332 gensim.utils: loading KeyedVectors object from E:\LangOn\casa\data\cadet\op20.2\ft-2020.kv
[INFO] 2021-07-07 16:01:53,215 gensim.utils: setting ignored attribute vectors_norm to None
[INFO] 2021-07-07 16:01:53,215 gensim.utils: setting ignored attribute vectors_vocab_norm to None
[INFO] 2021-07-07 16:01:53,216 gensim.utils: setting ignored attribute vectors_ngrams_norm to None
[INFO] 2021-07-07 16:01:53,217 gensim.utils: setting ignored attribute buckets_word to None
[INFO] 2021-07-07 16:01:53,329 gensim.utils: FastTextKeyedVectors lifecycle event {'fname': 'E:\\LangOn\\casa\\data\\cadet\\op20.2\\ft-2020.kv', 'datetime': '2021-07-07T16:01:53.328008', 'gensim': '4.0.0', 'python': '3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'loaded'}


<Cadet: FastText(9851, 200), Seeds(79)>

In [17]:
from tqdm.auto import tqdm

# Minimum top probability required before a CADET label is attached to an
# opinion; below the threshold the attribute is set to None.
ENT_PROB_THRESHOLD = 0.3
SRV_PROB_THRESHOLD = 0.1


def _majority_polarity(pred_span):
    """Return the majority polarity of a list of predicted spans.

    Each span's first element is its label. Spans labelled "Positive" vote
    positive; every other label votes negative.
    NOTE(review): any non-"Positive" label counts as negative - confirm the
    label set only contains "Positive" and "Negative".

    Returns "Positive" or "Negative" for a strict majority, and "Neutral" for
    a tie or an empty list.
    """
    if not pred_span:
        return "Neutral"
    n_pos = [x[0] for x in pred_span].count("Positive")
    n_neg = len(pred_span) - n_pos
    if n_pos > n_neg:
        return "Positive"
    if n_neg > n_pos:
        return "Negative"
    return "Neutral"


srv_list = list(cadet.lexicon.get_services().keys())
ent_list = list(cadet.lexicon.get_entities().keys())
cadet_failed_ids = []

# `spans` and `samp_logits` are indexed by a running counter over the
# non-blank opinions, so all three must have the same length. Check this
# before mutating any opinion object.
n_nonblank = sum(1 for op in op_sample if op.text.strip())
assert n_nonblank == len(spans) == len(samp_logits), (
    f"misaligned inputs: {n_nonblank} non-blank opinions, "
    f"{len(spans)} span rows, {len(samp_logits)} logit arrays"
)

# Pull the column out once instead of materialising a row Series per iteration.
span_literals = spans["pred_eval_spans"].tolist()

counter = 0
for op_x in tqdm(op_sample):
    # Blank opinions are skipped and receive none of the attributes below.
    if not op_x.text.strip():
        continue

    res = cadet.detect(op_x.text, summary=False)
    ent_probs = res["entity_probs"]
    srv_probs = res["service_probs"]

    top_ent = np.argmax(ent_probs)
    top_srv = np.argmax(srv_probs)
    ent_maxp = np.max(ent_probs)
    srv_maxp = np.max(srv_probs)
    M = len(op_x.text)
    # The CSV stores the predicted spans as a Python literal string.
    pred_span = ast.literal_eval(span_literals[counter])
    polarity = _majority_polarity(pred_span)

    op_x.cadet_entity = ent_list[top_ent] if ent_maxp > ENT_PROB_THRESHOLD else None
    op_x.cadet_service = srv_list[top_srv] if srv_maxp > SRV_PROB_THRESHOLD else None
    op_x.ent_probs = ent_probs
    op_x.srv_probs = srv_probs
    # Keep rows 1..M of the logits (row 0 is dropped).
    # NOTE(review): presumably row 0 is a leading special token and there is
    # one row per character - confirm. Texts longer than the logit array are
    # silently truncated by the slice.
    op_x.tok_logits = samp_logits[counter][1:M+1, :]
    op_x.pred_span = pred_span
    op_x.pred_polarity = polarity

    counter += 1

HBox(children=(FloatProgress(value=0.0, max=5405.0), HTML(value='')))




In [18]:
# Sanity check: the number of processed (non-blank) opinions should equal the
# number of rows in the span table.
counter, len(spans)

(5387, 5387)

In [19]:
# Persist the sampled opinions, now carrying the attributes set by the
# annotation loop, to a separate pickle file.
attr_path = casa.get_data_path() / "threads/cht2021-JanMay-op-every20-attr.pkl"
with attr_path.open("wb") as fout:
    pickle.dump(op_sample, fout)

In [20]:
# Counter is already imported in the top import cell; re-imported here as in the original cell.
from collections import Counter

# Tally the assigned CADET entity per opinion. Opinions without the attribute
# (e.g. skipped blank texts) are counted under None.
ent_counter = Counter(
    getattr(op, "cadet_entity", None) for op in op_sample
)

In [21]:
# Entity labels ordered by frequency; None covers opinions with no entity assigned.
ent_counter.most_common()

[(None, 3064),
 ('中華電信', 763),
 ('遠傳電信', 515),
 ('台灣大哥大', 401),
 ('台灣之星', 386),
 ('亞太電信', 261),
 ('無框行動', 15)]

In [92]:
from itertools import islice

# Take the first five opinions whose assigned CADET entity matches the target
# label; the generator is lazy, so scanning stops after the fifth match.
matching_ops = (
    op for op in op_sample
    if getattr(op, "cadet_entity", None) == "台灣大哥大"
)
xx = list(islice(matching_ops, 5))

In [93]:
# Text of the first opinion in the five-item sample selected above.
xx[0].text

'台哥大平均網速不快幹嘛辦之前用了多年中華,合約到期有去申辦台哥大及遠傳試用卡,遠傳平均網速比台哥大快多了,不下於中華！就NP到遠傳,也順利申辦到遠傳電銷319不限速吃到飽專案！剛剛測的'

In [50]:
# Run the detector on a single probe sentence with verbose output enabled.
probe_text = "就是中華電信，絕對的，其他別想了"
cadet.detect(probe_text, verbose=True)

tokens:  ['就', '中華電信', '絕對', '其他', '別', '想', '了']
ent_scores [1.         0.72315866 0.70973432 0.60055864 0.62684321 0.73528677]
srv_scores [0.37546813 0.31578273 0.40386418 0.34264758 0.47839317 0.32973254
 0.29867277 0.34706154 0.35138562 0.36617967 0.42078739 0.30622414
 0.37394285 0.15872674 0.19507323 0.27586547 0.33051437 0.35387656
 0.29947039 0.17197542 0.16918962 0.41623956 0.25886106 0.20634082
 0.28248078 0.3069101  0.        ]
seed_scores(topn) [0.47839317 0.47839317 0.42078739 0.41623956 0.40386418]


{'entity': ['中華電信', '無框行動', '台灣大哥大', '遠傳電信', '亞太電信', '台灣之星'],
 'entity_probs': array([0.93761204, 0.01317898, 0.01302011, 0.01284649, 0.01182457,
        0.01151781]),
 'service': [('資費方案', '升級'),
  ('加值服務', '行動支付'),
  ('其他', '手機'),
  ('資費方案', '方案活動'),
  ('資費方案', '低資費方案')],
 'service_probs': array([0.05181615, 0.04617739, 0.04575928, 0.0446406 , 0.04217602]),
 'seeds': ['轉4G', '升4G', '支付APP', 'android', '購機'],
 'seed_probs': array([0.01779091, 0.01779091, 0.01585486, 0.01571131, 0.01532721])}