In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from icecream import ic
from import_casa import casa
from matplotlib import pyplot as plt
from casa import Cadence, Cadet, Crystal, MTBert
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Build the Cadence pipeline from its JSON config. The path is relative to the
# notebook's working directory; the load log shows the Cadet and Crystal
# components being loaded as part of this call.
cadence = Cadence.load(
    "../../data/cadence/config.json"
)

[INFO] 2021-09-01 13:36:39,181 casa.Cadence: Loading Cadet
[INFO] 2021-09-01 13:36:39,196 gensim.utils: loading KeyedVectors object from ..\..\data\cadence\..\cadet\op20.3\ft-2020.kv
[INFO] 2021-09-01 13:36:39,496 gensim.utils: setting ignored attribute vectors_norm to None
[INFO] 2021-09-01 13:36:39,497 gensim.utils: setting ignored attribute vectors_vocab_norm to None
[INFO] 2021-09-01 13:36:39,498 gensim.utils: setting ignored attribute vectors_ngrams_norm to None
[INFO] 2021-09-01 13:36:39,498 gensim.utils: setting ignored attribute buckets_word to None
[INFO] 2021-09-01 13:36:39,612 gensim.utils: FastTextKeyedVectors lifecycle event {'fname': '..\\..\\data\\cadence\\..\\cadet\\op20.3\\ft-2020.kv', 'datetime': '2021-09-01T13:36:39.612075', 'gensim': '4.0.0', 'python': '3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'loaded'}
[INFO] 2021-09-01 13:36:39,638 casa.Cadence: Loading Crystal
[INFO] 2021-09

In [4]:
from casa.cadence.resolvers import CadenceBertOnlyResolver, CadenceSimpleResolver, CadenceMultiResolver
from casa.cadence import visualize_tokens

## Sample 1

In [60]:
# Sample 1, "simple" strategy. The bare `out.aspects` on the last line is what
# the cell displays; `out` is kept as the shared name that later cells inspect.
out = cadence.analyze(
    "中華超划算，而且最快",
    strategy="simple",
)
out.aspects

[['中華電信', '[通訊]網速', 'Positive', 'crystal', 'crystal']]

In [31]:
# Sample 1 again, this time with the "multiple" strategy, for comparison with
# the "simple" run on the same sentence. Rebinds `out` for the cells below.
out = cadence.analyze(
    "中華超划算，而且最快",
    strategy="multiple",
)
out.aspects

[('中華電信', '[資費]續約攜碼', 'Positive'), ('中華電信', '[通訊]網速', 'Positive')]

In [32]:
# Highlight the characters of the current `out`. Judging by the recorded output,
# positions whose pn_prob is below the second argument (0.8 here) get
# pn_idx == -1 and are left uncoloured.
visualize_tokens(out, 0.8)

中華[32m超[0m[32m划[0m[32m算[0m，而[32m且[0m[32m最[0m[32m快[0m


{'pn_prob': array([2.2194027e-03, 6.2370958e-04, 9.9951661e-01, 9.9337065e-01,
        9.4366127e-01, 1.6342182e-01, 7.2830623e-01, 8.4772754e-01,
        9.9723649e-01, 8.9129704e-01], dtype=float32),
 'pn_idx': array([-1, -1,  0,  0,  0, -1, -1,  0,  0,  0], dtype=int64)}

In [39]:
# Entities attached to the most recent analysis held in `out`.
out.entities

['中華電信']

## Sample 2

In [70]:
# Sample 2, "simple" strategy: a sentence that names an operator but no
# explicit service aspect (the recorded aspect field is None).
out = cadence.analyze(
    "遠傳最近是在哈囉",
    strategy="simple",
)
out.aspects

[['遠傳電信', None, 'Negative', 'cadet', 'crystal']]

In [71]:
# Sample 2 with the "multiple" strategy, for comparison with the "simple" run
# on the same sentence.
out = cadence.analyze(
    "遠傳最近是在哈囉",
    strategy="multiple",
)
out.aspects

[('遠傳電信', None, 'Negative')]

## Sample 3

In [119]:
# Sample 3, "simple" strategy: one sentence mentioning two operators with
# opposite opinions. The recorded result contains only a single aspect row.
out = cadence.analyze("亞太網路超差，中華收訊就很好", strategy="simple")
out.aspects

[['中華電信', '[通訊]網速', 'Negative', 'crystal', 'crystal']]

In [120]:
# Sample 3 with the "multiple" strategy; the recorded result has one aspect per
# operator. `out` from this cell is what the inspection cells below look at.
out = cadence.analyze("亞太網路超差，中華收訊就很好", strategy="multiple")
out.aspects

[('亞太電信', '[通訊]網速', 'Negative'), ('中華電信', '[通訊]涵蓋', 'Positive')]

In [117]:
# Entities attached to the current `out` (Sample 3 in the recorded output).
out.entities

['亞太電信', '中華電信']

In [123]:
# Raw Cadet output for the current `out`: candidate entities/services/seeds
# with their probabilities, plus the tokenisation and token attributions.
out.cadet

{'entity': ['中華電信', '亞太電信', '台灣大哥大', '遠傳電信', '台灣之星', '無框行動'],
 'entity_probs': array([0.48782193, 0.48782193, 0.00635621, 0.00620852, 0.00619568,
        0.00559573]),
 'service': [('通訊品質', '網速'),
  ('通訊品質', '涵蓋'),
  ('資費方案', '低資費方案'),
  ('加值服務', 'vowifi'),
  ('加值服務', '電信APP')],
 'service_probs': array([0.44906384, 0.44906384, 0.00546045, 0.00518785, 0.0051812 ]),
 'seeds': ['網速', '覆蓋率', '訊號', '0月租', '免月租'],
 'seed_probs': array([0.3714933 , 0.3714933 , 0.00598352, 0.00451722, 0.00451722]),
 'tokens': ['亞太', '網路', '超', '差,', '中華', '收訊', '就', '很好'],
 'tokens_attrib': {'亞太電信': [0], '網速': [1], '中華電信': [4], '覆蓋率': [5]}}

In [125]:
# Raw MTBert output for the current `out`: sequence-level polarity and
# probabilities, extracted spans, and per-token probabilities.
out.mt_bert

{'text': '亞太網路超差，中華收訊就很好',
 'seq_polarity': 'Negative',
 'seq_probs': array([0.00368624, 0.00232451, 0.9939892 ], dtype=float32),
 'spans': ['超差'],
 'span_idxs': [[4, 5]],
 'span_pols': ['N'],
 'token_probs': tensor([[9.9958e-01, 1.0760e-04, 3.1391e-04],
         [9.9992e-01, 3.4762e-05, 4.5604e-05],
         [9.9666e-01, 1.3611e-04, 3.2047e-03],
         [9.9577e-01, 2.2931e-04, 3.9974e-03],
         [5.7096e-03, 1.4068e-05, 9.9428e-01],
         [2.1420e-02, 7.8692e-05, 9.7850e-01],
         [9.9966e-01, 8.1200e-06, 3.3352e-04],
         [9.9994e-01, 2.8090e-05, 2.7910e-05],
         [9.9996e-01, 1.0898e-05, 2.4490e-05],
         [9.9429e-01, 1.0880e-04, 5.6001e-03],
         [9.9923e-01, 9.7623e-05, 6.7131e-04],
         [7.6497e-01, 1.6180e-03, 2.3341e-01],
         [7.4420e-01, 2.9050e-04, 2.5551e-01],
         [8.3516e-01, 1.2149e-03, 1.6363e-01]])}

In [124]:
# Raw Crystal output for the current `out`: the chosen result plus the
# word-to-attribute map and the candidate lists it was derived from.
out.crystal

{'result': ('[通訊]網速', 1.0),
 'word_attr_map': {'超差': ('[通訊]網速', 1, 0.5), '很好': ('[通訊]涵蓋', 5, 0.3)},
 'CxG': [],
 'onto': [('超差', [('[通訊]網速', 1, 0.5), ('[通訊]涵蓋', 1, 0.5)]),
  ('很好',
   [('[其他]手機', 4, 0.3),
    ('[通訊]網速', 4, 0.3),
    ('[通訊]涵蓋', 5, 0.3),
    ('[通訊]涵蓋', 4, 0.1)])]}

In [110]:
# TODO: unexpected result. The sentence praises 亞太 for price and 中華 for
# network speed, yet the "simple" strategy yields a single aspect that pairs
# 中華電信 with a pricing category ([資費]方案活動). Cause not yet identified.
out = cadence.analyze(
    "亞太很划算，但中華網速很快",
    strategy="simple",
)
out

<CadenceOutput: [('中華電信', '[資費]方案活動', 'Positive')]>

In [113]:
# "multiple" strategy on a variant sentence. Note the text differs from the
# "simple" run just above (…很順暢 rather than …網速很快), so the two outputs
# are not a like-for-like comparison of strategies.
out = cadence.analyze(
    "亞太很划算，但中華很順暢",
    strategy="multiple",
)
out

<CadenceOutput: [('中華電信', '[通訊]網速', 'Positive')]>

In [114]:
# Raw Crystal output for the current `out` (the 很順暢 variant in the recorded
# output), showing the candidate attributes found for each opinion word.
out.crystal

{'result': ('[通訊]網速', 5.0),
 'word_attr_map': {'很順暢': ('[通訊]網速', 5, 1.0)},
 'CxG': [],
 'onto': [('很順暢', [('[通訊]網速', 5, 1.0)]),
  ('很划算',
   [('[資費]低資費方案', 5, 0.16666666666666666),
    ('[資費]續約攜碼', 5, 0.16666666666666666),
    ('[加值]電信APP', 5, 0.16666666666666666),
    ('[其他]手機', 4, 0.16666666666666666),
    ('[資費]方案活動', 4, 0.16666666666666666),
    ('[加值]國際漫遊', 5, 0.16666666666666666)])]}

In [41]:
# Re-run the Sample 3 sentence before visualising. The recorded output of this
# cell is for that sentence, but in document order `out` was last bound to a
# different sentence, so a fresh Restart & Run All would silently visualise the
# wrong analysis. Binding `out` here makes the cell self-contained.
out = cadence.analyze("亞太網路超差，中華收訊就很好", strategy="multiple")

# Low threshold (0.005): judging by the recorded output, any character whose
# pn_prob reaches it is highlighted, so even weak signals show up.
visualize_tokens(out, 0.005)

亞太網路[31m超[0m[31m差[0m，中華[31m收[0m訊[31m就[0m[31m很[0m[31m好[0m


{'pn_prob': array([3.1391025e-04, 4.5604211e-05, 3.2047320e-03, 3.9974255e-03,
        9.9427634e-01, 9.7850168e-01, 3.3351569e-04, 2.8090037e-05,
        2.4490140e-05, 5.6001269e-03, 6.7131169e-04, 2.3341034e-01,
        2.5550574e-01, 1.6362540e-01], dtype=float32),
 'pn_idx': array([-1, -1, -1, -1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1],
       dtype=int64)}

In [42]:
# Raw Crystal output for the current `out`.
# NOTE(review): the saved output matches the Sample 3 sentence (亞太網路超差…)
# and the execution count is out of sequence; on a fresh run, verify that `out`
# actually holds that analysis at this point.
out.crystal

{'result': ('[通訊]網速', 1.0),
 'word_attr_map': {'超差': ('[通訊]網速', 1, 0.5), '很好': ('[通訊]涵蓋', 5, 0.3)},
 'CxG': [],
 'onto': [('超差', [('[通訊]網速', 1, 0.5), ('[通訊]涵蓋', 1, 0.5)]),
  ('很好',
   [('[其他]手機', 4, 0.3),
    ('[通訊]網速', 4, 0.3),
    ('[通訊]涵蓋', 5, 0.3),
    ('[通訊]涵蓋', 4, 0.1)])]}

In [118]:
# Raw Cadet output for the current `out`.
# NOTE(review): the saved output matches the Sample 3 sentence (亞太網路超差…)
# and the execution count is out of sequence; on a fresh run, verify that `out`
# actually holds that analysis at this point.
out.cadet

{'entity': ['中華電信', '亞太電信', '台灣大哥大', '遠傳電信', '台灣之星', '無框行動'],
 'entity_probs': array([0.48782193, 0.48782193, 0.00635621, 0.00620852, 0.00619568,
        0.00559573]),
 'service': [('通訊品質', '網速'),
  ('通訊品質', '涵蓋'),
  ('資費方案', '低資費方案'),
  ('加值服務', 'vowifi'),
  ('加值服務', '電信APP')],
 'service_probs': array([0.44906384, 0.44906384, 0.00546045, 0.00518785, 0.0051812 ]),
 'seeds': ['網速', '覆蓋率', '訊號', '0月租', '免月租'],
 'seed_probs': array([0.3714933 , 0.3714933 , 0.00598352, 0.00451722, 0.00451722]),
 'tokens': ['亞太', '網路', '超', '差,', '中華', '收訊', '就', '很好'],
 'tokens_attrib': {'亞太電信': [0], '網速': [1], '中華電信': [4], '覆蓋率': [5]}}

In [44]:
# Entities attached to the current `out`.
# NOTE(review): the saved output lists both 亞太電信 and 中華電信, i.e. the
# Sample 3 sentence; the execution count is out of sequence, so confirm `out`
# is that analysis on a fresh run.
out.entities

['亞太電信', '中華電信']

## Sample 4

In [103]:
# Sample 4, "simple" strategy: a sentence with no operator name and no explicit
# service term. The recorded result has entity None but still reports an aspect
# and a polarity.
out = cadence.analyze(
    "他為什麼怪怪的阿",
    strategy="simple",
)
out.aspects

[[None, '[通訊]網速', 'Negative', 'crystal', 'crystal']]

In [104]:
# Raw MTBert output for the current `out`.
# NOTE(review): the recorded seq_polarity here is 'Neutral' with no spans,
# whereas the aspect row above reports 'Negative' — worth confirming which
# component decides the final polarity.
out.mt_bert

{'text': '他為什麼怪怪的阿',
 'seq_polarity': 'Neutral',
 'seq_probs': array([9.9883884e-01, 1.3758482e-04, 1.0236134e-03], dtype=float32),
 'spans': [],
 'span_idxs': [],
 'span_pols': [],
 'token_probs': tensor([[9.9030e-01, 4.4528e-04, 9.2553e-03],
         [9.9771e-01, 9.3668e-06, 2.2854e-03],
         [9.9631e-01, 1.4438e-05, 3.6719e-03],
         [9.9797e-01, 1.2793e-05, 2.0210e-03],
         [9.8008e-01, 1.3504e-05, 1.9907e-02],
         [9.9730e-01, 1.4293e-05, 2.6883e-03],
         [9.9830e-01, 2.0094e-05, 1.6798e-03],
         [9.9836e-01, 4.0464e-05, 1.6040e-03]])}