In [1]:
%load_ext autoreload
%autoreload 2

In [60]:
import json
import pickle
import random
from itertools import chain, islice

import numpy as np
import pandas as pd
from icecream import ic
from tqdm.auto import tqdm

from import_casa import casa

In [4]:
# Root directory of the project's data files, as reported by the project-local
# `casa` helper. NOTE(review): presumably a pathlib.Path, since later cells
# join onto it with `/` — confirm.
data_dir = casa.get_data_path()

In [6]:
# Load the annotated aspect tuples from the 20210605 backup folder.
# The path is built from separate components instead of one backslash-separated
# string, so it resolves on POSIX systems as well as on Windows.
# NOTE(review): assumes `data_dir` is a pathlib.Path — confirm.
aspects = pd.read_csv(
    data_dir / "annot_data" / "annotated_data_bkup" / "20210605"
    / "aspect_tuples_20210605.csv"
)

In [31]:
# Third-party tagger; its `tag()` method is applied to every evaluation text in
# the main loop. NOTE(review): presumably a word segmenter / POS tagger (the
# exported "words" are (token, tag) pairs) — confirm against the DistilTag docs.
from DistilTag import DistilTag
tagger = DistilTag()

In [34]:
# Scratch check: indexing with slice(c, None) is the same as a[c:]; ngrams()
# uses this form to build its shifted copies of the text.
a = list(range(10))
a[slice(0, None)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [65]:
from collections import Counter
def ngrams(text, ng_map, wsize=(2, 4)):
    """Count the n-grams of ``text`` and add them to ``ng_map``.

    Args:
        text: Sliceable sequence of strings (normally a ``str``) to scan.
        ng_map: Dict mapping an n-gram width to a ``Counter`` of the n-grams
            of that width. It is updated in place; a ``Counter`` is created
            for every width in range, even when ``text`` is too short to
            contribute any n-gram of that width.
        wsize: ``(min_width, max_width)``, both inclusive.

    Returns:
        The same ``ng_map`` object, after the update.
    """
    shortest, longest = wsize[0], wsize[1]
    for width in range(shortest, longest + 1):
        # One copy of the text per offset 0..width-1. Zipping them lines up
        # every run of `width` consecutive items and stops at the shortest copy.
        shifted = [text[offset:] for offset in range(width)]
        counter = ng_map.setdefault(width, Counter())
        counter.update(map("".join, zip(*shifted)))
    return ng_map

In [66]:
# Smoke test: count the 2- to 4-grams of a short sample string and display them.
ng_map = ngrams("刷牙刷牙齒", {})
ng_map

{2: Counter({'刷牙': 2, '牙刷': 1, '牙齒': 1}),
 3: Counter({'刷牙刷': 1, '牙刷牙': 1, '刷牙齒': 1}),
 4: Counter({'刷牙刷牙': 1, '牙刷牙齒': 1})}

## Reweight ngram frequency
The reweighted score of an n-gram $n$ is defined recursively as
$$
score(n) = \begin{cases}
0, & f(n) = 1 \\
\max\left(0, f(n) - \sum_{m\in H(n)} score(m)\right), & \textrm{otherwise}
\end{cases}
$$
where $f(n)$ is the raw frequency of the n-gram $n$, and $H(n)$ denotes the set of all other n-grams that contain $n$ as a substring.

In [158]:
def compute_score(ng, ngfreq, scores, r=0):
    """Return the reweighted score of n-gram ``ng``, memoised in ``scores``.

    The score is 0 when the raw frequency of ``ng`` is 1. Otherwise it is the
    raw frequency minus the scores of every other n-gram in ``ngfreq`` that
    contains ``ng`` as a substring, clamped at 0.

    Args:
        ng: The n-gram to score.
        ngfreq: Dict mapping every n-gram (all widths) to its raw frequency.
        scores: Dict used as a cache of finished scores. It is updated in
            place with the score of ``ng`` and of every n-gram visited by the
            recursion. Entries already present are returned as-is, so it must
            only hold scores computed against the same ``ngfreq``.
        r: Recursion depth (0 for the outermost call). It is only passed
            along, as a hook for debugging output.

    Returns:
        The non-negative score of ``ng``.

    Raises:
        KeyError: If ``ng`` is neither cached in ``scores`` nor a key of
            ``ngfreq``.
    """
    # Reuse finished results instead of re-scoring the same superstring once
    # for every shorter n-gram it contains.
    if ng in scores:
        return scores[ng]

    f_ng = ngfreq[ng]
    if f_ng == 1:
        scores[ng] = 0
        return 0

    remaining = f_ng
    for m in ngfreq:
        # Only strict superstrings count; they are always longer than `ng`,
        # so the recursion cannot loop back to `ng`.
        if ng in m and ng != m:
            remaining -= compute_score(m, ngfreq, scores, r + 1)
    scores[ng] = max(remaining, 0)
    return scores[ng]

def compute_ngram_scores(ng_dict):
    """Score every n-gram in ``ng_dict`` and keep those with a positive score.

    Args:
        ng_dict: Dict mapping an n-gram width to a ``Counter`` (or dict) of
            raw n-gram frequencies, as built by ``ngrams``.

    Returns:
        Dict mapping each n-gram whose reweighted score is above 0 to that
        score.
    """
    # Flatten the per-width tables into one {ngram: frequency} dict, widths in
    # ascending order. chain.from_iterable replaces `chain((*...))`, a
    # parenthesised starred expression that current Python parsers reject
    # with a SyntaxError.
    ngfreq = dict(
        chain.from_iterable(ng_dict[w].items() for w in sorted(ng_dict))
    )
    scores = {}
    # Walk the n-grams from last to first (the same order as popping them off
    # the end of a list), so the widest n-grams are scored first.
    for ng in reversed(list(ngfreq)):
        # compute_score also stores the scores of the superstrings it visits,
        # so some n-grams are already done by the time they come up here.
        if ng not in scores:
            scores[ng] = compute_score(ng, ngfreq, scores)
    return {k: v for k, v in scores.items() if v > 0}

In [159]:
# Debug helper: score one hand-picked n-gram for a single attribute.
# Requires `A_ngrams` as built by the main-loop cell (attribute -> {width:
# Counter}); it does not work once `A_ngrams` has been replaced by the scored
# n-grams.
scores = {}
# Pick the third attribute.
attr, ng_dict = list(islice(A_ngrams.items(), 2, 3))[0]
print(attr)
# Same flattening as compute_ngram_scores: widths in ascending order.
ngfreq = dict(
    chain.from_iterable(ng_dict[w].items() for w in sorted(ng_dict))
)
print(compute_score("麼優", ngfreq, scores))
# Keep the scratch names out of the notebook namespace.
del attr, ng_dict, scores, ngfreq

[資費]方案活動
2


## Main loop

In [160]:
# Collect, per normalised attribute: the raw evaluation texts, counts of the
# tagger's flattened output items, and n-gram counts of the evaluation texts.
asp_subdfr = aspects.loc[aspects.is_context==False, :]
A_evals = {}
A_words = {}
A_ngrams = {}
for A, V in tqdm(zip(asp_subdfr.attr_norm, asp_subdfr.evaltext), total=asp_subdfr.shape[0]):
    # Skip rows where either field is a float.
    # NOTE(review): presumably NaN from empty CSV cells — confirm.
    if isinstance(A, float) or isinstance(V, float): continue
    V = V.strip()
    try:
        # chain(*...) unpacks its argument eagerly, so tagger.tag() runs here,
        # inside the try.
        words = chain(*(tuple(w) for w in tagger.tag(V)))
    except Exception:
        # Best effort: report the text and move on. Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit stop the loop.
        print("tagging error: ", end='')
        print(V)
        continue
    A_words.setdefault(A, Counter()).update(words)
    A_evals.setdefault(A, []).append(V)
    ng_dict = A_ngrams.setdefault(A, {})
    ngrams(V, ng_dict)


HBox(children=(FloatProgress(value=0.0, max=2355.0), HTML(value='')))

tagging error: ？
tagging error: ！？
tagging error: ？？？
tagging error: ？？！？！



In [161]:
# Replace each attribute's raw per-width n-gram counts with its reweighted
# n-gram scores. This rebinds `A_ngrams`, so re-run the main-loop cell before
# re-running this one.
A_ngrams = dict(
    (attr, compute_ngram_scores(width_counts))
    for attr, width_counts in tqdm(A_ngrams.items())
)

HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))




## Export data

In [162]:
# Assemble the export record of every attribute: its evaluation texts, its
# word counts (most frequent first) and its scored n-grams (highest score
# first; on equal score the longer n-gram first).
data = {}
for A in A_evals:
    data[A] = dict(
        evals=A_evals[A],
        # Counter.most_common() sorts by descending count and keeps
        # first-seen order among ties.
        words=A_words[A].most_common(),
        ngrams=sorted(
            A_ngrams[A].items(),
            key=lambda kv: (kv[1], len(kv[0])),
            reverse=True,
        ),
    )


In [163]:
# Spot-check the assembled record of one attribute.
data["[通訊]國內電信漫遊"]

{'evals': ['很可以',
  '越來越爛',
  '滿格',
  '用過速度不快穩定也不高',
  '優惠比較少',
  '死活就是收 不到',
  '收不到',
  '用一用突然很頓',
  '還是比較好。',
  '超便宜'],
 'words': [(('不', 'D'), 4),
  (('很', 'Dfa'), 2),
  (('用', 'VC'), 2),
  (('比較', 'Dfa'), 2),
  (('收', 'VC'), 2),
  (('到', 'VC'), 2),
  (('可以', 'D'), 1),
  (('越來越', 'Dfa'), 1),
  (('爛', 'VH'), 1),
  (('滿格', 'Nb'), 1),
  (('過', 'Di'), 1),
  (('速度', 'Na'), 1),
  (('快', 'VH'), 1),
  (('穩定', 'VHC'), 1),
  (('也', 'D'), 1),
  (('高', 'VH'), 1),
  (('優惠', 'VJ'), 1),
  (('少', 'VH'), 1),
  (('死活', 'Na'), 1),
  (('就', 'D'), 1),
  (('是', 'SHI'), 1),
  (('一', 'Neu'), 1),
  (('用', 'Na'), 1),
  (('突然', 'D'), 1),
  (('頓', 'VH'), 1),
  (('還是', 'D'), 1),
  (('好', 'VH'), 1),
  (('。', 'PERIODCATEGORY'), 1),
  (('超便宜', 'VH'), 1)],
 'ngrams': [('不到', 2), ('比較', 2)]}

In [165]:
# Write the assembled records as indented UTF-8 JSON, keeping non-ASCII text
# unescaped. The path is built from separate components instead of one
# backslash-separated string, so it resolves on POSIX systems as well as on
# Windows. NOTE(review): assumes `data_dir` is a pathlib.Path — confirm.
out_path = (
    data_dir / "annot_data" / "annotated_data_bkup" / "20210605"
    / "eval_ontology_raw.json"
)
with open(out_path, "w", encoding="UTF-8") as fout:
    json.dump(data, fout, indent=2, ensure_ascii=False)