In [1]:
# Load IPython's autoreload extension and set it to mode 2, so imported
# modules are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import json
import random
import numpy as np
import pandas as pd
from itertools import chain
from tqdm.auto import tqdm
from icecream import ic
from import_casa import casa

In [3]:
# Root directory of the project data; every path below is built by joining
# onto this value with the `/` operator.
data_dir = casa.get_data_path()

In [4]:
# Annotated aspect tuples (snapshot of 2021-06-05).
# Forward slashes are used as separators: pathlib accepts them on every
# platform, whereas a backslash-separated string is only split on Windows.
aspects = pd.read_csv(
    data_dir / "annot_data/annotated_data_bkup/20210605/aspect_tuples_20210605.csv")

In [5]:
# Instantiate the DistilTag tagger once; it is reused for every evaluation
# text in the main loop below.
from DistilTag import DistilTag
tagger = DistilTag()

In [6]:
# Sanity check: an open-ended slice object (stop=None) runs to the end of the
# sequence, which is what ngrams() relies on when it shifts the text.
a = [*range(10)]
a[slice(0, None)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [22]:
from collections import Counter
def ngrams(text, ng_map, wsize=(2, 4)):
    """Accumulate n-gram counts of ``text`` into ``ng_map``.

    Parameters
    ----------
    text : sliceable sequence of str (normally a plain string)
        Every run of ``w`` consecutive items is joined into one n-gram.
    ng_map : dict
        Maps window width -> ``Counter`` of n-grams. It is updated in place,
        so repeated calls with the same dict accumulate counts.
    wsize : (int, int)
        Inclusive range ``(min_width, max_width)`` of window widths.

    Returns
    -------
    dict
        The same ``ng_map`` object that was passed in.
    """
    shortest, longest = wsize[0], wsize[1]
    for width in range(shortest, longest + 1):
        # The text shifted by 0..width-1 positions; zipping the shifts yields
        # one tuple per window and stops when the shortest shift runs out.
        shifted = [text[offset:] for offset in range(width)]
        bucket = ng_map.setdefault(width, Counter())
        bucket.update("".join(window) for window in zip(*shifted))
    return ng_map

In [23]:
# Small worked example: ngrams() returns the very dict it was given, so the
# result can be bound directly and then displayed as the cell output.
ng_map = ngrams("刷牙刷牙齒", {})
ng_map

{2: Counter({'刷牙': 2, '牙刷': 1, '牙齒': 1}),
 3: Counter({'刷牙刷': 1, '牙刷牙': 1, '刷牙齒': 1}),
 4: Counter({'刷牙刷牙': 1, '牙刷牙齒': 1})}

## Reweight ngram frequency
The reweighted score of an n-gram $n$ is defined recursively as
$$
score(n) = \begin{cases}
0, & f(n) = 1 \\
\max\left(0, f(n) - \sum_{m\in H(n)} score(m)\right), & \textrm{otherwise}
\end{cases}
$$
where $f(n)$ is the raw frequency of the n-gram $n$, and $H(n)$ denotes the set of all other n-grams that contain $n$ as a substring.

In [9]:
def compute_score(ng, ngfreq, scores, r=0):
    """Return the reweighted score of n-gram ``ng``, caching it in ``scores``.

    The score is 0 when the raw frequency is 1; otherwise it is the raw
    frequency minus the scores of every other n-gram in ``ngfreq`` that
    contains ``ng`` as a substring, clipped at 0.

    Parameters
    ----------
    ng : str
        The n-gram to score. Must be a key of ``ngfreq`` unless its score is
        already cached in ``scores``.
    ngfreq : dict[str, int]
        Raw frequency of every n-gram under consideration.
    scores : dict[str, int]
        Cache of computed scores; updated in place. Entries already present
        are trusted and returned as-is, so pass a dict that was filled only by
        this function (or an empty one).
    r : int
        Recursion depth, kept only as a debugging aid.

    Returns
    -------
    int
        The reweighted score (>= 0).

    Raises
    ------
    KeyError
        If ``ng`` is neither cached in ``scores`` nor present in ``ngfreq``.
    """
    # Memoisation: a containing n-gram is always strictly longer, so the
    # recursion never revisits the n-gram currently being computed and a
    # cached entry is always final. Without this lookup every superstring is
    # re-scored once per path that reaches it, which is exponential on
    # nested n-grams.
    if ng in scores:
        return scores[ng]

    f_ng = ngfreq[ng]
    if f_ng == 1:
        scores[ng] = 0
    else:
        # H(ng): all other n-grams that contain ng as a substring.
        H = [x for x in ngfreq
             if (ng in x) and (ng != x)]
        scores[ng] = f_ng
        for m in H:
            scores[ng] -= compute_score(m, ngfreq, scores, r+1)
    # Occurrences claimed by longer n-grams can exceed f_ng; clip at zero.
    scores[ng] = max(scores[ng], 0)
    return scores[ng]

def compute_ngram_scores(ng_dict):
    """Reweight all n-grams of one attribute and keep the positive scores.

    Parameters
    ----------
    ng_dict : dict[int, Counter]
        Maps window width -> Counter of n-grams, as built by ``ngrams``.

    Returns
    -------
    dict[str, int]
        n-gram -> reweighted score, restricted to scores greater than 0.
    """
    # Flatten the per-width counters into one {ngram: frequency} table,
    # shortest widths first. chain.from_iterable replaces the former
    # `chain((*(...)))`, a parenthesised starred expression that is a
    # SyntaxError on Python 3.9+.
    ngfreq = dict(chain.from_iterable(
        ng_dict[w].items() for w in sorted(ng_dict.keys())))

    scores = {}
    # Visit from the end of the table (longest n-grams) backwards, the same
    # order the original pop()-based loop used, so `scores` is filled in the
    # same insertion order.
    for ng in reversed(list(ngfreq)):
        if ng not in scores:
            scores[ng] = compute_score(ng, ngfreq, scores)
    return {k: v for k, v in scores.items() if v > 0}

## Main loop

In [12]:
# Only aspect tuples that are not flagged as context are processed.
asp_subdfr = aspects.loc[aspects.is_context==False, :]
A_evals = {}    # attribute -> list of (evaluation text, rating)
A_words = {}    # attribute -> Counter over the items produced by the tagger
A_ngrams = {}   # attribute -> {window width: Counter of character n-grams}
for A, V, r in tqdm(zip(asp_subdfr.attr_norm,
                     asp_subdfr.evaltext, asp_subdfr.rating), total=asp_subdfr.shape[0]):
    # Skip rows whose attribute or evaluation text is a float
    # (presumably NaN from missing CSV cells -- TODO confirm).
    if isinstance(A, float) or isinstance(V, float): continue
    V = V.strip()
    try:
        # Flatten the tagger output by one level. The star-unpacking consumes
        # the generator right here, so tagging errors surface inside the try.
        words = chain(*(tuple(w) for w in tagger.tag(V)))
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        print("tagging error: ", end='')
        print(V)
        continue
    A_words.setdefault(A, Counter()).update(words)
    A_evals.setdefault(A, []).append((V, r))
    ng_dict = A_ngrams.setdefault(A, {})
    ngrams(V, ng_dict)


[INFO] 2021-06-09 23:45:13,930 numexpr.utils: NumExpr defaulting to 8 threads.


HBox(children=(FloatProgress(value=0.0, max=2355.0), HTML(value='')))

tagging error: ？
tagging error: ！？
tagging error: ？？？
tagging error: ？？！？！



In [None]:
# debug use: score one hard-coded n-gram for the third attribute in A_ngrams.
# Run this only while A_ngrams still holds the raw {width: Counter} dicts,
# i.e. before the cell that replaces them with reweighted scores.
from itertools import islice
scores = {}
attr, ng_dict = list(islice(A_ngrams.items(), 2, 3))[0]
print(attr)
wsize = sorted(ng_dict.keys())
# chain.from_iterable replaces `chain((*(...)))`, which is a SyntaxError on
# Python 3.9+.
ngfreq = dict(chain.from_iterable(ng_dict[w].items() for w in wsize))
print(compute_score("麼優", ngfreq, scores))
del attr, ng_dict, scores, ngfreq

In [13]:
# Replace each attribute's raw {width: Counter} table with its reweighted
# {ngram: score} dict. NOTE: this rebinds A_ngrams, so the cell must be run
# exactly once after the main loop.
A_ngrams = dict(
    (attr_key, compute_ngram_scores(width_counts))
    for attr_key, width_counts in tqdm(A_ngrams.items())
)

HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))




## Export data

In [24]:
# Assemble the export record of every attribute:
#   evals  - the (text, rating) pairs collected in the main loop
#   words  - (item, count) pairs, most frequent first
#   ngrams - (ngram, score) pairs, highest score first, longer n-grams
#            first among equal scores
# A descending stable sort keeps tied entries in their original order, just
# like sorting ascending on the negated key.
data = {}
for A in A_evals:
    data[A] = dict(
        evals=A_evals[A],
        words=sorted(A_words[A].items(),
                     key=lambda pair: pair[1], reverse=True),
        ngrams=sorted(A_ngrams[A].items(),
                      key=lambda pair: (pair[1], len(pair[0])), reverse=True),
    )


In [63]:
# Turn every attribute into a "document": each n-gram is repeated as many
# times as its score, so the scores become term frequencies for TF-IDF.
# Entries are read by index, so longer tuples are accepted as well.
ng_list = {
    attr_key: [entry[0] for entry in record["ngrams"] for _ in range(entry[1])]
    for attr_key, record in data.items()
}
# Row order of the TF-IDF matrix follows this list.
attr_list = [*ng_list]

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
def _identity_analyzer(tokens):
    """Return the document unchanged: it is already a list of n-grams."""
    return tokens

# A callable analyzer is applied to each document in place of the built-in
# tokenisation, so every list element is used as one term.
tfidf = TfidfVectorizer(analyzer=_identity_analyzer)
mat = tfidf.fit_transform(ng_list.values())

In [65]:
# Terms ordered by their column index in `mat`. Built from the fitted
# `vocabulary_` mapping (term -> column) instead of get_feature_names(),
# which was removed in scikit-learn 1.2.
ng_tokens = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
ng_tokens.index("超慢")

475

In [68]:
# Attach the TF-IDF weight to each n-gram and re-rank by it.
# `vocabulary_` maps term -> column index of `mat`; using it avoids both the
# removed get_feature_names() API (scikit-learn >= 1.2) and a linear
# list.index() search per n-gram.
ng_column = tfidf.vocabulary_
ng_tokens = sorted(ng_column, key=ng_column.get)
for attr_i, attr in enumerate(attr_list):
    attr_item = data[attr]
    new_ngs = []
    # `*_` swallows a TF-IDF value added by a previous run of this cell, so
    # re-running recomputes the triples instead of failing to unpack them.
    for ng, freq, *_ in attr_item["ngrams"]:
        score = float(mat[attr_i, ng_column[ng]])
        new_ngs.append((ng, freq, score))
    # Highest TF-IDF first; the sort is stable, so ties keep their order.
    new_ngs.sort(key=lambda x: -x[2])
    attr_item["ngrams"] = new_ngs
    data[attr] = attr_item

In [73]:
# Write the ontology next to the annotated snapshot it was built from.
# Forward slashes are used as separators: pathlib accepts them on every
# platform, whereas a backslash-separated string is only split on Windows.
out_path = data_dir / "annot_data/annotated_data_bkup/20210605/eval_ontology_raw.json"
with open(out_path, "w", encoding="UTF-8") as fout:
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    json.dump(data, fout, indent=2, ensure_ascii=False)