In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the imported project modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from import_casa import casa, cano

In [3]:
# Name of the sub-folder under annot_data/annotated_data_bkup to process.
TARGET_DIR = "20210605"
result_dir = casa.get_data_path() / f"annot_data/annotated_data_bkup/{TARGET_DIR}"
# Path.iterdir() yields entries in arbitrary order; sort the JSON files so that
# positional access (e.g. json_paths[1]) and the order in which files are
# processed are reproducible across runs and machines.
json_paths = sorted(x for x in result_dir.iterdir() if x.suffix == ".json")

In [4]:
# Parse one annotation file (the second entry of json_paths) to inspect its structure.
annots = json.loads(json_paths[1].read_text(encoding="UTF-8"))

In [5]:
# Top-level keys of the first annotation record.
list(annots[0])

['completions', 'data', 'id']

In [6]:
# Number of annotation records in the file loaded above.
len(annots)

527

In [7]:
# Start with an empty list of aspects (the following cell resets it again before filling it).
aspect_list = []

In [8]:
# Collect the aspects of every annotation in every JSON file into one flat list.
aspect_list = []
for json_path in tqdm(json_paths):
    with json_path.open("r", encoding="UTF-8") as fin:
        annots = json.load(fin)
    # The position of an annotation within its file is not used, so iterate
    # over the records directly instead of through enumerate().
    for annot_x in annots:
        aspects = cano.process_thread_annotations(annot_x)
        aspect_list.extend(aspects)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 14.06it/s]


In [9]:
# Total number of aspects collected across all annotation files.
len(aspect_list)

3118

In [10]:
# Flatten each aspect into one record (tuple); the records become the rows of
# the DataFrame built in the next cell.
data_items = []
for aspect_x in aspect_list:
    aspect_tuple = aspect_x.make_tuple()
    # Drop aspects whose first three tuple fields are all blank (the fields
    # that the next cell labels ent_norm, attr_norm and evaltext). The check
    # runs first so no raw text is extracted for records that are discarded.
    if not any(field.strip() for field in aspect_tuple[0:3]):
        continue
    data_items.append((
        aspect_x.batch_idx,
        aspect_x.serial,
        aspect_x.thread_idx,
        aspect_x.has_context_only,
        *aspect_tuple,
        aspect_x.raw_text(cano.AspectEnum.Entity),
        aspect_x.raw_text(cano.AspectEnum.Attribute),
    ))
    

In [11]:
# Column labels for the record fields collected in data_items.
aspect_columns = [
    "batch_idx", "serial", "thread_idx", "is_context",
    "ent_norm", "attr_norm", "evaltext", "rating",
    "ent_rawtext", "attr_rawtext",
]
aspect_df = pd.DataFrame(data_items, columns=aspect_columns)

# Write the table as a CSV file into result_dir, without the row index.
csv_path = result_dir / f"aspect_tuples_{TARGET_DIR}.csv"
aspect_df.to_csv(csv_path, encoding="UTF-8", index=False)

In [12]:
# Display the resulting table of aspect tuples.
aspect_df

Unnamed: 0,batch_idx,serial,thread_idx,is_context,ent_norm,attr_norm,evaltext,rating,ent_rawtext,attr_rawtext
0,0,43,3260,True,臺灣之星,[通訊]頻段,台星的態度就是在等宿主台哥，逸以待勞，準備寄生。,1,台星的態度就是在等宿主台哥，逸以待勞，準備寄生。,台星的態度就是在等宿主台哥，逸以待勞，準備寄生。
1,0,7,281,False,中華電信,[通訊]國內電信漫遊,很可以,4,中華,訊號
2,0,13,932,False,臺灣之星,[通訊]網速,不限速卡上傳可,3,台星,21M
3,0,1,114,True,中華電信,,中華、遠傳，蓋一座拆一座,1,中華、遠傳，蓋一座拆一座,中華、遠傳，蓋一座拆一座
4,0,10,674,False,臺灣大哥大,[通訊]國內電信漫遊,越來越爛,1,台灣大哥大,收訊
...,...,...,...,...,...,...,...,...,...,...
3106,8,5,109,False,臺灣大哥大,[資費]低資費方案,禮券算下來不會輸488,4,台哥,499方案
3107,8,5,109,False,遠傳電信,[資費]月租費,,-1,遠傳,588
3108,8,3,45,False,中華電信,[資費]續約攜碼,沒有珍惜已經使用的老客戶權益,1,中華電信,續約
3109,8,8,194,False,中華電信,[資費]方案活動,夠,3,中華,469限速吃到飽


## Sandbox

In [13]:
def find_thread_idx(annots, thread_idx):
    """Return the first annotation whose ``data.thread_idx`` equals ``thread_idx``.

    Args:
        annots: Iterable of annotation dicts, each holding a ``"data"`` dict
            with a ``"thread_idx"`` entry.
        thread_idx: Value to look up.

    Returns:
        The first matching annotation, in iteration order.

    Raises:
        IndexError: If no annotation matches.
    """
    # Stop at the first hit instead of materializing every match in a list.
    for annot in annots:
        if annot["data"]["thread_idx"] == thread_idx:
            return annot
    # IndexError is what indexing an empty result list would raise; keep that
    # type but say which thread was missing.
    raise IndexError(f"no annotation found with thread_idx={thread_idx!r}")

In [14]:
# Gather the annotation records of every JSON file into one flat list.
annots = []
for json_path in tqdm(json_paths):
    obj = json.loads(json_path.read_text(encoding="UTF-8"))
    annots += obj


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 24.34it/s]


In [15]:
# Select the annotation whose data.thread_idx is 220 (hard-coded thread picked for inspection).
annot_x = find_thread_idx(annots, 220)

In [16]:
# Re-run the aspect extraction on the selected annotation. With the second
# positional argument set to True the result is unpacked into two values here
# (the aspects plus a `dbg` object) — presumably a debug flag; confirm against
# cano.process_thread_annotations.
aspects, dbg = cano.process_thread_annotations(annot_x, True)