In [1]:
import pandas as pd

In [14]:
import json
from itertools import islice

# Load a 10,000-line window of the tagged JSONL sample (0-based lines 20000..29999).
# The encoding is pinned to UTF-8 so decoding never depends on the platform locale;
# the records contain non-ASCII text.
with open("/data/medivh_data/workv2_sample.tagged.fixed.jsonl", encoding="utf-8") as f:
    # islice consumes and discards the first 20000 lines without JSON-parsing them.
    data = [json.loads(line) for line in islice(f, 20000, 30000)]

In [2]:
# Read the per-file metadata table (tab-separated).
df = pd.read_csv("/data/medivh_data/las.sample.part-1.meta.v2.tsv", sep="\t")
# Lookup table: value of the `file` column -> value of the `source` column.
# When a `file` value repeats, the last row wins.
file_source_map = dict(zip(df.file, df.source))

In [4]:
# Rebuilds the file -> source lookup from `df`. As long as `df` is unchanged this
# yields the same mapping that was built right after the TSV was read, so this
# cell is redundant.
file_source_map = dict(zip(df.file, df.source))

In [8]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code embedded in the file, so
# both artifacts must come from a trusted source. Paths are relative to the
# working directory.
# `model` is used later through `model.predict_proba`.
with open("model/lgbm.pkl", "rb") as f:
    model = pickle.load(f)

# `dtypes` is passed to `DataFrame.astype` on the feature frame before prediction.
with open("model/dtypes.pkl", "rb") as f:
    dtypes = pickle.load(f)

In [9]:
import pandas as pd
from latain.data import Block, BlockHandcraftRecord

In [10]:
import numpy as np

In [17]:
# Keep the first 2000 loaded records whose file maps to source "QCWY_A".
# A record whose `file` is missing from file_source_map raises KeyError.
qcwy_a_xuen_dicts = [
    record
    for record in data
    if file_source_map[record["file"]] == "QCWY_A"
][:2000]

In [21]:
def get_comm_matrix(xuen_dict):
    """Build a symmetric span-by-span score matrix for one document.

    The document dict is parsed into a ``Block``. ``BlockHandcraftRecord`` then
    yields one flat feature record per span pair, together with the ids of both
    spans. Pairs whose two spans share a line id are skipped. Every remaining pair
    is scored with column 1 of ``model.predict_proba`` and written into the matrix
    at ``[span_idx1, span_idx2]``.

    Args:
        xuen_dict: Document dict accepted by ``Block.from_dict``.

    Returns:
        ``numpy.ndarray`` of shape ``(num_spans, num_spans)``: the filled matrix
        plus its transpose. Cells of skipped or absent pairs stay 0. If no pair
        survives the same-line filter (for example a single-line document), the
        all-zero matrix is returned instead of failing inside pandas or the model.
    """
    block = Block.from_dict(xuen_dict)
    flat_records, ids = BlockHandcraftRecord.from_block(block).to_flat_records()

    records = []
    records_span_idx = []
    for record, ((line_id1, _span_id1, span_idx1), (line_id2, _span_id2, span_idx2)) in zip(flat_records, ids):
        # Only cross-line pairs are scored.
        if line_id1 == line_id2:
            continue
        records.append(record)
        records_span_idx.append((span_idx1, span_idx2))

    if records:
        features = pd.DataFrame(records)
        # The last column is not a model input (presumably the target label).
        X = features.iloc[:, :-1].astype(dtypes)
        probs = model.predict_proba(X)[:, 1]
    else:
        # Nothing to score: skip the DataFrame/model round trip, which cannot
        # handle an empty frame.
        probs = []

    # The matrix side is the sum of the lengths of the items the block yields.
    num_spans = sum(len(line) for line in block)
    comm_matrix = np.zeros((num_spans, num_spans))
    for prob, (idx1, idx2) in zip(probs, records_span_idx):
        comm_matrix[idx1, idx2] = prob

    # NOTE(review): symmetrising by addition assumes each unordered pair occurs
    # once in the flat records; if both orientations were present their scores
    # would be summed. TODO confirm against BlockHandcraftRecord.
    return comm_matrix + comm_matrix.T

In [23]:
from tqdm import tqdm

In [24]:
# Score every selected document once and store the matrix under its file name.
# This is the slow step: the recorded run took 11:42 for 2000 documents.
# NOTE: `xuen_dict` stays bound to the last document after the loop, and the
# inspection cells further down read it directly.
file_comm_matrix_map = {}
for xuen_dict in tqdm(qcwy_a_xuen_dicts):
    file = xuen_dict["file"]
    comm_matrix = get_comm_matrix(xuen_dict)
    # A repeated file name overwrites the matrix stored earlier.
    file_comm_matrix_map[file] = comm_matrix

100%|██████████| 2000/2000 [11:42<00:00,  2.85it/s]


In [26]:
from moka_tokenizer import moka_codec

In [31]:
def process_span(span: dict):
    """Tokenize one span and spread each text item's tag over its tokens.

    ``span["text"]`` holds two-element items whose first element is the sub-text;
    ``span["tags"]`` supplies one tag per item. Any tag other than ``"O"`` loses
    its first two characters (presumably a ``B-``/``I-`` style prefix). Each
    sub-text is encoded with ``moka_codec.encode`` and every resulting token
    receives the tag of the sub-text it came from.

    Returns:
        dict with three parallel lists: ``"tokens"`` (``chars`` of each token),
        ``"tags"`` and ``"token_ids"`` (second element of each index pair returned
        by the codec).
    """
    tokens, tags, token_ids = [], [], []

    for (subtext, _), raw_tag in zip(span["text"], span["tags"]):
        label = raw_tag if raw_tag == "O" else raw_tag[2:]
        pieces, index_pairs = moka_codec.encode(subtext)
        for piece, (_, token_id) in zip(pieces, index_pairs):
            tokens.append(piece.chars)
            token_ids.append(token_id)
            tags.append(label)

    return {"tokens": tokens, "tags": tags, "token_ids": token_ids}


def xuen_dict_to_ner_record(xuen_dict):
    """Return the processed spans of a document as one flat list.

    Walks ``xuen_dict["objs"]`` line by line and applies ``process_span`` to every
    span, keeping the original order (lines first, then spans within a line).
    """
    return [process_span(span) for line in xuen_dict["objs"] for span in line]


In [33]:
# Pair each document's tokenized spans with its precomputed score matrix.
comm_records = []
for xuen_dict in qcwy_a_xuen_dicts:
    file = xuen_dict["file"]
    spans = xuen_dict_to_ner_record(xuen_dict)
    # Lookup is by file name, so documents sharing a name share one matrix.
    comm_matrix = file_comm_matrix_map[file]
    # NOTE(review): assumes len(spans) equals the matrix side — TODO confirm.
    comm_records.append({
        "spans": spans,
        "comm_matrix": comm_matrix,
    })

In [35]:
import pickle

In [37]:
# Persist the assembled records. The path is relative to the working directory,
# and the "data" directory must already exist because open() does not create it.
with open("data/comm_records.pkl", "wb") as f:
    pickle.dump(comm_records, f)

In [34]:
# Peek at the first assembled record (spans plus matrix).
comm_records[0]

{'spans': [{'tokens': ['<', 'photo', '>'],
   'tags': ['O', 'O', 'O'],
   'token_ids': [1481, 12118, 1494]},
  {'tokens': ['最', '近', '工', '作'],
   'tags': ['O', 'O', 'O', 'O'],
   'token_ids': [18599, 22005, 17691, 16180]},
  {'tokens': ['最', '高', '学', '历', '/', '学', '位'],
   'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
   'token_ids': [18599, 22918, 17441, 16656, 303, 17441, 16167]},
  {'tokens': ['职', '位', ':'],
   'tags': ['O', 'O', 'O'],
   'token_ids': [20754, 16167, 1435]},
  {'tokens': ['主', '办', '会', '计'],
   'tags': ['O', 'O', 'O', 'O'],
   'token_ids': [16004, 16533, 16143, 21583]},
  {'tokens': ['专', '业', ':'],
   'tags': ['O', 'O', 'O'],
   'token_ids': [15976, 15983, 1435]},
  {'tokens': ['财', '务', '管', '理'],
   'tags': ['O', 'O', 'O', 'O'],
   'token_ids': [21766, 16536, 20382, 19760]},
  {'tokens': ['公', '司', ':'],
   'tags': ['O', 'O', 'O'],
   'token_ids': [16382, 16720, 1435]},
  {'tokens': ['美', '商', '希', '赫', '泵', '浦', '股', '份', '有', '限', '公', '司'],
   'tags': ['O',

In [32]:
# Relies on `xuen_dict` left bound by a preceding loop (its last document).
xuen_dict_to_ner_record(xuen_dict)

[{'tokens': ['最', '近', '工', '作'],
  'tags': ['O', 'O', 'O', 'O'],
  'token_ids': [18599, 22005, 17691, 16180]},
 {'tokens': ['最', '高', '学', '历', '/', '学', '位'],
  'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
  'token_ids': [18599, 22918, 17441, 16656, 303, 17441, 16167]},
 {'tokens': ['职', '位', ':'],
  'tags': ['O', 'O', 'O'],
  'token_ids': [20754, 16167, 1435]},
 {'tokens': ['售', '前', '/', '售', '后', '技', '术', '支', '持', '工', '程', '师'],
  'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
  'token_ids': [16894,
   16510,
   303,
   16894,
   16736,
   18150,
   18621,
   18405,
   18225,
   17691,
   20259,
   17715]},
 {'tokens': ['专', '业', ':'],
  'tags': ['O', 'O', 'O'],
  'token_ids': [15976, 15983, 1435]},
 {'tokens': ['英', '语'], 'tags': ['O', 'O'], 'token_ids': [21024, 21650]},
 {'tokens': ['公', '司', ':'],
  'tags': ['O', 'O', 'O'],
  'token_ids': [16382, 16720, 1435]},
 {'tokens': ['boe', '合', '肥', '京', '东', '方', '视', '讯'],
  'tags': ['O', 'O', 'O', 'O', 'O', 

In [25]:
# Raw spans of the second line of the leftover `xuen_dict`.
xuen_dict["objs"][1]

[{'pos': [41.75, 168.74999618530273, 68.75013732910156, 182.07000350952148],
  'fsize': 9.0,
  'font': 'NotoSansCJKsc-Regular',
  'color': [102, 102, 102],
  'bidx': 9,
  'text': [['职', '<KEYW>,<CJK>'], ['位', '<KEYW>,<CJK>'], [':', '<POS>']],
  'tags': ['O', 'O', 'O'],
  'cats': 'Work,Work'},
 {'pos': [105.5, 168.74999618530273, 208.02871704101562, 182.07000350952148],
  'fsize': 9.0,
  'font': 'NotoSansCJKsc-Regular',
  'color': [51, 51, 51],
  'bidx': 11,
  'text': [['售', '<CJK>'],
   ['前', '<CJK>'],
   ['/', '<POS>'],
   ['售', '<CJK>,<TIL>'],
   ['后', '<CJK>,<TIL>'],
   ['技', '<CJK>,<TIL>'],
   ['术', '<CJK>,<TIL>'],
   ['支', '<CJK>,<TIL>'],
   ['持', '<CJK>,<TIL>'],
   ['工', '<CJK>,<TIL>'],
   ['程', '<CJK>,<TIL>'],
   ['师', '<CJK>,<TIL>']],
  'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
  'cats': 'Work,Work'},
 {'pos': [305.0, 168.74999618530273, 332.0001220703125, 182.07000350952148],
  'fsize': 9.0,
  'font': 'NotoSansCJKsc-Regular',
  'color': [102, 102, 1

In [22]:
# Smoke-test the matrix builder on the first selected document.
get_comm_matrix(qcwy_a_xuen_dicts[0])

array([[0.        , 0.60239705, 0.51842509, ..., 0.02117822, 0.02758879,
        0.08201466],
       [0.60239705, 0.        , 0.        , ..., 0.02702595, 0.02352109,
        0.03841559],
       [0.51842509, 0.        , 0.        , ..., 0.0601093 , 0.01787125,
        0.04158031],
       ...,
       [0.02117822, 0.02702595, 0.0601093 , ..., 0.        , 0.91100606,
        0.73921165],
       [0.02758879, 0.02352109, 0.01787125, ..., 0.91100606, 0.        ,
        0.93044779],
       [0.08201466, 0.03841559, 0.04158031, ..., 0.73921165, 0.93044779,
        0.        ]])

In [33]:
# Flatten the block into a single list of spans.
# NOTE(review): `block` is not assigned at notebook scope in any visible cell, so
# this depends on leftover kernel state. It also rebinds `spans`, which the
# record-assembly loop uses for tokenized span dicts.
spans = [span for line in block for span in line]

In [52]:
# Inspect a single span from the flattened list.
spans[55]

Span(pos=Position(left=300.4773254394531, top=948.3636494318645, right=384.1754150390625, down=961.5829945246379), fsize=8.931989669799805, font='NotoSansCJKsc-Bold', color=Color(r=0, g=0, b=0), tokens=['财', '务', '助', '理', '/', '文', '员', ' (', '兼', '职', ')'], token_tags=['<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<POS>', '<CJK>', '<CJK>', '<POS>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<POS>'], text='财务助理/文员 (兼职)', tag='title')

In [37]:
# Inspect a single span from the flattened list.
spans[8]

Span(pos=Position(left=82.38790893554688, top=161.57968521118164, right=157.34567260742188, down=174.79903030395508), fsize=8.931989669799805, font='NotoSansCJKsc-Regular', color=Color(r=0, g=0, b=0), tokens=['物', '业', '管', '理', '/', '商', '业', '中', '心'], token_tags=['<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<POS>', '<CJK>,<DPT>', '<CJK>,<DPT>', '<CJK>,<DPT>', '<CJK>,<DPT>'], text='物业管理/商业中心', tag='')

In [45]:
import sys
# NOTE(review): hard-coded, user-specific absolute path, presumably needed so that
# `mrd` can be imported; this will not work on another machine as written.
sys.path.insert(0, "/home/wangyuxin/workspace/resume-meta-guesser/guesser")
from mrd import Block as Mblock
import rich

In [47]:
# Inspect a single span from the flattened list.
spans[20]

Span(pos=Position(left=300.4773254394531, top=544.1665992736816, right=354.0696105957031, down=557.3858985900879), fsize=8.931989669799805, font='NotoSansCJKsc-Bold', color=Color(r=0, g=0, b=0), tokens=['财', '务', '会', '计', '专', '员'], token_tags=['<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>', '<CJK>,<TIL>'], text='财务会计专员', tag='title')

In [46]:
# Pretty-print the leftover `xuen_dict` as parsed by the `mrd` Block class.
rich.print(Mblock.from_mrd_record(xuen_dict))