In [1]:
PATH_TO_SAMPLE_FILE = "/home/chendian/CDConfusor/exp/data/cn/Wang271k/dcn_train.tsv"
lines = [line for line in open(PATH_TO_SAMPLE_FILE, 'r')]
len(lines)

272099

In [8]:
from tqdm import tqdm
from collections import Counter

char_level_pairs = []

for line in tqdm(lines):
    err, cor = line.rstrip().split('\t')[:2]
    err = err.replace(' ', '')
    cor = cor.replace(' ', '')
    if err == cor:
        continue
    else:
        faulty_position = []
        for i, (_e, _c) in enumerate(zip(err, cor)):
            if _e != _c:
                char_level_pairs.append((_e, _c))


ct = Counter(char_level_pairs).most_common()

100%|██████████| 281381/281381 [00:04<00:00, 60302.75it/s]


## char-level confusion

In [12]:
import jieba
import random
from tqdm import tqdm
from copy import deepcopy

# char-level confusion from SpellGCN
cfs_path = '../data/spellGraphs.txt'
char_cfs = {}
for line in open(cfs_path, 'r'):
    l, r, t = line.strip().split('|')
    if t in ['同音同调', '同音异调', '近音异调', '近音同调']:
        t = '近音'
    char_cfs.setdefault(t, {})
    char_cfs[t].setdefault(l, [])
    char_cfs[t][l].append(r)
backup_cfs = deepcopy(char_cfs)
print("char-cfs Loaded", list(char_cfs.keys()))


def char_confusor(char):
    # always take different token
    take = char
    candidates = char_cfs['近音'].get(char, [char])
    if candidates:
        take = random.choice(candidates)
        if take != char:
            char_cfs['近音'][char].remove(take)
    else:
        if backup_cfs['近音'][char]:
            char_cfs['近音'][char] = [_c for _c in backup_cfs['近音'][char]]
    return take


def augment_single_sample(err, cor, confusor):
    faulty_position = []
    for i, (_e, _c) in enumerate(zip(err, cor)):
        if _e != _c:
            faulty_position.append((i, _e, _c))
    for i, e, c in faulty_position:
        assert cor[i] == c
        cor = f"{cor[:i]}{confusor(c)}{cor[i+1:]}"
    return cor

dir_path = '../exp/data/cn/'
# SIGHAN
# src_path = dir_path + 'sighan15/sighan15_train.tsv'
# tgt_path = dir_path + 'sighan15/sighan15_train.augc.tsv'

# Wang271K + SIGHAN
src_path = dir_path + 'Wang271k/dcn_train.tsv'
tgt_path = dir_path + 'Wang271k/dcn_train.augc.tsv'
with open(tgt_path, 'w') as f:
    for line in tqdm(open(src_path, 'r')):
        err, cor = line.strip().split('\t')
        aug_err = augment_single_sample(err, cor, confusor=char_confusor)
        if len(err) == len(cor):
            f.write(f"{aug_err}\t{cor}\n")


char-cfs Loaded ['形近', '近音', '同部首同笔画']


272099it [00:03, 72901.51it/s]


## word-level confusion

In [7]:
import sys
sys.path.append('../')

import jieba
import random
from tqdm import tqdm
from confusor import Confusor

mapping = {}
used_conf = {}

conf = Confusor(
    cand_pinyin_num=10, 
    cos_threshold=(0., .75), 
    method='all-similar single-freedom', 
    token_sample_mode='sort', 
    pinyin_sample_mode='sort',  # special
    weight=[1., 0, .2],   # pinyin score, word freq score, IME ranking
    conf_size=300, ime_weight=1,
    debug=False)
conf.conf_with_scores = True


def word_confusor(word, random_select=True):
    if word not in used_conf or len(used_conf.get(word, [])) == 0:
        used_conf[word] = [] 
        res = conf(word)
        min_score = min([x[1] for x in res])
        for w, score in res:
            used_conf[word].extend([w] * int((score - min_score) // 0.01 + 1))
        mapping[word] = [w for w in used_conf[word]]
    """
    elif len(used_conf[word]) == 0:  # all out
        if word in mapping:
            used_conf[word] = mapping[word]
    """
    if word in used_conf[word]:
        used_conf[word].remove(word)
    if random_select:
        ret = random.choice(used_conf[word])
        used_conf[word].remove(ret)
    else:
        ret = used_conf[word][0]
        used_conf[word] = used_conf[word][1:]
    return ret


def augment_single_sample(err, cor, confusor):
    faulty_position = []
    for i, (_e, _c) in enumerate(zip(err, cor)):
        if _e != _c:
            faulty_position.append((i, _e, _c))
    es, cs, streak = "", "", []
    for i, e, c in faulty_position:
        assert cor[i] == c
        if len(streak) == 0:
            es, cs, streak = e, c, [i]
        elif i == streak[-1]+1:
            es += e
            cs += c
            streak.append(i)
        elif i != streak[-1]+1 and len(cs) > 0:
            cor = f"{cor[:streak[0]]}{confusor(cs)}{cor[streak[-1]+1:]}"
            es, cs, streak = e, c, [i]
    else:
        if len(cs) > 0:
            cor = f"{cor[:streak[0]]}{confusor(cs)}{cor[streak[-1]+1:]}"
            es, cs, streak = "", "", []
    return cor


def augment_single_sample_jieba(err, cor, confusor):
    faulty_position = []
    words = jieba.lcut(err)
    pivot = 0
    for i, w in enumerate(words):
        _e = err[pivot: pivot+len(w)]
        _c = cor[pivot: pivot+len(w)]
        if _e != _c:
            if len(_e) >= 2:
                for offset, (__e, __c) in enumerate(zip(_e, _c)):
                    if __e != __c:
                        faulty_position.append((pivot+offset, pivot+offset+1, __e, __c))
            else:
                faulty_position.append((pivot, pivot+len(w), _e, _c))
        pivot += len(w)
    for i, j, _, c in faulty_position:
        cor = f"{cor[:i]}{confusor(c)}{cor[j:]}"
    return cor


same = 0
faulty = 0
src_path = '../exp/data/cn/Wang271k/dcn_train.tsv'
tgt_path = '../exp/data/cn/Wang271k_augw/dcn_train.augw6.tsv'


with open(tgt_path, 'w') as f:
    for line in tqdm(open(src_path, 'r')):
        err, cor = line.strip().split('\t')
        # aug_err = augment_single_sample(err, cor, confusor=word_confusor)
        aug_err = augment_single_sample_jieba(err, cor, confusor=word_confusor)
        if len(aug_err) == len(cor):
            f.write(f"{aug_err}\t{cor}\n")
            if aug_err == cor:
                same += 1
            else:
                faulty += 1
        else:
            print(len(aug_err), aug_err)
            print(len(cor), cor)

same, faulty

Use all-similar single-freedom method.
Pinyin sampling mode: sort.
Token sampling mode: sort.
Now loading pinyin2token corpus.
Loading pinyin2token_noname.pkl (1049.38MB) cost 56.175 seconds.
Loading similar_pinyins.pkl (1589.0MB) cost 81.734 seconds.
Now loading REDscore:
Loading ziREDscore.pkl (1.97MB) cost 0.049 seconds.
Now generating score matrix.
Now Loading word freuency data:
Loading wc_word_frequency_score_01.pkl (59.35MB) cost 1.506 seconds.
Loading wc_word2_frequency_score.pkl (2084.0MB) cost 66.313 seconds.


272099it [03:02, 1494.19it/s]


(158432, 113667)

In [3]:
same, faulty

(158432, 113667)