## lrdb.csv to sparse matrix

In [1]:
# Path of the L-R database file that is passed to load() below.
lrdb_fname = '../data/lrdb.csv'

def load(fname):
    """Read the L-R database and return its ``l``, ``r`` and ``tag`` columns.

    The file is expected to start with one header line, followed by
    tab-separated rows laid out as ``<eojeol, l, r, tag, l_stemmed>``.
    Every character of ``r`` whose code point is outside 44032..55203
    (the complete Hangul syllables, U+AC00..U+D7A3) is removed, and rows
    whose ``r`` becomes empty after that are discarded.

    Blank or truncated rows (fewer than four fields) are skipped rather than
    raising ``IndexError``; an empty file, or one with no usable rows,
    yields three empty tuples.

    Args:
        fname: Path of the UTF-8 encoded input file.

    Returns:
        Three tuples ``(l, r, tag)`` of equal length, one entry per kept row.
    """
    kor_begin = 44032
    kor_end = 55203

    def remain_complete_korean(r):
        # Keep only complete Hangul syllable characters.
        return ''.join(ch for ch in r if kor_begin <= ord(ch) <= kor_end)

    rows = []
    with open(fname, encoding='utf-8') as f:
        # Skip the header line; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            # <eojeol, l, r, tag, l_stemmed>
            fields = line.strip().split('\t')
            if len(fields) < 4:
                continue
            # Clean once and reuse the result for both filtering and output.
            r_clean = remain_complete_korean(fields[2])
            if r_clean:
                rows.append((fields[1], r_clean, fields[3]))

    if not rows:
        # zip(*[]) yields nothing, so unpacking it into three names would fail.
        return (), (), ()
    l, r, tag = zip(*rows)
    return l, r, tag

# Load the aligned l / r / tag columns and display how many rows each holds.
l, r, t = load(lrdb_fname)
tuple(len(column) for column in (l, r, t))

(5696420, 5696420, 5696420)

In [2]:
from collections import Counter

# Occurrence counts of every (l, tag) pair and of every r string.
ltag_count = Counter(zip(l, t))
r_count = Counter(r)

# Report how many distinct L and R entries survive each pair of minimum
# frequency thresholds (booleans sum to the number of surviving entries).
print('min freq (0,0)  : #L= %d, #R = %d' % (len(ltag_count), len(r_count)))
print('min freq (5,5)  : #L= %d, #R = %d' % (
    sum(count >= 5 for count in ltag_count.values()),
    sum(count >= 5 for count in r_count.values())))
print('min freq (10,10): #L= %d, #R = %d' % (
    sum(count >= 10 for count in ltag_count.values()),
    sum(count >= 10 for count in r_count.values())))
print('min freq (30,15): #L= %d, #R = %d' % (
    sum(count >= 30 for count in ltag_count.values()),
    sum(count >= 15 for count in r_count.values())))

min freq (0,0)  : #L= 201336, #R = 20252
min freq (5,5)  : #L= 50766, #R = 4995
min freq (10,10): #L= 31802, #R = 3297
min freq (30,15): #L= 15166, #R = 2617


In [3]:
def create_train_data(head, directory, lset, rset, triples=None):
    """Build and save a sparse (l, tag) x r co-occurrence count matrix.

    Each row of the matrix is one ``(l, tag)`` pair found in ``lset``, each
    column is one ``r`` string found in ``rset``, and each value is how many
    times that pair appeared together with that ``r``.  Pairs that never
    co-occur with any ``r`` in ``rset`` get no row.

    Four files are written into ``directory`` (created if missing):
    ``<head>_x.mtx`` (the matrix in Matrix Market format), ``<head>_x_word``
    (the ``l`` of each row), ``<head>_y`` (the ``tag`` of each row) and
    ``<head>_vocabs`` (the ``r`` of each column, in column order).

    Args:
        head: Prefix of the output file names.
        directory: Output directory.
        lset: Container of allowed ``(l, tag)`` keys (membership test only).
        rset: Collection of unique allowed ``r`` strings; its iteration
            order defines the column order.
        triples: Optional iterable of ``(l, r, tag)`` tuples.  When omitted,
            the module-level ``l``, ``r`` and ``t`` sequences are zipped,
            which is what the original notebook code always did.

    Returns:
        ``(x, x_word, ylabel, vocabs)``: the CSR matrix, the row ``l``
        strings, the row tags, and the ``r`` -> column index mapping.
    """
    import os
    from scipy.io import mmwrite
    from scipy.sparse import csr_matrix

    if triples is None:
        triples = zip(l, r, t)

    # (l, tag) -> {r: count}; insertion order fixes the row order.
    counts = {}
    for li, ri, ti in triples:
        key = (li, ti)
        if key not in lset or ri not in rset:
            continue
        features = counts.setdefault(key, {})
        features[ri] = features.get(ri, 0) + 1

    vocabs = {ri: j for j, ri in enumerate(rset)}

    x_word = []
    ylabel = []
    rows = []
    cols = []
    data = []
    # Every stored r already passed the rset filter, so it is always in vocabs
    # and every row has at least one entry.
    for i, ((li, ti), rdict) in enumerate(counts.items()):
        x_word.append(li)
        ylabel.append(ti)
        for ri, v in rdict.items():
            rows.append(i)
            cols.append(vocabs[ri])
            data.append(v)

    os.makedirs(directory, exist_ok=True)

    # Pass the shape explicitly: otherwise it is inferred from the largest
    # index present, and the matrix ends up narrower than the vocabulary
    # whenever the last r strings never co-occur with a kept (l, tag) pair.
    x = csr_matrix((data, (rows, cols)), shape=(len(x_word), len(vocabs)))

    mm_fname = os.path.join(directory, '%s_x.mtx' % head)
    mmwrite(mm_fname, x)

    x_word_fname = os.path.join(directory, '%s_x_word' % head)
    with open(x_word_fname, 'w', encoding='utf-8') as f:
        f.writelines('%s\n' % word for word in x_word)

    y_fname = os.path.join(directory, '%s_y' % head)
    with open(y_fname, 'w', encoding='utf-8') as f:
        f.writelines('%s\n' % y for y in ylabel)

    vocabs_fname = os.path.join(directory, '%s_vocabs' % head)
    with open(vocabs_fname, 'w', encoding='utf-8') as f:
        f.writelines('%s\n' % ri for ri in sorted(vocabs, key=vocabs.get))

    return x, x_word, ylabel, vocabs

In [4]:
# Thresholds (5, 5): keep (l, tag) pairs and r strings seen at least 5 times,
# then build and save the matrix under the 'l5_r5' prefix.
lset = {l_tag: count for l_tag, count in ltag_count.items() if count >= 5}
rset = {r_part: count for r_part, count in r_count.items() if count >= 5}
x, x_word, ylabel, vocabs = create_train_data(
    head='l5_r5', directory='../data/', lset=lset, rset=rset)

x.shape, len(ylabel), len(vocabs), len(x_word)

((50764, 4995), 50764, 4995, 50764)

In [5]:
# Thresholds (10, 10): keep (l, tag) pairs and r strings seen at least 10
# times, then build and save the matrix under the 'l10_r10' prefix.
lset = {l_tag: count for l_tag, count in ltag_count.items() if count >= 10}
rset = {r_part: count for r_part, count in r_count.items() if count >= 10}
x, x_word, ylabel, vocabs = create_train_data(
    head='l10_r10', directory='../data/', lset=lset, rset=rset)

x.shape, len(ylabel), len(vocabs), len(x_word)

((31797, 3297), 31797, 3297, 31797)

In [6]:
# Thresholds (30, 15): keep (l, tag) pairs seen at least 30 times and r
# strings seen at least 15 times, then save under the 'l30_r15' prefix.
lset = {l_tag: count for l_tag, count in ltag_count.items() if count >= 30}
rset = {r_part: count for r_part, count in r_count.items() if count >= 15}
x, x_word, ylabel, vocabs = create_train_data(
    head='l30_r15', directory='../data/', lset=lset, rset=rset)

x.shape, len(ylabel), len(vocabs), len(x_word)

((15166, 2617), 15166, 2617, 15166)