In [1]:
# Dataset identifier and data directory; both are handed to load_data below.
head = 'l10_r10'
directory = '../data/'

from py.utils import load_data
# NOTE(review): TrainedNounExtractor is not used in any cell visible here.
from py.noun import TrainedNounExtractor

# Load the feature matrix X, the labels y, the x_words list and the vocabs list.
# Presumably rows of X align with x_words and columns with vocabs (the recorded
# output reports 31546 "L words" and 3515 features) -- confirm in py.utils.
X, y, x_words, vocabs = load_data(head, directory)

x shape = (31546, 3515)
y shape = (31546,)
# features = 3515
# L words = 31546


## dev

In [2]:
from collections import namedtuple

# Defined at module level (not inside the function) so that Info instances can
# be pickled: pickle stores a class by reference and has to find it by name in
# the module that defines it.
Info = namedtuple('Info', 'coef count')


def lr_to_coefficient(X, y, vocabs):
    """Score every feature by how strongly it leans positive vs. negative.

    For each column of ``X`` the frequency inside positive rows (``p``) and
    inside negative rows (``n``) is summed. Negative counts are rescaled by
    ``neg_factor = n_pos / n_neg`` so that both classes carry the same total
    mass, and the score is::

        (p - n * neg_factor) / (p + n * neg_factor)

    which lies in [-1, 1] for non-negative counts: +1 means the feature only
    occurs in positive rows, -1 only in negative rows.

    Parameters
    ----------
    X : matrix-like, shape (n_rows, n_features)
        Must support ``X.sum(axis=...)`` returning a 2-D matrix and row
        selection by a list of indices (e.g. ``scipy.sparse.csr_matrix`` or
        ``numpy.matrix``).
    y : iterable
        One label per row of ``X``. Rows labelled ``1`` are positive, rows
        labelled ``-1`` are negative; any other label is ignored.
    vocabs : iterable
        One key per column of ``X``; used as the keys of the result.

    Returns
    -------
    dict
        ``{vocab: Info(coef, count)}`` where ``count`` is the column total
        over *all* rows of ``X``.

    Notes
    -----
    * Prints the share of positive / negative mass as a side effect.
    * A feature with no positive and no (weighted) negative occurrences has no
      evidence either way and gets ``coef == 0.0`` instead of raising
      ``ZeroDivisionError``.
    * If there is no negative mass at all, ``neg_factor`` is taken as ``0.0``;
      with non-negative counts every negative frequency is zero in that case,
      so the factor has no influence on the scores.
    """
    # Materialise the labels once so one-shot iterables are supported too.
    labels = list(y)
    pos_rows = [i for i, label in enumerate(labels) if label == 1]
    neg_rows = [i for i, label in enumerate(labels) if label == -1]

    # X.sum(axis=1) is an (n_rows, 1) matrix -> flatten it to one total per row.
    row_totals = [row[0] for row in X.sum(axis=1).tolist()]
    n_pos = sum(row_totals[i] for i in pos_rows)
    n_neg = sum(row_totals[i] for i in neg_rows)

    total = n_pos + n_neg
    pos_ratio = n_pos / total if total else 0.0
    neg_ratio = n_neg / total if total else 0.0
    print('(pos= %.3f, neg= %.3f)' % (pos_ratio, neg_ratio))

    vocab_frequency = X.sum(axis=0).tolist()[0]
    n_features = len(vocab_frequency)

    def column_totals(rows):
        # Avoid indexing the matrix with an empty list: an empty class simply
        # contributes zero occurrences to every feature.
        if not rows:
            return [0] * n_features
        return X[rows].sum(axis=0).tolist()[0]

    feature_pos_frequency = column_totals(pos_rows)
    feature_neg_frequency = column_totals(neg_rows)

    neg_factor = n_pos / n_neg if n_neg else 0.0

    def score(p, n):
        weighted_n = n * neg_factor
        denominator = p + weighted_n
        if not denominator:
            return 0.0
        return (p - weighted_n) / denominator

    return {
        vocab: Info(score(p, n), count)
        for vocab, p, n, count in zip(
            vocabs, feature_pos_frequency, feature_neg_frequency, vocab_frequency
        )
    }

## test

In [5]:
# NOTE(review): this import rebinds lr_to_coefficient, so every cell executed
# after it calls py.rcoef's version rather than the one defined in the "dev"
# cell earlier in this notebook.
from py.rcoef import lr_to_coefficient
from py.utils import save_lr_coefficient
from py.utils import load_lr_coefficient

In [3]:
# Compute the per-feature coefficients from the loaded data; the call prints
# the positive / negative share shown in the output below.
# NOTE(review): this cell's execution count (3) is lower than that of the
# import cell placed before it (5), so the notebook was not run top to bottom.
coefficient = lr_to_coefficient(X, y, vocabs)

(pos= 0.724, neg= 0.276)


In [6]:
# Persist the coefficients to ./rcoef.pkl (path is relative to the working directory).
save_lr_coefficient(coefficient, './rcoef.pkl')

In [18]:
# Reload the saved coefficients with as_coef_dict=False; each value is indexed
# as value[0] below (the recorded output shows Info(coef, count) tuples).
coefficient = load_lr_coefficient('./rcoef.pkl', as_coef_dict=False)
# Keep entries whose first value component has magnitude below 0.9, order them
# ascending by the whole value tuple, and display the first 20.
sorted(
    ((word, info) for word, info in coefficient.items() if abs(info[0]) < 0.9),
    key=lambda entry: entry[1],
)[:20]

[('낸', Info(coef=-0.8972826917901976, count=337)),
 ('내야', Info(coef=-0.8969513146935133, count=56)),
 ('선', Info(coef=-0.8953956843389466, count=647)),
 ('수', Info(coef=-0.8912857765304226, count=99)),
 ('말', Info(coef=-0.8894625978993094, count=15)),
 ('나가서', Info(coef=-0.8894625978993094, count=15)),
 ('내려고', Info(coef=-0.8894625978993094, count=15)),
 ('올리는', Info(coef=-0.8894625978993094, count=15)),
 ('잘', Info(coef=-0.8894625978993094, count=30)),
 ('려니', Info(coef=-0.8860528883070529, count=124)),
 ('놓고', Info(coef=-0.8860528883070529, count=992)),
 ('지두', Info(coef=-0.8859092426445433, count=51)),
 ('오면', Info(coef=-0.8844231003033562, count=36)),
 ('서만큼은', Info(coef=-0.8808001494662847, count=14)),
 ('오', Info(coef=-0.8808001494662847, count=14)),
 ('냐가', Info(coef=-0.8808001494662847, count=14)),
 ('서나마', Info(coef=-0.8808001494662847, count=14)),
 ('내고', Info(coef=-0.8808001494662846, count=189)),
 ('보며', Info(coef=-0.8776028640269462, count=41)),
 ('걸', Info(coef=-0.874229

In [20]:
# Reload the saved coefficients with as_coef_dict=True; the recorded output
# shows a plain {word: float} mapping.
# NOTE(review): `coefficient` is rebound here to a mapping with a different
# value type than in the earlier cells.
coefficient = load_lr_coefficient('./rcoef.pkl', as_coef_dict=True)
# NOTE(review): a bare expression displays the entire dict; consider showing
# only a slice to keep the notebook output small.
coefficient

{'온': -0.9374121744058179,
 '마따나': 1.0,
 '어져서': -1.0,
 '였으면': -0.25970007860588096,
 '은년과': -1.0,
 '해두는': 1.0,
 '어줘야': -1.0,
 '하겠다던': 1.0,
 '기도': -0.9984061595819308,
 '어가기': -1.0,
 '놓는': -0.9799228844627041,
 '왔기': -1.0,
 '해보아야': 1.0,
 '었는가': -1.0,
 '했음이': 1.0,
 '이므로': 1.0,
 '다만': -0.9828634089560663,
 '준다는': -0.9916773448188826,
 '될지도': 1.0,
 '스럽기까지': 1.0,
 '기두': -0.9688056517481524,
 '어주기': -1.0,
 '여서는': -0.8678554927480204,
 '로': 0.9991348189084643,
 '이고': 0.9981023746664459,
 '았던가': -1.0,
 '일까': 1.0,
 '지은': 0.7387194126189855,
 '들어간': -1.0,
 '내게': -0.6159468009016545,
 '게야': -1.0,
 '해왔기': 1.0,
 '워할': -1.0,
 '있다고': -0.2483744474135929,
 '해보라고': 1.0,
 '는요': 1.0,
 '께로': 1.0,
 '부턴가': 1.0,
 '이라야': 1.0,
 '에나': 1.0,
 '수도': -1.0,
 '섞인': 1.0,
 '어졌기': -1.0,
 '었건': -1.0,
 '층쌓기한': -1.0,
 '댈': -0.9409541242573974,
 '에서보다도': 1.0,
 '으로선': 1.0,
 '주었지만': -1.0,
 '함은': 1.0,
 '아두는': -1.0,
 '되었는데': 1.0,
 '수없이': -1.0,
 '로나마': 1.0,
 '살': -0.595521298932173,
 '라거나': -0.09654916793905362,
 '는가를': -1.0,
 '