In [1]:
import corpus
import os

In [2]:
import CaboCha

In [3]:
# Parse one sample sentence with CaboCha and print the result in lattice
# format. The resulting `tree` is reused by the following cell.
c = CaboCha.Parser()
txt = '暑くて眠いので今日は休みます。'
# Alternative, longer sample sentence (disabled):
#txt = 'そうしたらそのワンちゃんがなんかか喜んじゃって、で、あたしの方に走ってきて、とびついてきちゃってさ。'
tree = c.parse(txt)
print(tree.toString(CaboCha.FORMAT_LATTICE))

* 0 1D 0/1 1.997244
暑く	形容詞,自立,*,*,形容詞・アウオ段,連用テ接続,暑い,アツク,アツク
て	助詞,接続助詞,*,*,*,*,て,テ,テ
* 1 3D 0/1 -1.801677
眠い	形容詞,自立,*,*,形容詞・アウオ段,基本形,眠い,ネムイ,ネムイ
ので	助詞,接続助詞,*,*,*,*,ので,ノデ,ノデ
* 2 3D 0/1 -1.801677
今日	名詞,副詞可能,*,*,*,*,今日,キョウ,キョー
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
* 3 -1D 0/1 0.000000
休み	動詞,自立,*,*,五段・マ行,連用形,休む,ヤスミ,ヤスミ
ます	助動詞,*,*,*,特殊・マス,基本形,ます,マス,マス
。	記号,句点,*,*,*,*,。,。,。
EOS



In [4]:
# Walk every token of the parsed sentence. A token whose `chunk` attribute is
# set contributes one output row: running chunk number, the chunk's link,
# head_pos and func_pos, and the token's surface string.
chunk_index = 0  # deliberately not named `id`, which would shadow the builtin
for i in range(tree.size()):
    token = tree.token(i)
    chunk = token.chunk
    if chunk is not None:
        print(chunk_index, chunk.link, chunk.head_pos, chunk.func_pos, token.surface)
        chunk_index += 1

0 1 0 1 暑く
1 3 0 1 眠い
2 3 0 1 今日
3 -1 0 1 休み


In [5]:
def bunsetsu_list(tree):
    """Collect one record per chunk (bunsetsu) of a parsed tree.

    A record is produced for every token whose ``chunk`` attribute is not
    ``None``; records are numbered consecutively from 0 in token order.

    Args:
        tree: Parsed sentence exposing ``size()`` and ``token(i)``. Each
            token exposes ``surface``, ``feature`` (a comma-separated
            string) and ``chunk`` (an object with a ``link`` attribute, or
            ``None``).

    Returns:
        list[dict]: One dict per chunk with the keys ``'lemma'``,
        ``'link'`` and ``'index'``.
    """
    # Position of the base form inside the comma-separated feature string,
    # as seen in the 9-field lattice output of this notebook.
    # NOTE(review): assumes the IPA-dictionary field layout - confirm if
    # another dictionary is used.
    lemma_field = 6

    result = []
    for i in range(tree.size()):
        token = tree.token(i)
        chunk = token.chunk
        if chunk is None:
            continue
        features = token.feature.split(',')
        # Indexing from the end (features[-3]) only hits the base form when
        # all 9 fields are present; shorter feature strings would yield the
        # wrong field or raise IndexError.
        lemma = features[lemma_field] if len(features) > lemma_field else '*'
        if lemma == '*':
            # No base form available: fall back to the surface string so the
            # record never carries the '*' placeholder as its lemma.
            lemma = token.surface
        result.append({'lemma': lemma, 'link': chunk.link, 'index': len(result)})
    return result

def search_candidate(known_word_index, blist):
    """Return the sorted indices of chunks directly linked with one chunk.

    An entry of ``blist`` qualifies when its ``'link'`` equals
    ``known_word_index``, or when its ``'index'`` equals the ``'link'`` of
    the entry at position ``known_word_index``.

    Args:
        known_word_index: Position in ``blist`` of the reference chunk.
        blist: List of dicts carrying at least ``'link'`` and ``'index'``.

    Returns:
        list: Matching ``'index'`` values, deduplicated and ascending.
    """
    neighbours = {
        entry['index']
        for entry in blist
        if entry['link'] == known_word_index
        or entry['index'] == blist[known_word_index]['link']
    }
    return sorted(neighbours)

In [6]:
# Build a corpus.Corpus object from the single file moddata/nucc/data001.txt.
cp = corpus.Corpus(os.path.join('moddata','nucc','data001.txt'))

In [7]:
# Build the seed dictionary: every non-empty line of seeds/positive.txt gets
# the value 1.0 and every non-empty line of seeds/negative.txt gets -1.0.
dic = dict()
for seed_file, seed_value in (('positive.txt', 1.), ('negative.txt', -1.)):
    # Explicit encoding so the Japanese seed words decode the same way on
    # every platform (assumes the seed files are UTF-8 - TODO confirm).
    with open(os.path.join('seeds', seed_file), encoding='utf-8') as f:
        for w in f:
            sw = w.strip()
            if not sw:
                # A blank line would otherwise register '' as a seed word.
                continue
            dic[sw] = corpus.WordDicElement(sw)
            dic[sw].set_value(seed_value)


In [8]:
# Display the seed dictionary (seed word -> corpus.WordDicElement).
dic

{'喜ぶ': <corpus.WordDicElement at 0x112c0f7c0>,
 '嬉しい': <corpus.WordDicElement at 0x112c0f580>,
 '楽しい': <corpus.WordDicElement at 0x112c0f8b0>,
 '美味しい': <corpus.WordDicElement at 0x112c0ff40>,
 'きれい': <corpus.WordDicElement at 0x112c0faf0>,
 '美しい': <corpus.WordDicElement at 0x112c0f940>,
 '楽しむ': <corpus.WordDicElement at 0x112c0f850>,
 '悲しい': <corpus.WordDicElement at 0x113231c10>,
 'さみしい': <corpus.WordDicElement at 0x112c0f4c0>,
 '辛い': <corpus.WordDicElement at 0x112c0f4f0>,
 '寂しい': <corpus.WordDicElement at 0x112c0fa90>,
 'きたない': <corpus.WordDicElement at 0x112c0fdc0>,
 '痛い': <corpus.WordDicElement at 0x112c0f190>,
 'まずい': <corpus.WordDicElement at 0x112c0fee0>,
 '残念': <corpus.WordDicElement at 0x1128acc70>}

In [9]:
# Look up a single seed entry by its word.
dic['喜ぶ']

<corpus.WordDicElement at 0x112c0f7c0>

In [10]:
def calc_score(cp, argdic, verbose=True, threshold=0.7):
    """Propagate scores from known words to the chunks linked with them.

    For every chunk head whose lemma is already in the dictionary, its
    ``accesscount`` is incremented. If that entry is visited and the
    absolute value of its ``value`` exceeds ``threshold``, every candidate
    returned by ``line.search_candidate`` receives a score computed by
    ``corpus.extract``; candidates not yet in the dictionary are added as
    new ``corpus.WordDicElement`` objects.

    Args:
        cp: Object whose ``conversation`` attribute is iterable; each item
            exposes ``bunsetsu_index_list``, ``sentence_list`` and
            ``search_candidate(index)``.
        argdic: Mapping lemma -> word element. Only the mapping itself is
            copied (shallow); the elements it holds are updated in place.
        verbose: When true (default), print the running line number.
        threshold: Minimum absolute value a visited entry needs in order to
            propagate a score (default 0.7).

    Returns:
        dict: Shallow copy of ``argdic`` extended with newly found lemmas.
    """
    dic = argdic.copy()
    for linecount, line in enumerate(cp.conversation, start=1):
        if verbose:
            print(f'  {linecount}')
        for b in line.bunsetsu_index_list:
            # Explicit lookup instead of a try/except around the whole body:
            # only "lemma not in the dictionary" is skipped, while a KeyError
            # raised by any helper below is no longer silently swallowed.
            head_entry = dic.get(line.sentence_list[b].lemma)
            if head_entry is None:
                continue
            head_entry.accesscount += 1
            if not (head_entry.isvisited and abs(head_entry.value) > threshold):
                continue
            for candidate_index in line.search_candidate(b):
                lemma = line.sentence_list[candidate_index].lemma
                candidate_entry = dic.get(lemma)
                if candidate_entry is None:
                    # Unknown word: score it first, then register it, so a
                    # failure while scoring leaves the dictionary untouched.
                    candidate_entry = corpus.WordDicElement(lemma)
                    candidate_entry.add_score(corpus.extract(head_entry, candidate_entry))
                    dic[lemma] = candidate_entry
                else:
                    candidate_entry.add_score(corpus.extract(head_entry, candidate_entry))
    return dic

def calc_value(argdic, alpha=0.5):
    """Turn the scores collected on each entry into an updated value.

    Entries without any score are left untouched. For the others the mean
    score is computed; a visited entry receives
    ``mean * alpha + value * (1 - alpha)``, an unvisited one receives the
    mean itself. The entry's scores are reset afterwards.

    Args:
        argdic: Mapping lemma -> word element. The mapping is shallow-copied;
            the elements themselves are modified in place.
        alpha: Weight of the mean score when blending with the old value.

    Returns:
        dict: Shallow copy of ``argdic`` holding the updated elements.
    """
    dic = argdic.copy()
    for entry in dic.values():
        scores = entry.score
        if len(scores) < 1:
            continue
        mean_score = sum(scores)/len(scores)
        if entry.isvisited:
            entry.set_value(mean_score * alpha + entry.value * (1-alpha))
        else:
            entry.set_value(mean_score)
        entry.reset_score()

    return dic

In [11]:
# Old version of calc_score / calc_value, kept only as a string for reference.
'''
def calc_score(cp, argdic):
    c = CaboCha.Parser()
    dic = argdic.copy()
    for line in cp.conversation:
        tree = c.parse(line['content'])
        blist = bunsetsu_list(tree)
        for bindex, b in enumerate(blist):
            try:
                dic[b['lemma']].accesscount += 1
                if(dic[b['lemma']].isvisited and abs(dic[b['lemma']].value) > 0.7):
                    candidates = search_candidate(bindex, blist)
                    for candidate_index in candidates:
                        candidate = blist[candidate_index]
                        try:
                            dic[candidate['lemma']].add_score(corpus.extract(dic[b['lemma']], dic[candidate['lemma']]))
                        except KeyError:
                            newword = corpus.WordDicElement(candidate.lemma)
                            newword.add_score(corpus.extract(dic[b['lemma']], newword))
                            dic[candidate['lemma']] = newword
            except KeyError:
                continue
    return dic

def calc_value(argdic, alpha=0.5):
    dic = argdic.copy()
    for key in dic.keys():
        elem = dic[key]
        if(len(elem.score) < 1):
            continue
        elif(elem.isvisited):
            dic[key].set_value(sum(elem.score)/len(elem.score) * alpha + elem.value * (1-alpha))
        else:
            dic[key].set_value(sum(elem.score)/len(elem.score))
        dic[key].reset_score()
            
    return dic
        
'''

"\ndef calc_score(cp, argdic):\n    c = CaboCha.Parser()\n    dic = argdic.copy()\n    for line in cp.conversation:\n        tree = c.parse(line['content'])\n        blist = bunsetsu_list(tree)\n        for bindex, b in enumerate(blist):\n            try:\n                dic[b['lemma']].accesscount += 1\n                if(dic[b['lemma']].isvisited and abs(dic[b['lemma']].value) > 0.7):\n                    candidates = search_candidate(bindex, blist)\n                    for candidate_index in candidates:\n                        candidate = blist[candidate_index]\n                        try:\n                            dic[candidate['lemma']].add_score(corpus.extract(dic[b['lemma']], dic[candidate['lemma']]))\n                        except KeyError:\n                            newword = corpus.WordDicElement(candidate.lemma)\n                            newword.add_score(corpus.extract(dic[b['lemma']], newword))\n                            dic[candidate['lemma']] = newword\n   

In [12]:
# Accumulate scores over every corpus file in moddata/nucc, then convert the
# collected scores into values once, after the last file.
nucc_dir = os.path.join('moddata','nucc')
# os.listdir also returns sub-directories and hidden entries (for example
# .DS_Store on macOS); only regular, non-hidden files are handed to Corpus.
data_files = sorted(
    name for name in os.listdir(nucc_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(nucc_dir, name))
)
filecount = 0  # stays 0 when the directory holds no usable file
for filecount, file in enumerate(data_files, start=1):
    print(filecount)
    cp = corpus.Corpus(os.path.join(nucc_dir, file))
    dic = calc_score(cp, dic)
dic = calc_value(dic)

    

1


TypeError: can only concatenate str (not "int") to str

In [None]:
# Entries whose value lies strictly between -0.6 and 0.6, or whose
# accesscount exceeds 1000, are reset to 0 and deactivated; the string form
# of every remaining entry is written to outputclassed.txt.
output = []
for entry in dic.values():
    if(-0.6 < entry.value < 0.6 or entry.accesscount > 1000):
        entry.set_value(0.)
        entry.deactivate()
        continue
    output.append(str(entry))
# Lines are ordered, descending, by the integer found inside the last "(...)"
# of each string. NOTE(review): that format comes from
# corpus.WordDicElement.__str__, which is not visible here - confirm.
output_sorted = sorted(
    output,
    key=lambda x: int(x.split('(')[-1].replace(')','')),
    reverse=True,
)
# Explicit encoding so the Japanese words are written identically on every
# platform instead of depending on the locale's default encoding.
with open('outputclassed.txt', mode='w', encoding='utf-8') as f:
    f.write('\n'.join(output_sorted))