In [None]:
import os, sys, email
import numpy as np
import pandas as pd
from nltk import tokenize
import pprint, re, time

# Let pandas print up to 1000 characters per cell so long text columns are not truncated.
pd.set_option('display.max_colwidth', 1000)

In [None]:
from loadFile import getFileList, getDirList, fileToDataFrame

# Read the mail files found in the directory.
# NOTE(review): the path is relative, so the notebook presumably has to be run
# from the project root — confirm.
directory_path = "wiki-research-l/2020-July"
file_list = getFileList(directory_path)
# Sort in place so the files are always processed in the same order.
file_list.sort()

# Store the text files in DataFrames.
# Later cells read the 'Body', 'Message-ID' and 'From' columns of mail_df;
# thread_df is not used in the cells visible here.
mail_df, thread_df = fileToDataFrame(file_list)

In [None]:
# Split every mail body into its tagged line categories.
# Each relevant line starts with a three-character tag such as "[G]" or "[S]";
# the tag letter selects the key under which the rest of the line is stored.
tag_to_key = {
    'G': 'Greetings',
    'S': 'Sentence',
    'C': 'Caption',
    'B': 'Bulletlist',
    'E': 'Ending',
    'Q': 'Quotation',
    'F': 'Footer',
    'M': 'Misc',
}
tag_pattern = re.compile(r'\[([GSCBEQFM])\]')

bodies = dict(Bodies=[])
for idx, body in mail_df.loc[:,'Body'].items():
    origin = []
    # One list of {line number: text} entries per category, in a fixed key order.
    tagged = {key: [] for key in tag_to_key.values()}

    lines = body.splitlines()
    for num, line in enumerate(lines):
        tag = tag_pattern.match(line)
        if tag is None:
            # Untagged lines (e.g. blank lines) are skipped.
            continue
        text = line[3:]  # drop the three-character tag
        tagged[tag_to_key[tag.group(1)]].append({num:text})
        origin.append(text)

    # 'body' is the tag-free text of all tagged lines joined by single spaces.
    originbody = ' '.join(origin)
    bodies['Bodies'].append({'idx':idx, 'countrows':len(lines), 'body':originbody, **tagged})

In [None]:
# Build one multi-sentence text per mail from the 'Sentence' entries of bodies.
# Every entry is a single-item {line number: text} dict; the texts of one mail
# are concatenated in order, each followed by a single space.
s_list = [
    ''.join(list(entry.values())[0] + ' ' for entry in mail['Sentence'])
    for mail in bodies['Bodies']
]

In [None]:
# Build the sentence-level DataFrame: one row per (Message-ID, sentence).
whitespace_run = re.compile(r'\s{2,}')
sentence_list = []
# Pre-processing applied to each mail's whole text.
for i, content in enumerate(s_list):
    # s_list was filled in mail_df row order, so `i` is a row POSITION, not an
    # index label. Use .iloc so the right Message-ID is picked even when
    # mail_df does not carry a default 0..n-1 index.
    message_id = mail_df['Message-ID'].iloc[i]
    # Pre-processing applied to each individual sentence.
    for sentence in tokenize.sent_tokenize(content):
        # Collapse runs of two or more whitespace characters into one space.
        sentence_list.append([message_id, whitespace_run.sub(' ', sentence)])
sentence_df = pd.DataFrame(sentence_list,columns=['Message-ID','sentence'])

In [None]:
# Write the sentence table to disk first, then end the cell with head() so the
# preview is actually displayed (only a cell's last expression is rendered).
sentence_df.to_csv('wiki-research-l/output/sentence_df.csv')
sentence_df.head()

In [None]:

# Run every sentence through the OpenIE 5 server.
from pyopenie import OpenIE5
extractor = OpenIE5('http://localhost:8000')

extractions_list = []
failed_extractions = []  # (sentence_df index label, error text) for every failed call
for i,sentence in sentence_df['sentence'].items():
    try:
        extractions = extractor.extract(sentence)
    except Exception as exc:
        # Best effort: a failing sentence must not abort the whole run. It must
        # still occupy a slot, though, because the next cell pairs position k of
        # extractions_list with row k of sentence_df; silently skipping it would
        # shift every later extraction onto the wrong Message-ID.
        failed_extractions.append((i, repr(exc)))
        extractions = []
    extractions_list.append(extractions)

if failed_extractions:
    print('OpenIE extraction failed for {} of {} sentences; first failure: {}'.format(
        len(failed_extractions), len(extractions_list), failed_extractions[0]))

In [143]:

# Flatten the OpenIE output into one row per extracted triple.
rows = []
for sent_pos, sentence_extractions in enumerate(extractions_list):
    # Sentences with no extractions simply contribute no rows.
    for item in sentence_extractions:
        triple = item['extraction']
        rows.append([
            sentence_df['Message-ID'][sent_pos],
            item['sentence'],
            triple['arg1']['text'],
            triple['rel']['text'],
            # All second arguments are merged into one space-separated string.
            ' '.join(str(arg2['text']) for arg2 in triple['arg2s']),
            item['confidence'],
        ])

# Turn the flattened rows into a DataFrame.
kb_df = pd.DataFrame(rows, columns = ['message_id','sentence', 'arg1', 'rel', 'arg2s', 'confidence'])
kb_df.head()
# Export the DataFrame as CSV
#kb_df.to_csv('wiki-research-l/output/triple_from_text_part.csv')

# Store the DataFrame as a table in the relational database
#from db import connect
#engine = connect()
#kb_df.to_sql(name='kb_wiki_research_l_text',con=engine,if_exists='replace',index=None)

Unnamed: 0,message_id,sentence,arg1,rel,arg2s,confidence
0,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,It's a great suggestion to include a list of ORES based tools for people who are not familiar with ORES itself.,people,are not,familiar with ORES,0.895843
1,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,It's a great suggestion to include a list of ORES based tools for people who are not familiar with ORES itself.,a great suggestion,to include,a list of ORES based tools for people,0.925061
2,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,It's a great suggestion to include a list of ORES based tools for people who are not familiar with ORES itself.,It,'s,a great suggestion to include a list of ORES based tools for people,0.678369
3,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,"For our research, we would love to interview you and test our visualizations.",we,would love to test,our visualizations,0.256095
4,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,"For our research, we would love to interview you and test our visualizations.",we,would love,to test our visualizations For our research,0.380779


In [None]:
from entityLinking import tagme, confidentAnnotations, mediaWiki

# Annotate every sentence with tagme() and keep only the annotations whose
# 'rho' value is at least 0.3. Each kept item remembers the sentence_df index
# label of the sentence it came from.
entity_list = []
for i, sentence in sentence_df['sentence'].items():
    annotations = tagme(sentence)['annotations']
    entity_list.extend(
        [i, annotation['spot'], annotation['rho'], annotation['id'], annotation['title']]
        for annotation in annotations
        if annotation['rho'] >= 0.3
    )

# Store the extracted entities in a DataFrame, replacing the sentence label
# with the Message-ID of the mail the sentence belongs to.
rows = [
    [sentence_df['Message-ID'][sent_label], spot, rho, entity_id, title]
    for sent_label, spot, rho, entity_id, title in entity_list
]
entity_df = pd.DataFrame(rows,columns=['Message-ID','spot','rho','id','title'])
entity_df.head()

In [None]:
#entity_df = pd.read_csv('wiki-research-l/output/entity_df.csv', index_col=0)
# Show up to the first 100 entities whose rho is below 0.3.
# NOTE(review): presumably a sanity check — when entity_df comes from the
# rho >= 0.3 filter in the previous cell this should be empty; confirm.
low_rho_mask = entity_df['rho'] < 0.3
entity_df.loc[low_rho_mask].head(100)

In [None]:
# Dictionary used to replace the pronouns I, MY, ME, WE, OUR, US in the triples
# with the sender of the mail, keyed by message id.
sender_name_pattern = re.compile(r'\((.+)\)')
refer_pronoun = {}
for row in kb_df.values:
    message_id = row[0]
    if message_id in refer_pronoun:
        # Every triple of a mail shares one sender: build the entry only once
        # instead of re-scanning mail_df for each triple.
        continue
    sender = mail_df[mail_df['Message-ID']==message_id]['From'].values[0]
    # Use the text inside the parentheses of the From value as the sender name.
    name_match = sender_name_pattern.search(sender)
    if name_match is not None:
        sender = name_match.group(1)
    else:
        # No "(...)" part: fall back to the whole From value rather than
        # crashing on a missing regex match.
        sender = sender.strip()
    refer_pronoun[message_id]={'I':sender, 'MY':sender + '\'s', 'ME':sender, 'WE':sender, 'OUR':sender + '\'s', 'US':sender}

In [137]:
# Apply the sender dictionary: rewrite arg1, rel and arg2s of every triple.
def _swap_pronouns(text, message_id):
    """Replace pronoun tokens of `text` with the sender entry for `message_id`.

    The text is split on whitespace; a token is replaced when its upper-cased
    form is a key of refer_pronoun[message_id]. The tokens are re-joined with
    single spaces.
    """
    tokens = text.split()
    for pos, token in enumerate(tokens):
        replacement = refer_pronoun[message_id].get(token.upper())
        if replacement is not None:
            tokens[pos] = replacement
    return ' '.join(tokens)

# Columns 2, 3 and 4 of kb_df hold arg1, rel and arg2s; column 0 is the message id.
replaced_rows = [
    [_swap_pronouns(row[col], row[0]) for col in (2, 3, 4)]
    for row in kb_df.values
]

In [146]:
# Attach the pronoun-replaced triples to kb_df and save the result.
replaced_triples = pd.DataFrame(replaced_rows,columns=['new_arg1','new_rel','new_arg2s'])
# Drop any new_* columns left over from an earlier run of this cell so that
# re-running it does not pile up duplicate columns.
kb_df = pd.concat([kb_df.drop(columns=replaced_triples.columns, errors='ignore'), replaced_triples],axis=1)
# Use the same project-relative output directory as the other cells instead of
# a machine-specific absolute path.
kb_df.to_csv('wiki-research-l/output/replaced_triple_from_text_part.csv')

In [None]:
from extraction import entityExtraction

# Run entityExtraction on every sentence; ner_list keeps sentence_df row order,
# so its length equals the number of sentences.
ner_list = [entityExtraction(sentence) for sentence in sentence_df['sentence']]
len(ner_list)