In [1]:
import os, sys, email
import numpy as np
import pandas as pd
from nltk import tokenize
import pprint, re, time

# Show up to 1000 characters per column so long mail text is not truncated in output
pd.set_option('display.max_colwidth', 1000)

In [2]:
# NOTE(review): getDirList is imported but not used in the visible cells.
from loadFile import getFileList, getDirList, fileToDataFrame

# Read the mail files found under the directory
directory_path = "wiki-research-l/2020-July"
file_list = getFileList(directory_path)
file_list.sort()  # sorted in place before loading

# Store the text files in DataFrames
# (fileToDataFrame returns two objects: the mail frame and the thread frame)
mail_df, thread_df = fileToDataFrame(file_list)

In [3]:
# Number of mails loaded (row count of mail_df)
mail_df.shape[0]

37

In [4]:
# Maps the 3-character tag at the start of a line to the key under which lines
# carrying that tag are collected. The order of this table is also the key
# order of every per-mail record.
TAG_TO_SECTION = {
    '[G]': 'Greetings',
    '[S]': 'Sentence',
    '[C]': 'Caption',
    '[B]': 'Bulletlist',
    '[E]': 'Ending',
    '[Q]': 'Quotation',
    '[F]': 'Footer',
    '[M]': 'Misc',
}
TAG_LENGTH = 3  # every tag is exactly '[X]'


def parse_tagged_body(idx, body):
    """Split one tagged mail body into per-section line lists.

    Args:
        idx: Index label of the mail in ``mail_df``; copied into the record.
        body: Mail body text in which each relevant line starts with one of
            the tags in ``TAG_TO_SECTION``. A non-string value (for example
            a missing value) is treated as an empty body.

    Returns:
        A dict with the keys ``idx``, ``countrows`` (number of lines in the
        body), ``body`` (all tagged lines with their tag removed, joined by a
        single space) followed by one key per section. Each section value is
        a list of ``{line_number: text_without_tag}`` dicts in line order.
    """
    if not isinstance(body, str):
        # Keep one record per mail instead of failing on .splitlines()
        body = ''

    sections = {name: [] for name in TAG_TO_SECTION.values()}
    origin = []
    lines = body.splitlines()
    for num, line in enumerate(lines):
        section = TAG_TO_SECTION.get(line[:TAG_LENGTH])
        if section is None:
            # Blank lines and lines without a known tag are skipped
            continue
        text = line[TAG_LENGTH:]
        sections[section].append({num: text})
        origin.append(text)

    return {'idx': idx, 'countrows': len(lines), 'body': ' '.join(origin), **sections}


bodies = dict(Bodies=[])
for idx, body in mail_df.loc[:, 'Body'].items():
    bodies['Bodies'].append(parse_tagged_body(idx, body))

In [5]:
# Build one multi-sentence text per mail from the values of its 'Sentence' entries.
# Each entry is a {line number: text} dict; every line is followed by a single
# space, so a non-empty text ends with a trailing space.
s_list = [
    ''.join(list(entry.values())[0] + ' ' for entry in mail['Sentence'])
    for mail in bodies['Bodies']
]

In [6]:
# Build the flat list of sentences across all mails.
# Each mail text is split into sentences, then runs of two or more whitespace
# characters inside a sentence are collapsed to a single space.
# Note: a word-count filter (skip sentences with <= 5 or >= 100 words) was
# tried earlier and is currently disabled, so every sentence is kept.
multi_space = re.compile(r'\s{2,}')
sentence_list = [
    multi_space.sub(' ', raw_sentence)
    for content in s_list
    for raw_sentence in tokenize.sent_tokenize(content)
]

In [7]:
# Run every sentence through OpenIE
from pyopenie import OpenIE5

# URL handed to the OpenIE5 client
OPENIE_URL = 'http://localhost:8000'
extractor = OpenIE5(OPENIE_URL)

extractions_list = []
# (sentence, error) pairs for sentences whose extraction raised an exception
failed_sentences = []
for sentence in sentence_list:
    try:
        extractions = extractor.extract(sentence)
        extractions_list.append(extractions)
    except Exception as exc:
        # Best effort: keep processing the remaining sentences, but record the
        # failure instead of dropping it silently.
        failed_sentences.append((sentence, repr(exc)))

if failed_sentences:
    print(
        'OpenIE extraction failed for {} of {} sentences; see failed_sentences'.format(
            len(failed_sentences), len(sentence_list)
        ),
        file=sys.stderr,
    )

In [8]:
# Flatten the extractions into rows of
# [sentence, arg1, rel, arg2s, confidence]; the texts of all arg2 entries of
# one extraction are joined with a single space.
rows = []
for batch in extractions_list:
    # An empty batch simply contributes no rows
    for item in batch:
        source_sentence = item['sentence']
        score = item['confidence']
        triple = item['extraction']
        subject_text = triple['arg1']['text']
        relation_text = triple['rel']['text']
        object_text = ' '.join(str(arg2['text']) for arg2 in triple['arg2s'])
        rows.append([source_sentence, subject_text, relation_text, object_text, score])

In [9]:
# Assemble the extracted rows into a DataFrame and show how many there are
kb_columns = ['sentence', 'arg1', 'rel', 'arg2s', 'confidence']
kb_df = pd.DataFrame(rows, columns=kb_columns)
len(kb_df)

493

In [11]:
# Write the extracted triples to CSV. The output directory is created first
# because to_csv does not create missing parent directories.
output_csv_path = 'wiki-research-l/output/triple_from_text_part.csv'
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
kb_df.to_csv(output_csv_path)

In [9]:
# Disabled database export: the code below is wrapped in a string literal, so
# none of it runs. If re-enabled, it would obtain an engine from db.connect()
# and write kb_df to the table 'kb_wiki_research_l_text', replacing an existing
# table (if_exists='replace') and passing index=None.
'''
from db import connect
engine = connect()
kb_df.to_sql(name='kb_wiki_research_l_text',con=engine,if_exists='replace',index=None)
'''