In [120]:
import os, sys, email
import numpy as np
import pandas as pd
import nltk
import pprint, re, time

# Let long text columns (mail bodies, sentences) be displayed up to 1000 characters wide.
pd.set_option('display.max_colwidth', 1000)

In [121]:
from loadFile import getFileList, getDirList, fileToDataFrame

# Read the mail files found in the target directory.
# getFileList / fileToDataFrame are project helpers imported from loadFile.
directory_path = "wiki-research-l/2020-July"
file_list = getFileList(directory_path)
file_list.sort()

# Load the text files into DataFrames.
# Later cells read the 'Body', 'Message-ID', 'Subject' and 'From' columns of mail_df.
mail_df, thread_df = fileToDataFrame(file_list)

In [214]:
# Split the Body of every mail into its tagged parts.
#
# Each non-blank line of a body is expected to start with a 3-character tag
# ("[G]", "[S]", ...). The text after the tag is stored as {line_number: text}
# under the part name the tag maps to. One record per mail is appended to
# bodies['Bodies'] with the keys:
#   idx, message_id, countrows, body, Greetings, Sentence, Caption,
#   Bulletlist, Ending, Quotation, Footer, Misc
TAG_TO_PART = {
    'G': 'Greetings',
    'S': 'Sentence',
    'C': 'Caption',
    'B': 'Bulletlist',
    'E': 'Ending',
    'Q': 'Quotation',
    'F': 'Footer',
    'M': 'Misc',
}
TAG_PATTERN = re.compile(r'\[(G|S|C|B|E|Q|F|M)\]')

bodies = dict(Bodies=[])
for idx, body in mail_df.loc[:, 'Body'].items():
    # Fresh, empty list for every part so that each record always has all keys.
    parts = {part: [] for part in TAG_TO_PART.values()}

    lines = body.splitlines()
    for num, line in enumerate(lines):
        tagged = TAG_PATTERN.match(line)  # match() only looks at the start of the line
        if tagged is None:
            # Blank (or untagged) line: it belongs to no part.
            continue
        parts[TAG_TO_PART[tagged.group(1)]].append({num: line[3:]})

    # Body text with the tags removed.
    # NOTE(review): the pattern is not anchored here, so a tag-like "[S]" in the
    # middle of a line is removed as well - confirm this is intended.
    originbody = TAG_PATTERN.sub('', body)
    bodies['Bodies'].append({
        'idx': idx,
        'message_id': mail_df.loc[idx, 'Message-ID'],  # idx is an index label
        'countrows': len(lines),
        'body': originbody,
        **parts,
    })

In [128]:
# Build one multi-sentence text per mail from the 'Sentence' entries of bodies.
# Every entry is {line_number: sentence}; the sentences of a mail are concatenated,
# each followed by a single space. A mail without sentences yields ''.
s_list = [
    ''.join(list(entry.values())[0] + ' ' for entry in mail['Sentence'])
    for mail in bodies['Bodies']
]

In [142]:
# Natural language processing
import stanza

# Download the English ('en') resources, then build the English pipeline.
# The pipeline is bound to `nlp`, which the following cells call on mail text.
stanza.download('en')
nlp = stanza.Pipeline('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 3.71MB/s]
2020-09-07 13:24:27 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/default.zip: 100%|██████████| 428M/428M [02:23<00:00, 2.99MB/s]
2020-09-07 13:26:59 INFO: Finished downloading models and saved to /Users/taroaso/stanza_resources.
2020-09-07 13:26:59 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-09-07 13:26:59 INFO: Use device: cpu
2020-09-07 13:26:59 INFO: Loading: tokenize
2020-09-07 13:26:59 INFO: Loading: pos
2020-09-07 13:27:00 INFO: Loading: lemma
2020-09-07 13:27:00 INFO: Loading: depparse
2020-09-07 13:27:02 INFO: Loading: sentiment
2020-09-07 13:27:03 INF

In [164]:
# Build the sentence DataFrame: one row per sentence found in each mail's text.
sentence_list = []
# s_list was built by iterating bodies['Bodies'], so the two are in the same order.
# The Message-ID is taken from the matching body record rather than by indexing
# mail_df with the loop position, which is only right for a default 0..n-1 index.
for mail, content in zip(bodies['Bodies'], s_list):
    if content == '':
        # This mail has no Sentence lines: nothing to split.
        continue
    doc = nlp(content)
    # One output row per sentence detected by the pipeline.
    for sentence in doc.sentences:
        sentence_list.append([mail['message_id'], sentence.text])
sentence_df = pd.DataFrame(sentence_list, columns=['Message-ID', 'sentence'])
sentence_df.head()

Unnamed: 0,Message-ID,sentence
0,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,Thanks for your reply!
1,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,It's a great suggestion to include a list of ORES based tools for people who are not familiar with ORES itself.
2,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,"For our research, we would love to interview you and test our visualizations."
3,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,Please let me know what times will work best for you so we can schedule a meeting.
4,<CAE4fJj_b7k8yeqmz19a-seo3gtL9Mg1nvQO0oPUtrxvCFMkEaw@mail.gmail.com>,Thanks for clarifying.


In [292]:
# Named entities (NE) found in the body of each mail.
# Entities whose type is a time, percentage, amount of money, quantity or a
# cardinal/ordinal number are skipped.
rows = [
    [mail['message_id'], ent.text, ent.type]
    for mail in bodies['Bodies']
    for ent in nlp(mail['body']).ents
    if ent.type not in ('TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL')
]
ne_df = pd.DataFrame(rows, columns=['message_id', 'named_entity', 'type'])
ne_df.head()

Unnamed: 0,message_id,named_entity,type
0,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,ORES,ORG
1,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,Ethan,PERSON
2,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,"Tue, Jun 30, 2020",DATE
3,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,Netha Hussain,PERSON
4,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,Ethan,PERSON


In [255]:
# Entities for the sender and the receiver of each mail.
# NOTE: refer_pronoun is built in a cell further down this notebook; that cell
# has to be run before this one.
rows = []
for message_id, values in refer_pronoun.items():
    # A mail may carry only the sender keys, only the receiver keys, or both,
    # so each key is checked explicitly instead of swallowing every exception.
    if 'I' in values:
        rows.append([message_id, values['I'], 'SENDER'])
    if 'YOU' in values:
        rows.append([message_id, values['YOU'], 'RECEIVER'])
ne_from_header_df = pd.DataFrame(rows, columns=['message_id', 'named_entity', 'type'])
ne_from_header_df

Unnamed: 0,message_id,named_entity,type
0,<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>,Ethan Ye,SENDER
1,<CAE4fJj_b7k8yeqmz19a-seo3gtL9Mg1nvQO0oPUtrxvCFMkEaw@mail.gmail.com>,Ethan Ye,SENDER
2,<CAE4fJj_b7k8yeqmz19a-seo3gtL9Mg1nvQO0oPUtrxvCFMkEaw@mail.gmail.com>,Stella,RECEIVER
3,<CAFw=MpJg9SYEGQD9ywhphWwwNE8fHHcsSEHne8=B-8S0w+o=Kg@mail.gmail.com>,Giovanni Luca Ciampaglia,SENDER
4,<CALeA2c9GW9BFSMetuSCDWLbm2iGG1xM7Bp0R2A9-JJF2_7Xrwg@mail.gmail.com>,J. Nathan Matias,SENDER
5,<CAD_=H2LWP6n5COjqc-DEw59feJGxXBpJLen4m7BtPT5MfcJrpg@mail.gmail.com>,Miriam Redi,SENDER
6,<CALi5MRFpf2XvvVmSLsFvAkjq2QVRHSLLT4mDppF5eXVMxeZ34w@mail.gmail.com>,Mackenzie Lemieux,SENDER
7,<CALi5MRFpf2XvvVmSLsFvAkjq2QVRHSLLT4mDppF5eXVMxeZ34w@mail.gmail.com>,Wiki Community,RECEIVER
8,<CAK0Oe2uzvfjV=R9H5xPEwEPgY0+6FcjOu=F=opXuZtWhD2LyCQ@mail.gmail.com>,Leila Zia,SENDER
9,<CAK0Oe2uzvfjV=R9H5xPEwEPgY0+6FcjOu=F=opXuZtWhD2LyCQ@mail.gmail.com>,Mackenzie,RECEIVER


In [289]:
# Named entities (NE) found in the Subject of each mail.
rows = []
# Pair each Subject with the Message-ID of the same row; looking the ID up with
# the loop position is only right when mail_df has a default 0..n-1 index.
for message_id, subject in zip(mail_df['Message-ID'], mail_df['Subject']):
    # Subjects can contain newlines / tabs; turn them into spaces.
    subject = re.sub(r'(\n)|(\t)',' ',subject)
    doc = nlp(subject)
    for ent in doc.ents:
        # Skip times, percentages, money, quantities and plain numbers.
        if ent.type not in ('TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'CARDINAL', 'ORDINAL'):
            rows.append([message_id, ent.text, ent.type])
ne_from_subject_df = pd.DataFrame(rows, columns=['message_id', 'named_entity', 'type'])
ne_from_subject_df.head()

Unnamed: 0,message_id,named_entity,type
0,<CAFw=MpJg9SYEGQD9ywhphWwwNE8fHHcsSEHne8=B-8S0w+o=Kg@mail.gmail.com>,2nd Conference on Truth and Trust Online,WORK_OF_ART
1,<CAFw=MpJg9SYEGQD9ywhphWwwNE8fHHcsSEHne8=B-8S0w+o=Kg@mail.gmail.com>,TTO 2020,DATE
2,<1395374715.4530653.1593904473385@mail.yahoo.com>,June 2020,DATE
3,<1395374715.4530653.1593904473385@mail.yahoo.com>,the Wikimedia Research Newsletter,ORG
4,<CAJXKj+owEP_UyX2A17x+4NrsB76zv9puwpbPE8ZPBpfD0CRARQ@mail.gmail.com>,Wikimedia Research Showcase,EVENT


In [296]:
# Merge the entities found in bodies, subjects and headers into one table and
# drop rows that are exact duplicates.
ne_df = pd.concat([ne_df, ne_from_subject_df, ne_from_header_df], ignore_index=True)
ne_df = ne_df.drop_duplicates()

# Distinct entity strings of the merged table.
ne_list_body = ne_df['named_entity'].drop_duplicates().values

# Set used for membership tests when linking triples to entities.
# It is built from ne_list_body: the previous code read named_entity_list before
# any cell had defined it, which fails on a fresh kernel.
named_entity_list = set(ne_list_body)
len(named_entity_list)

390

In [298]:
# Write the named-entity table to CSV (path is relative to the working directory).
ne_df.to_csv('wiki-research-l/output/stanza_ne_df.csv')

In [305]:
#kb_df = pd.read_csv('/Users/taroaso/myprojects/OpenIE/wiki-research-l/output/full_replaced_triple_from_text_part.csv',index_col=0)
# Link triples to named entities: whenever a whitespace-separated word of the
# replaced subject (column 5, new_arg1) or replaced object (column 7, new_arg2s)
# is a known named entity, keep the kb_df row and emit a
# (word, 'seeAlso', phrase) triple. A row is kept once per matching word.
rows = []
triples = []
for row in kb_df.values:
    for phrase in (row[5], row[7]):
        matched = [word for word in phrase.split() if word in named_entity_list]
        rows.extend(row for _ in matched)
        triples.extend((word, 'seeAlso', phrase) for word in matched)
minie_linked_triple_from_text_part = pd.DataFrame(
    rows,
    columns=['message_id','sentence', 'arg1', 'rel', 'arg2s','new_arg1','new_rel','new_arg2s'],
)
minie_linked_triple_from_text_part.to_csv('wiki-research-l/output/minie_linked_triple_from_text_part.csv')

In [306]:
# Print the number of seeAlso triples before and after removing duplicates.
count_before = len(triples)
triples = set(triples)
print(count_before, len(triples), sep='\n')

238
98


In [310]:
# Store the seeAlso triples as a table and write it to CSV.
# triples may be a set at this point, and a set has no stable iteration order;
# sorting gives the same row order in the CSV on every run.
triple_df = pd.DataFrame(sorted(triples), columns=['subject','predicate','object'])
triple_df.to_csv('wiki-research-l/output/seealso_triples.csv')

In [136]:
# Run MinIE on every sentence and collect the extracted triples.
import requests
import json

MINIE_URL = 'http://localhost:8080/minie/query'
MINIE_TIMEOUT = 60  # seconds; without a timeout a stalled request blocks the cell forever

# One row per extracted fact: [message_id, sentence, subject, predicate, object]
extractions_list = []
# Sentences that could not be processed: (message_id, sentence, error)
failed_sentences = []
for message_id, sentence in zip(sentence_df['Message-ID'], sentence_df['sentence']):
    try:
        # Encode explicitly instead of relying on the HTTP stack's implicit
        # encoding of str bodies.
        # NOTE(review): confirm that the MinIE service reads the body as UTF-8.
        response = requests.post(MINIE_URL, data=sentence.encode('utf-8'), timeout=MINIE_TIMEOUT)
        response.raise_for_status()
        facts = response.json()['facts']
        extracted = [
            [message_id, sentence, triple['subject'], triple['predicate'], triple['object']]
            for triple in facts
        ]
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # Best effort: keep going, but remember what failed instead of hiding it.
        failed_sentences.append((message_id, sentence, repr(exc)))
        continue
    extractions_list.extend(extracted)

if failed_sentences:
    print('MinIE failed for {} of {} sentences; see failed_sentences'.format(
        len(failed_sentences), len(sentence_df)))

['<CAE4fJj-un1Um+3aE1jTe9b8WQZuFLMaaFmCJ9zNtzTkuUja0Rw@mail.gmail.com>',
 'Thanks for your reply!',
 'your',
 'has',
 'reply']
555


In [137]:
# Turn the extraction results into a DataFrame
kb_df = pd.DataFrame(extractions_list, columns = ['message_id','sentence', 'arg1', 'rel', 'arg2s'])
# Not the last expression of the cell, so this preview is not displayed.
kb_df.head()
# Write the DataFrame to CSV
kb_df.to_csv('wiki-research-l/output/minie_triple_from_text_part.csv')

# Store the DataFrame as a table in a relational database (currently disabled)
#from db import connect
#engine = connect()
#kb_df.to_sql(name='kb_wiki_research_l_text',con=engine,if_exists='replace',index=None)

In [138]:
# Build a dictionary of candidates for the pronoun "you" in the triples:
# the named entities found in the Greetings lines of each mail.

import spacy
import en_core_web_sm
# The spaCy model gets its own name. Other cells of this notebook use `nlp` for
# the stanza pipeline and read doc.sentences / ent.type from its results, so
# rebinding `nlp` here would break them when they are re-run.
spacy_nlp = en_core_web_sm.load()

# Entity labels that cannot be an addressee.
NON_ADDRESSEE_LABELS = ['DATE','TIME','PERCENT','MONEY','QUANTITY','ORDINAL','CARDINAL']

# {mail idx: [(entity text, entity label), ...]}; mails without candidates are left out.
you = {}
for mail in bodies['Bodies']:
    candidates = []
    for greetings in mail['Greetings']:
        # Each entry is {line_number: text}. Take the text whatever the line
        # number is; indexing with [0] only worked for a greeting on line 0.
        greeting_text = list(greetings.values())[0]
        doc = spacy_nlp(greeting_text)
        # Collect over all greeting lines so that a later line without entities
        # cannot discard candidates found on an earlier one.
        candidates.extend(
            (X.text, X.label_) for X in doc.ents if X.label_ not in NON_ADDRESSEE_LABELS
        )
    if candidates:
        you[mail['idx']] = candidates

In [139]:
# Dictionary for replacing the pronouns YOU / YOUR in the triples with the
# receiver: the first "you" candidate of each mail.
refer_you = {}
for idx, candidate in you.items():
    message_id = mail_df["Message-ID"][idx]
    receiver = candidate[0][0]
    refer_you[message_id] = {"YOU": receiver, "YOUR": receiver + '\'s'}


def _sender_name(from_header):
    """Return the text inside the parentheses of a From header value.

    When the header has no parenthesised part, the stripped header itself is
    returned instead of failing on a missing regex match.
    """
    found = re.search(r'(\(.+\))', from_header)
    if found is None:
        return from_header.strip()
    # Drop the surrounding "(" and ")".
    return from_header[found.start() + 1:found.end() - 1]


# Dictionary for replacing the pronouns I, MY, ME, WE, OUR, US in the triples
# with the sender. The first column of kb_df holds the message id; every
# message is handled once rather than once per triple row.
refer_pronoun = {}
for message_id in kb_df.iloc[:, 0].unique():
    sender = _sender_name(mail_df[mail_df['Message-ID']==message_id]['From'].values[0])
    refer_pronoun[message_id] = {
        'I': sender,
        'MY': sender + '\'s',
        'ME': sender,
        'WE': sender,
        'OUR': sender + '\'s',
        'US': sender,
    }

# Merge the two dictionaries: add the YOU / YOUR entries to the sender entries,
# creating the per-message dictionary when the message has no triples.
for key, value in refer_you.items():
    refer_pronoun.setdefault(key, {}).update(value)

In [140]:
# Apply the pronoun dictionary to the triples.
def _replace_pronouns(phrase, message_id):
    """Return phrase with every word that has an entry in refer_pronoun[message_id] replaced.

    Words are split on whitespace and looked up in upper case; words without an
    entry are kept unchanged. The result is joined with single spaces.
    """
    words = []
    for word in phrase.split():
        substitute = refer_pronoun[message_id].get(word.upper())
        words.append(word if substitute is None else substitute)
    return ' '.join(words)


# One [new_arg1, new_rel, new_arg2s] row per kb_df row
# (column 0: message id, columns 2-4: arg1, rel, arg2s).
replaced_rows = [
    [
        _replace_pronouns(row[2], row[0]),
        _replace_pronouns(row[3], row[0]),
        _replace_pronouns(row[4], row[0]),
    ]
    for row in kb_df.values
]

In [141]:
# Append the pronoun-replaced columns to the triple table and save it.
replaced_triples = pd.DataFrame(replaced_rows,columns=['new_arg1','new_rel','new_arg2s'])
# Start from the five original columns only, so that running this cell again
# does not add a second set of new_* columns.
kb_df = pd.concat(
    [kb_df[['message_id', 'sentence', 'arg1', 'rel', 'arg2s']], replaced_triples],
    axis=1,
)
# Path relative to the working directory, like the other outputs of this notebook
# (the previous absolute path only existed on one machine). The file has the same
# name as the CSV written when kb_df was first created, so that file is replaced
# by the extended table.
kb_df.to_csv('wiki-research-l/output/minie_triple_from_text_part.csv')

In [None]:
from entityLinking import tagme, confidentAnnotations, mediaWiki

# Call tagme on every sentence and keep the annotations whose rho is at least 0.3.
# Each kept item is [sentence index label, spot, rho, id, title].
entity_list = [
    [i, candidate['spot'], candidate['rho'], candidate['id'], candidate['title']]
    for i, sentence in sentence_df['sentence'].items()
    for candidate in tagme(sentence)['annotations']
    if candidate['rho'] >= 0.3
]

# Store the extracted entities in a DataFrame, replacing the sentence index
# label with the Message-ID of that sentence.
rows = [
    [sentence_df['Message-ID'][label], spot, rho, entity_id, title]
    for label, spot, rho, entity_id, title in entity_list
]
entity_df = pd.DataFrame(rows, columns=['Message-ID','spot','rho','id','title'])
entity_df.head()

In [None]:
#entity_df = pd.read_csv('wiki-research-l/output/entity_df.csv', index_col=0)
# Show up to the first 100 rows whose rho is below 0.3.
# NOTE(review): the cell that builds entity_df keeps only annotations with
# rho >= 0.3, so this view is empty unless entity_df comes from somewhere else
# (e.g. the CSV in the disabled line above) - confirm which one is intended.
entity_df[entity_df['rho'] < 0.3][0:100]

In [None]:
# Preview the first rows of the triple table.
kb_df.head()