In [1]:
import os, sys, email, datetime, pprint, re, time, html
import numpy as np
import pandas as pd
import nltk
from nltk.metrics import *

# Let pandas show up to 1000 characters per cell before truncating the display.
pd.set_option('display.max_colwidth', 1000)

In [2]:
# Load the mail files found in the dataset directory.

from loadFile import getFileList

# NOTE(review): absolute, machine-specific path — as written the notebook only
# runs where this directory exists.
directory_path = "/Users/taroaso/myprojects/OpenIE/trec/2005/each_dataset/17"
# presumably returns a list of file paths (it is sorted in place and each entry
# is later passed to open()) — TODO confirm against loadFile
file_list = getFileList(directory_path)
# Sort in place so the files are processed in sorted order.
file_list.sort()

In [3]:
# Parse the mail files and convert them into a DataFrame.

mail_cols = ['docno','received','isoreceived','sent','isosent','name','email','subject','id','charset','inreplyto','expires','to','cc','body']

# key="value" header lines whose quoted value is stored as-is.
PLAIN_FIELDS = ('docno', 'sent', 'name', 'email', 'id', 'charset', 'expires')
# key="value" header lines whose value has its HTML character references decoded.
HTML_FIELDS = ('subject', 'inreplyto')
# key="yyyymmddhhmmss" header lines whose value is converted to a datetime.
TIMESTAMP_FIELDS = ('isoreceived', 'isosent')

QUOTED_VALUE_RE = re.compile(r'".+"')
# One *or more* spaces are allowed in front of the day of month.
# NOTE(review): the previous pattern demanded exactly one space, which cannot
# match a space-padded single-digit day ("Jan  3"). That is presumably why the
# info() output recorded in this notebook shows far fewer non-null 'received'
# values than 'isoreceived' values — confirm against the raw files.
RECEIVED_RE = re.compile(r'(... ... +[0-9]{1,2} [0-9]{2}:[0-9]{2}:[0-9]{2} [0-9]{4})')


def parse_compact_timestamp(stamp):
    """Convert a 'yyyymmddhhmmss' string into a datetime.

    Characters after the 14th are ignored. Raises ValueError when one of the
    six components is not a valid integer or is out of range.
    """
    return datetime.datetime(int(stamp[0:4]), *(int(stamp[i:i + 2]) for i in range(4, 14, 2)))


def parse_mail(lines):
    """Turn the lines of one mail file into a record dict.

    Every line is tested against the known header prefixes. A line that
    carries a known prefix is consumed even when its value cannot be
    extracted; every other line becomes part of the body.

    Parameters
    ----------
    lines : list of str
        Lines of the file, each still carrying its trailing newline.

    Returns
    -------
    dict
        The extracted fields (absent headers are simply missing) plus
        'body', the concatenation of all non-header lines.
    """
    record = {}
    body = []
    for row in lines:
        key, sep, _ = row.partition('=')
        # `field` is the text in front of the first '=', i.e. the line starts with '<field>='.
        field = key if sep else None
        if field in PLAIN_FIELDS or field in HTML_FIELDS or field in TIMESTAMP_FIELDS:
            match = QUOTED_VALUE_RE.search(row)
            if match is not None:
                value = match.group().strip('"')
                if field in HTML_FIELDS:
                    value = html.unescape(value)
                elif field in TIMESTAMP_FIELDS:
                    value = parse_compact_timestamp(value)
                # A later line with the same prefix overwrites an earlier one.
                record[field] = value
        elif field == 'received':
            match = RECEIVED_RE.search(row)
            if match is not None:
                record['received'] = match.group().strip('"')
        elif row.startswith('To:'):
            # Only the first "To:" line is kept. rstrip('\n') removes just the
            # line terminator, so a final line without a newline keeps its
            # last character (row[3:-1] used to chop it off).
            record.setdefault('to', row[3:].rstrip('\n'))
        elif row.startswith('Cc:'):
            record.setdefault('cc', row[3:].rstrip('\n'))
        else:
            body.append(row)
    record['body'] = ''.join(body)
    return record


records = []
for path in file_list:
    # Undecodable bytes are dropped (errors='ignore'); the file is closed
    # before parsing starts.
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()
    records.append(parse_mail(lines))

# Build the frame once from all records: DataFrame.append in a loop copied the
# whole frame on every iteration and no longer exists in pandas >= 2.0.
mail_df = pd.DataFrame(records, columns=mail_cols)

In [4]:
# Summarise the parsed frame: row count, per-column non-null counts and dtypes.
mail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   docno        367 non-null    object        
 1   received     224 non-null    object        
 2   isoreceived  364 non-null    datetime64[ns]
 3   sent         367 non-null    object        
 4   isosent      364 non-null    datetime64[ns]
 5   name         367 non-null    object        
 6   email        367 non-null    object        
 7   subject      367 non-null    object        
 8   id           366 non-null    object        
 9   charset      258 non-null    object        
 10  inreplyto    277 non-null    object        
 11  expires      347 non-null    object        
 12  to           365 non-null    object        
 13  cc           113 non-null    object        
 14  body         367 non-null    object        
dtypes: datetime64[ns](2), object(13)
memory usage: 43.1+ KB


In [5]:
# Store mail_df (the parsed, not yet normalised mails) in the RDB as table
# 'mail_17_original'; an existing table of that name is replaced.

from db import connect
engine = connect()
# index=False: the DataFrame index is not written as a column. `index` is
# documented as a bool; the previous index=None only worked because None is
# handled like False.
mail_df.to_sql(name='mail_17_original', con=engine, if_exists='replace', index=False)

In [6]:
# Pre-processing of the mail text.
# Extracted entities are later turned into resources identified by their
# offset in the text, so the shared pre-processing is applied once, up front
# (no separate pre-processing is done when feeding EntityLinking or OpenIE).
#
# NOTE(review): `values` is the array the following cells iterate over.
# Whether mail_df itself sees these edits depends on whether `.values` hands
# back a view or a copy — presumably a copy here, since the frame mixes
# datetime and object columns; confirm.
values = mail_df.values
for value in values:
    # Column 14 is 'body'. Empty lines are dropped; every remaining line has
    # each run of two or more characters from the set  > | ( ) space
    # replaced by a single space.
    # NOTE(review): the pattern is a character class, so it also collapses
    # things like "()" or double spaces, not only "> > " quote markers —
    # confirm that this is intended.
    cleaned_lines = [
        re.sub(r'[>|(> )]{2,}', ' ', line)
        for line in value[14].splitlines()
        if line != ''
    ]
    value[14] = '\n'.join(cleaned_lines)

In [7]:
# Normalise 'to': the header may hold a bare address, or a personal name plus
# an address, or several of them. Reduce it to plain e-mail addresses and emit
# one output row per address.
canonical_step1 = []
for value in values:
    to = value[12]  # column 12 is 'to'
    # A missing header arrives as NaN (a float), so only strings are scanned.
    if isinstance(to, str):
        addresses = re.findall(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', to)
    else:
        addresses = []
    # A mail without any extractable address is kept with to=None. Previously
    # this was only done for a missing header, while a header holding no
    # parseable address made the whole mail disappear.
    # The loop variable is not called `email`, which would overwrite the
    # imported `email` module.
    for address in addresses or [None]:
        # Work on a copy so that `values` stays untouched and re-running this
        # cell gives the same result.
        row = list(value)
        row[12] = address
        canonical_step1.append(row)

In [8]:
# Normalise 'cc': reduce the header to plain e-mail addresses and emit one
# output row per address, for every row produced by the 'to' normalisation.
canonical_step2 = []
for value in canonical_step1:
    cc = value[13]  # column 13 is 'cc'
    # A missing header arrives as NaN (a float), so only strings are scanned.
    if isinstance(cc, str):
        addresses = re.findall(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', cc)
    else:
        addresses = []
    # A row without any extractable cc address is kept with cc=None.
    # Previously this was only done for a missing header, while a header
    # holding no parseable address made the row disappear.
    # The loop variable is not called `email`, which would overwrite the
    # imported `email` module.
    for address in addresses or [None]:
        # Work on a copy so that canonical_step1 stays untouched and
        # re-running this cell gives the same result.
        row = list(value)
        row[13] = address
        canonical_step2.append(row)

In [9]:
# Build the normalised DataFrame from the rows produced by the cc normalisation,
# using the same column layout as mail_df.

canonical_mail_df = pd.DataFrame(data=canonical_step2, columns=mail_cols)

In [10]:
# Store canonical_mail_df (the normalised mails) in the RDB as table 'mail_17';
# an existing table of that name is replaced.

from db import connect
engine = connect()
# index=False: the DataFrame index is not written as a column. `index` is
# documented as a bool; the previous index=None only worked because None is
# handled like False.
canonical_mail_df.to_sql(name='mail_17', con=engine, if_exists='replace', index=False)