In [1]:
import os, sys, email, datetime, pprint, re, time, html
import numpy as np
import pandas as pd
import nltk
from nltk.metrics import *

# Let long text columns (e.g. mail bodies) display up to 1000 characters per cell.
pd.set_option('display.max_colwidth', 1000)

In [2]:
from db import connect
# Open the database connection, then load every row of mail_21 ordered by docno.
engine = connect()
mail_query = 'SELECT * FROM mail_21 ORDER BY docno ASC'
mail_df = pd.read_sql(
    sql=mail_query,
    con=engine,
    index_col=None,
)

## RDFグラフ生成

In [3]:
from rdflib import Graph, Literal, RDF, URIRef, BNode, Namespace
from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
                           VOID, XMLNS, XSD
# Namespace for resources minted by this notebook (messages, senders and
# recipients are addressed as BASE[<local name>]); bound to the empty prefix.
BASE = Namespace("http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/")
# External e-mail vocabulary; this notebook uses EMAIL.References from it.
EMAIL = Namespace("http://www.w3.org/2000/10/swap/pim/email#")

In [4]:
# RDF graph that collects every generated triple.
g = Graph()

# Register the prefixes, in this order, so the serialised output uses short names.
for prefix, namespace in (
    ('', BASE),
    ('schema', SDO),
    ('email', EMAIL),
    ('foaf', FOAF),
    ('owl', OWL),
    ('rdf', RDF),
):
    g.bind(prefix, namespace)

In [5]:
def _is_missing(field):
    """Return True when a row cell holds no usable value (None, NaN or NaT)."""
    if field is None:
        return True
    try:
        return bool(pd.isna(field))
    except (TypeError, ValueError):
        # pd.isna() on a non-scalar returns an array; that is not a missing marker.
        return False


def _local_name(text):
    """Replace every character outside [a-zA-Z_0-9] with '_' to get a URI-safe local name."""
    return re.sub(r'[^a-zA-Z_0-9]', '_', text)


def header2rdf(g, value):
    """Add the triples describing one e-mail row to the RDF graph.

    Args:
        g: rdflib Graph that receives the triples (mutated in place).
        value: one row of ``DataFrame.values`` (numpy.ndarray), read by position:
            0 docid, 2 received timestamp, 4 sent timestamp, 5 sender name,
            6 sender address, 7 subject, 8 uid, 10 in-reply-to, 12 to,
            13 cc, 14 body.  The two timestamps must expose ``isoformat()``.

    Returns:
        None.  Rows whose uid is missing add nothing to the graph.

    Every optional field is skipped when it is missing (None, NaN or NaT).
    Previously the timestamps were converted with ``isoformat()`` before any
    check, so a missing date either raised AttributeError or was emitted as the
    invalid xsd:dateTime literal "NaT".
    """
    docid = value[0]
    received = value[2]
    sent = value[4]
    name = value[5]
    sender_address = value[6]  # not called `email`: that would shadow the email module
    subject = value[7]
    uid = value[8]
    inreplyto = value[10]
    to = value[12]
    cc = value[13]
    body = value[14]

    # Without a uid there is no URI to hang the triples on.
    if _is_missing(uid):
        return

    # Resources referenced by URI get a sanitised local name.
    message = BASE[_local_name(uid)]

    g.add((message, RDF.type, SDO.EmailMessage))
    g.add((message, SDO.identifier, Literal(uid)))
    g.add((message, SDO.alternateName, Literal(docid)))

    # Convert timestamps only after confirming they are present.
    if not _is_missing(sent):
        g.add((message, SDO.dateSent, Literal(sent.isoformat(), datatype=XSD.dateTime)))

    if not _is_missing(received):
        g.add((message, SDO.dateReceived, Literal(received.isoformat(), datatype=XSD.dateTime)))

    if not _is_missing(inreplyto):
        g.add((message, EMAIL.References, BASE[_local_name(inreplyto)]))

    if not _is_missing(name):
        # The sender node is keyed by the sanitised display name.
        sender = BASE[_local_name(name)]
        g.add((message, SDO.sender, sender))
        g.add((sender, RDF.type, FOAF.Agent))
        if not _is_missing(sender_address):
            g.add((sender, FOAF.mbox, URIRef('mailto:' + sender_address.strip())))

    # The `to` / `cc` values serve both as the node name (sanitised) and as the mailbox.
    if not _is_missing(to):
        to_recipient = BASE[_local_name(to)]
        g.add((message, SDO.toRecipient, to_recipient))
        g.add((to_recipient, FOAF.mbox, URIRef('mailto:' + to.strip())))

    if not _is_missing(cc):
        cc_recipient = BASE[_local_name(cc)]
        g.add((message, SDO.ccRecipient, cc_recipient))
        g.add((cc_recipient, FOAF.mbox, URIRef('mailto:' + cc.strip())))

    if not _is_missing(subject):
        g.add((message, SDO.headline, Literal(subject)))

    if not _is_missing(body):
        g.add((message, SDO.text, Literal(body)))

In [6]:
# Feed each row of the mail table, in DataFrame order, into the graph builder.
for mail_row in mail_df.values:
    header2rdf(g, mail_row)

In [7]:
# NOTE(review): absolute, user-specific path; consider making it configurable.
new_dir_path = '/Users/taroaso/myprojects/OpenIE/trec/output/21'

# makedirs(exist_ok=True) keeps the cell re-runnable: os.mkdir raised
# FileExistsError on a second run and failed when the parent was missing.
os.makedirs(new_dir_path, exist_ok=True)

# Write the graph as Turtle; the file path is derived from the directory above
# so the two cannot drift apart.
g.serialize(destination=os.path.join(new_dir_path, 'mail_header.ttl'), format='turtle')

In [8]:
#print(g.serialize(format="turtle").decode("utf-8"))