In [None]:
%load_ext autoreload
%autoreload 2
# default_exp importers.gmail

In [None]:
# export

import imaplib, email
from integrators.data.schema import Account, EmailMessage, MessageChannel
from integrators.pod.client import PodClient
from email import policy

In [None]:
# export

class IMAPClient():
    """Small wrapper around an ``imaplib.IMAP4_SSL`` connection with one selected mailbox.

    Constructing an instance opens the SSL connection, logs in and selects
    ``inbox``; all later calls operate on that selected mailbox.
    """

    def __init__(self, username, app_pw, host='imap.gmail.com', port=993, inbox='"[Gmail]/All Mail"'):
        """Connect to ``host``:``port``, log in as ``username`` and select ``inbox``.

        Args:
            username: Login name passed to ``IMAP4_SSL.login``.
            app_pw: Password passed to ``IMAP4_SSL.login``.
            host: IMAP server host name.
            port: IMAP SSL port.
            inbox: Mailbox name passed to ``IMAP4_SSL.select``.
        """
        # Quick fix to support Google's threading method: only the Gmail host is
        # asked for the X-GM-THRID fetch item.
        self.x_gm_thrid_support = (host == 'imap.gmail.com')

        self.client = imaplib.IMAP4_SSL(host, port=port)
        self.client.login(username, app_pw)
        self.client.select(inbox)  # connect to inbox.

    def list_mailboxes(self):
        """Return the raw ``(status, data)`` result of the IMAP LIST command."""
        return self.client.list()

    def get_all_mail_uids(self):
        """Return the UIDs of every message in the selected mailbox as a list of bytes."""
        # UID SEARCH returns UIDs instead of sequence numbers, space separated in data[0].
        result, data = self.client.uid('search', None, "ALL")
        return data[0].split()

    def get_mail(self, uid):
        """Fetch one message by UID.

        Args:
            uid: Message UID, as returned by ``get_all_mail_uids``.

        Returns:
            A ``(raw_email, thread_id)`` tuple. ``raw_email`` is the RFC822
            message as returned by the server. ``thread_id`` is the value of
            the X-GM-THRID fetch item as a string, or ``None`` when threading
            is not supported for this host or the item is missing from the
            response.
        """
        if self.x_gm_thrid_support:
            # Use Google's threading method, in which every thread has an ID
            result, data = self.client.uid('fetch', uid, '(RFC822 X-GM-THRID)')
            raw_email = data[0][1]
            return (raw_email, self._extract_thread_id(data))

        # Threading not yet implemented for IMAP threading
        result, data = self.client.uid('fetch', uid, '(RFC822)')
        raw_email = data[0][1]
        return (raw_email, None)

    @staticmethod
    def _extract_thread_id(fetch_data):
        """Return the token following X-GM-THRID in a FETCH response, or ``None``.

        The server is free to order fetch items as it likes, so the value is
        located by name rather than by position in the response line.
        """
        pieces = []
        for part in fetch_data:
            # Items carrying a literal arrive as (header, literal) tuples; only the
            # header is inspected so the message body can never be mistaken for an item.
            header = part[0] if isinstance(part, tuple) else part
            if isinstance(header, bytes):
                pieces.append(header.decode("utf-8", "replace"))

        tokens = " ".join(pieces).replace("(", " ").replace(")", " ").split()
        for position, token in enumerate(tokens[:-1]):
            if token.upper() == "X-GM-THRID":
                return tokens[position + 1]
        return None
    
#     def get_all_mails(self, uids):
#         res = []
#         for uid in tqdm(uids):
#             result, data = self.client.uid('fetch', uid, '(RFC822)')
#             raw_email = data[0][1]
#             res.append(raw_email)
#         return res

#     def get_x_gm_thrid(self, uid):
#         result, data = self.client.uid('fetch', uid, '(X-GM-THRID X-GM-MSGID)')
#         return data[0].decode("utf-8").split(" ")[2]
    
# # @staticmethod
# def part_to_str(part):
#     bytes_ = part.get_payload(decode=True)
#     charset = part.get_content_charset('iso-8859-1')
#     chars = bytes_.decode(charset, 'replace')
#     return chars

# # @staticmethod
# def get_html(email_message_instance):
#     maintype = email_message_instance.get_content_maintype()
#     if maintype == 'multipart':

#         parts = _get_all_parts(email_message_instance)        
#         res = None
#         html_parts = [part_to_str(part) for part in parts if part.get_content_type() == "text/html"]
#         if len(html_parts) > 0:
#             if len(html_parts) > 1:
#                 error_msg = "\n AND \n".join(html_parts)
#                 print(f"WARNING: FOUND MULTIPLE HTML PARTS IN ONE MESSAGE {error_msg}")
#             return html_parts[0]
#         else:                
#             return parts[0].get_payload()

#     elif maintype == 'text':
#         return email_message_instance.get_payload()

# # @staticmethod
# def _get_all_parts(part):
#     payload = part.get_payload()
#     if isinstance(payload, list):
#         return [x for p in payload for x in _get_all_parts(p)]
#     else:
#         return [part]   

def get_message_content(message):
    """Split a parsed message into its body text and its attachments.

    Args:
        message: An ``email.message.EmailMessage`` (it must provide
            ``iter_attachments`` and ``get_body``).

    Returns:
        A ``(content, attachments)`` tuple. ``content`` is the decoded content
        of the part selected by ``message.get_body()``, or ``None`` when the
        message has no body part (for example a mail that only carries
        attachments). ``attachments`` is the list of parts yielded by
        ``message.iter_attachments()``.
    """
    # SEPARATE THE ATTACHMENTS
    attachments = list(message.iter_attachments())

    # get_body() returns None when no suitable body part exists; do not crash on it.
    body = message.get_body()
    content = body.get_content() if body is not None else None

    return (content, attachments)

def get_addresses_from_message(message, field):
    """Return every address found in the ``field`` header(s) of ``message``.

    Args:
        message: A parsed email message.
        field: Header name, e.g. ``'from'``, ``'to'`` or ``'reply-to'``.

    Returns:
        A list of ``(name, address)`` tuples as produced by
        ``email.utils.getaddresses``; an empty list when the header is absent.
    """
    # get_all() collects repeated headers of the same name, whereas message[field]
    # would only expose the first one; the [] default covers a missing header.
    return email.utils.getaddresses(message.get_all(field, []))
    
def get_timestamp_from_message(message):
    """Return the message's Date header as milliseconds since the Unix epoch.

    Args:
        message: A parsed email message.

    Returns:
        The Date header converted to an ``int`` number of milliseconds since
        the epoch, or ``None`` when the header is missing or cannot be parsed.
    """
    # Local import keeps this block self-contained; the module itself only imports `email`.
    from datetime import timezone

    try:
        date = message["date"]
        if date is None:
            return None
        dt = email.utils.parsedate_to_datetime(date)
    except (TypeError, ValueError):
        # Depending on the Python version an unparseable date raises either of these.
        return None

    # A "-0000" (or zone-less) date parses to a naive datetime that represents UTC.
    # Calling .timestamp() on a naive value would interpret it in the machine's
    # local zone instead, so pin it to UTC first.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)

    return int(dt.timestamp() * 1000)

In [None]:
# export

def create_item_from_mail(mail, thread_id=None):
    """Build an ``EmailMessage`` item, with its edges, from one raw mail.

    Args:
        mail: The raw message as bytes.
        thread_id: Optional thread identifier; when given, the item gets a
            'messageChannel' edge to a ``MessageChannel`` with that externalId.

    Returns:
        The ``EmailMessage`` item, carrying 'sender', 'receiver' and 'replyTo'
        edges to ``Account`` items built from the From, To and Reply-To headers.
    """
    message = email.message_from_bytes(mail, policy=policy.SMTP)

    message_id = message["message-id"]
    subject = message["subject"]
    timestamp = get_timestamp_from_message(message)

    # Edge name paired with the addresses of the header it is derived from,
    # listed in the order the edges are attached below.
    addresses_per_edge = [
        (edge_name, get_addresses_from_message(message, header_name))
        for edge_name, header_name in (
            ('sender', 'from'),
            ('receiver', 'to'),
            ('replyTo', 'reply-to'),
        )
    ]

    # TODO: verbose option?

    # Attachments are extracted but not used any further here.
    content, _attachments = get_message_content(message)

    # TODO: is dateSent the right way to go? Might differ for whether you're sender or receiver
    # TODO: importJson
    # TODO: MAIL namespace
    email_item = EmailMessage(externalId=message_id, subject=subject, dateSent=timestamp, content=content)

    # Create Edges to accounts
    for edge_name, address_tuples in addresses_per_edge:
        for _display_name, address in address_tuples:
            email_item.add_edge(edge_name, Account(externalId=address))

    # Create edge to MessageChannel
    if thread_id is not None:
        email_item.add_edge('messageChannel', MessageChannel(externalId=thread_id))

    return email_item

def download_mails(imap_client, gmail_ids, stop_at):
    """Fetch mails one by one and convert each into an item.

    Args:
        imap_client: Object whose ``get_mail(uid)`` returns ``(raw_mail, thread_id)``.
        gmail_ids: Iterable of message ids to fetch, in order.
        stop_at: Maximum number of mails to fetch, or ``None`` for no limit.

    Returns:
        List of the items created by ``create_item_from_mail``, in fetch order.
    """
    items = []

    for position, uid in enumerate(gmail_ids):
        limit_reached = stop_at is not None and position >= stop_at
        if limit_reached:
            # Only reported when ids remain beyond the limit.
            print(f"stopped early at {stop_at}")
            break

        raw_mail, thread = imap_client.get_mail(uid)
        items.append(create_item_from_mail(raw_mail, thread_id=thread))

    return items

# TODO: should probably become a general utility function
def merge_duplicate_items(all_mails):
    """Make edges that point at equal externalIds share one target object.

    The first object seen for each externalId becomes the canonical one, and
    every edge of every mail is re-pointed (in place) at the canonical object
    for its target's externalId. Targets of all edges are considered, not only
    accounts.

    Args:
        all_mails: Items exposing ``get_all_edges()``.

    Returns:
        Dict mapping externalId to the canonical target object.
    """
    canonical = {}

    # Pass 1: remember the first object encountered for each externalId.
    for mail in all_mails:
        for edge in mail.get_all_edges():
            target = edge.traverse(mail)
            canonical.setdefault(target.externalId, target)

    # Pass 2: rewire every edge to the remembered object.
    for mail in all_mails:
        for edge in mail.get_all_edges():
            edge.target = canonical[edge.target.externalId]

    return canonical

In [None]:
# export

from integrators.data.schema import *
from integrators.imports import *
from integrators.indexers.indexer import ImporterBase, test_registration

class GmailImporter(ImporterBase):
    """Importer that downloads mails over IMAP from Gmail and stores them via a pod client."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def run(self, importer_run, pod_client=None, stop_at=10):
        """Download mails and create the matching items and edges in the pod.

        Args:
            importer_run: Object providing the ``username`` and ``password``
                used to log in to the IMAP server.
            pod_client: Client used to create items and edges. Required in
                practice, even though the signature defaults it to ``None``.
            stop_at: Maximum number of mails to download; ``None`` downloads
                everything. Defaults to 10, the limit that used to be hard-coded.

        Raises:
            ValueError: If ``pod_client`` is ``None``.
        """
        # Fail before any network traffic instead of with an AttributeError after the download.
        if pod_client is None:
            raise ValueError("GmailImporter.run requires a pod_client")

        # TODO: Get imap_host from importer_run
        imap_host = 'imap.gmail.com'

        imap_client = IMAPClient(username=importer_run.username,
                                 app_pw=importer_run.password,
                                 host=imap_host,
                                 port=993)
        gmail_ids = imap_client.get_all_mail_uids()
        all_mails = download_mails(imap_client, gmail_ids, stop_at)

        # Merge email accounts/messageChannels here
        # TODO: create better way to do this
        all_accounts = merge_duplicate_items(all_mails)

        # Create all email items, then every merged edge target
        for email_item in all_mails:
            print(pod_client.get_properties_json(email_item))
            pod_client.create(email_item)
        for item in all_accounts.values():
            pod_client.create(item)

        # Create all edges from emails to accounts/messageThreads
        for email_item in all_mails:
            pod_client.create_edges(email_item.get_all_edges())

In [None]:
%nbdev_slow_test

# Store your credentials in this file (first line: username, second line: password):
# The context manager closes the file as soon as both lines have been read.
with open('tmp/credentials_gmail.txt','r') as file:
    imap_user = file.readline().strip('\n')
    imap_pw = file.readline().strip('\n')
imap_host = 'imap.gmail.com'

# Start from an empty pod so the import result is not mixed with earlier runs.
pod_client = PodClient()
pod_client.delete_all()

importer_run = ImporterRun.from_data(progress=0, username=imap_user, password=imap_pw)

importer = GmailImporter.from_data()
importer.run(importer_run=importer_run, pod_client=pod_client)

In [None]:
# Minimal raw mail used as a parsing fixture; `test` is reused by a later cell.
test = b"""\
Message-id: 1234\r
From: user1 <a@gmail.com>\r
To: user1 <b@gmail.com>\r
Reply-to: user1 <c@gmail.com>\r
Subject: the subject\r
Date: Mon, 04 May 2020 00:37:44 -0700\r

This is content"""

mail_item = create_item_from_mail(test, 'message_channel_id')

assert mail_item.externalId == '1234'
assert mail_item.sender[0].externalId == 'a@gmail.com'
assert mail_item.receiver[0].externalId == 'b@gmail.com'
assert mail_item.replyTo[0].externalId == 'c@gmail.com'
assert mail_item.subject == 'the subject'
assert mail_item.content == 'This is content'
# Expected value is spelled out (2020-05-04T07:37:44Z in epoch milliseconds) rather than
# recomputed with get_timestamp_from_message, so a bug in that helper cannot hide here.
assert mail_item.dateSent == 1588577864000
assert mail_item.messageChannel[0].externalId == 'message_channel_id'

In [None]:
# Test attachment parsing (basic support)

# Build a mail with a text body and two binary attachments, then check that
# get_message_content separates them.
message = email.message.EmailMessage()
message.set_content('aa')
for attachment_bytes, attachment_name in ((b'bb', 'sample.jpg'), (b'cc', 'sample2.jpg')):
    message.add_attachment(attachment_bytes, maintype='image', subtype='jpeg', filename=attachment_name)
content, attachments = get_message_content(message)

assert content == 'aa\n'
for attachment_index, expected_bytes in enumerate((b'bb', b'cc')):
    assert attachments[attachment_index].get_content() == expected_bytes

In [None]:
def get_properties_json(node):
    """Collect the plain properties stored on ``node`` into a dict.

    For every attribute in ``node.__dict__`` the name is printed together with
    a flag telling whether the value is a non-empty list whose first element
    is an ``Edge``. Attributes are left out of the result when their name
    starts with an underscore, when that flag is true, or when the value is
    ``None``.
    """
    properties = {}
    for key, value in node.__dict__.items():
        holds_edges = isinstance(value, list) and len(value) > 0 and isinstance(value[0], Edge)
        print(key, holds_edges)
        if key[:1] == '_' or holds_edges or value is None:
            continue
        properties[key] = value
    return properties

# Scratch check of get_properties_json on a freshly parsed mail.
# Relies on the `test` bytes fixture defined in an earlier cell, so that cell must run first.
mail_item = create_item_from_mail(test, 'message_channel_id')
print('aaa', mail_item.sender)

print(get_properties_json(mail_item))