In [None]:
%load_ext autoreload
%autoreload 2
# default_exp importers.email

In [None]:
# export
import imaplib, email, math
from integrators.data.schema import Account, EmailMessage, MessageChannel
from integrators.pod.client import PodClient
from integrators.importers.util import *
from integrators.data.basic import *
from email import policy
from email.utils import getaddresses
from integrators.imports import *
from nbdev.showdoc import show_doc

# Email importer

This importer fetches your emails and accounts over IMAP. It uses Python's built-in IMAP client plus some convenience functions for easier usage, batching and importing to the pod. The importer requires you to log in with your email address and an app password. It is tested on Gmail, but should work with other IMAP servers. 

> Note: **The recommended usage for Gmail is to enable two-factor authentication. In this case, make sure you allow [SMTP-connections](https://www.gmass.co/blog/gmail-smtp/) and set an application password (explained in the same link)**

## ImapClient

The `EmailImporter` communicates with email providers over IMAP. We created a convenience class around Python's `imaplib`, called `IMAPClient`, that lets you list your mailboxes, retrieve your mails and get their content.

In [None]:
# export
DEFAULT_GMAIL_HOST = 'imap.gmail.com'
DEFAULT_GMAIL_INBOX = '"[Gmail]/All Mail"' # Note the double quotes here
DEFAULT_PORT = 993

class IMAPClient():
    """Small convenience wrapper around `imaplib.IMAP4_SSL`.

    Creating an instance opens an SSL connection to `host`:`port`, logs in with
    `username` / `app_pw` and selects `inbox`, so all later calls operate on
    that mailbox.
    """

    def __init__(self, username, app_pw, host=DEFAULT_GMAIL_HOST, port=DEFAULT_PORT, inbox=DEFAULT_GMAIL_INBOX):
        assert username is not None and app_pw is not None
        self.client = imaplib.IMAP4_SSL(host, port=port)
        self.client.login(username, app_pw)
        self.client.select(inbox) # connect to inbox.

    def list_mailboxes(self):
        """Lists all available mailboxes"""
        return self.client.list()

    def get_all_mail_uids(self):
        """retrieves all mail uids from the selected mailbox"""
        # `UID SEARCH ALL` returns uids instead of message sequence numbers.
        _status, data = self.client.uid('search', None, "ALL")
        # data[0] is one space-separated bytes string of uids (empty for an empty mailbox).
        return data[0].split()

    def get_mails(self, uids):
        """Fetches every mail in `uids`, returns a list of (raw_mail, thread_id) tuples"""
        return [self.get_mail(uid) for uid in uids]

    @staticmethod
    def _parse_thread_id(fetch_header):
        """Extracts the X-GM-THRID value from the header bytes of a FETCH response.

        The value is looked up by its key rather than by position, because a
        server may return the requested data items in any order. Returns None
        when the response carries no X-GM-THRID item.
        """
        text = fetch_header.decode("utf-8")
        # Parentheses delimit the item list; turn them into separators so that
        # "(X-GM-THRID" and a trailing "123)" tokenize cleanly.
        tokens = text.replace("(", " ").replace(")", " ").split()
        keys = [token.upper() for token in tokens]
        try:
            return tokens[keys.index("X-GM-THRID") + 1]
        except (ValueError, IndexError):
            return None

    def get_mail(self, uid):
        """Fetches a mail given a uid, returns (raw_mail, thread_id)

        For the Gmail host the thread id is Gmail's X-GM-THRID; for every other
        host threading is not implemented yet and thread_id is None.
        """
        # Use Google's threading method, in which every thread has an ID
        use_gmail_threads = self.client.host == DEFAULT_GMAIL_HOST
        query = '(RFC822 X-GM-THRID)' if use_gmail_threads else '(RFC822)'

        _status, (data, _) = self.client.uid('fetch', uid, query)
        # data is a (response header, raw message bytes) pair
        raw_email = data[1]
        thread_id = self._parse_thread_id(data[0]) if use_gmail_threads else None
        return (raw_email, thread_id)

def part_to_str(part):
    # hide
    """Decodes the payload of a single (non-multipart) message part to a str.

    The transfer encoding (base64, quoted-printable, ...) is undone first, then
    the bytes are decoded with the charset declared by the part. Without a
    declared charset, or when the declared charset is unknown to Python,
    iso-8859-1 is used, so one mislabelled mail cannot abort an import.
    Undecodable bytes are replaced instead of raising. A part without a
    decodable payload yields an empty string.
    """
    fallback_charset = 'iso-8859-1'
    raw = part.get_payload(decode=True)
    if raw is None:
        # get_payload(decode=True) returns None for multipart containers
        return ''
    charset = part.get_content_charset(fallback_charset)
    try:
        return raw.decode(charset, 'replace')
    except LookupError:
        # The declared charset name is not a codec Python knows
        return raw.decode(fallback_charset, 'replace')

def _get_all_parts(part):
    # hide
    payload = part.get_payload()
    if isinstance(payload, list):
        return [x for p in payload for x in _get_all_parts(p)]
    else:
        return [part]

In [None]:
show_doc(IMAPClient)

<h2 id="IMAPClient" class="doc_header"><code>class</code> <code>IMAPClient</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>IMAPClient</code>(**`username`**, **`app_pw`**, **`host`**=*`'imap.gmail.com'`*, **`port`**=*`993`*, **`inbox`**=*`'"[Gmail]/All Mail"'`*)



In [None]:
show_doc(IMAPClient.list_mailboxes)

<h4 id="IMAPClient.list_mailboxes" class="doc_header"><code>IMAPClient.list_mailboxes</code><a href="__main__.py#L14" class="source_link" style="float:right">[source]</a></h4>

> <code>IMAPClient.list_mailboxes</code>()

Lists all available mailboxes

In [None]:
show_doc(IMAPClient.get_all_mail_uids)

<h4 id="IMAPClient.get_all_mail_uids" class="doc_header"><code>IMAPClient.get_all_mail_uids</code><a href="__main__.py#L18" class="source_link" style="float:right">[source]</a></h4>

> <code>IMAPClient.get_all_mail_uids</code>()

retrieves all mail uids from the selected mailbox

In [None]:
show_doc(IMAPClient.get_mail)

<h4 id="IMAPClient.get_mail" class="doc_header"><code>IMAPClient.get_mail</code><a href="__main__.py#L26" class="source_link" style="float:right">[source]</a></h4>

> <code>IMAPClient.get_mail</code>(**`uid`**)

Fetches a mail given a uid, returns (raw_mail, thread_id)

In [None]:
# export

# TODO: should probably become a general utility function
def get_unique_accounts(all_mails):
    # hide
    """Deduplicates the items linked from `all_mails` by their externalId.

    The first item seen for each externalId becomes the canonical one. Every
    edge of every mail is then re-pointed (in place) to the canonical item, and
    the canonical items are returned in first-seen order.
    """
    canonical = {}

    # Pass 1: remember the first item encountered for each externalId
    for mail in all_mails:
        for link in mail.get_all_edges():
            linked_item = link.traverse(mail)
            canonical.setdefault(linked_item.externalId, linked_item)

    # Pass 2: make all edges share the canonical instances
    for mail in all_mails:
        for link in mail.get_all_edges():
            link.target = canonical[link.target.externalId]

    return [*canonical.values()]

# TODO: should probably become a general utility function
def get_g_attr(item, name, data_type, default_value=None):
    # hide
    """Reads the value of the generic attribute called `name` from `item`.

    The first entry of `item.genericAttribute` whose name matches is used.
    `data_type` selects which value field is read ('int', 'bool', 'float',
    'string' or 'datetime'; 'datetime' is read from the string field).
    Returns `default_value` when no attribute matches. Raises an Exception for
    an unsupported `data_type`, but only when a matching attribute was found.
    """
    for attribute in item.genericAttribute:
        if attribute.name == name:
            break
    else:
        # No attribute with that name
        return default_value

    value_fields = (
        ('int', 'intValue'),
        ('bool', 'boolValue'),
        ('float', 'floatValue'),
        ('string', 'stringValue'),
        ('datetime', 'stringValue'),
    )
    for type_name, field_name in value_fields:
        if data_type == type_name:
            return getattr(attribute, field_name)

    raise Exception(f"datatype {data_type} is not supported")

## EmailImporter

In [None]:
# export
from integrators.data.schema import *
from integrators.imports import *
from integrators.indexers.indexer import test_registration
from integrators.importers.importer import ImporterBase


class EmailImporter(ImporterBase):
    """Imports emails over imap."""
    def __init__(self, *args, **kwargs):
        # NOTE(review): `private` is assigned before the base initialiser runs,
        # presumably because ImporterBase reads it -- keep this order.
        self.private = ["imap_client"]
        super().__init__(*args, **kwargs)
        # Created lazily by set_imap_client(), since it needs the run's credentials
        self.imap_client = None

    def get_data(self, client, indexer_run):
        print('this function is a workaround (this Importer is an Indexer temporarily)')

    def set_imap_client(self, importer_run):
        """Creates the IMAP connection for this run.

        Host and port are read from the run's generic attributes 'host' and
        'port', falling back to the Gmail defaults. Username and password are
        taken from the run itself.
        """
        imap_host = get_g_attr(importer_run, 'host', 'string', DEFAULT_GMAIL_HOST)
        port = get_g_attr(importer_run, 'port', 'int', DEFAULT_PORT)
        assert imap_host is not None and port is not None
        print(f'Using, HOST: {imap_host}, PORT: {port}')

        # Use the configured port instead of a hard-coded 993
        self.imap_client = IMAPClient(username=importer_run.username,
                                      app_pw=importer_run.password,
                                      host=imap_host,
                                      port=int(port))

    @staticmethod
    def get_timestamp_from_message(message):
        """Returns the Date header of `message` as milliseconds since the epoch.

        Returns None when the message has no Date header.
        """
        date = message["date"]
        if date is None:
            return None
        dt = email.utils.parsedate_to_datetime(date)
        return int(dt.timestamp() * 1000)

    @staticmethod
    def get_accounts(message, field):
        """Returns one Account per address found in header `field` of `message`."""
        addresses = getaddresses(message.get_all(field, []))
        return [Account(externalId=address) for _name, address in addresses]

    @staticmethod
    def get_content(message):
        """Extracts content from a python email message

        For multipart messages the first text/html part is returned (decoded to
        str); without any html part the payload of the first leaf part is
        returned. For text messages the payload is returned. Any other message
        type, or a multipart message without parts, yields None.
        """
        maintype = message.get_content_maintype()
        if maintype == 'multipart':
            parts = _get_all_parts(message)
            html_parts = [part_to_str(part) for part in parts if part.get_content_type() == "text/html"]
            if html_parts:
                if len(html_parts) > 1:
                    error_msg = "\n AND \n".join(html_parts)
                    print(f"WARNING: FOUND MULTIPLE HTML PARTS IN ONE MESSAGE {error_msg}")
                return html_parts[0]
            if not parts:
                # Multipart container without any parts: nothing to extract
                return None
            # NOTE(review): this payload is returned as stored in the message,
            # without undoing its transfer encoding -- confirm this is intended.
            return parts[0].get_payload()

        if maintype == 'text':
            return message.get_payload()

        return None

    @staticmethod
    def get_attachments(message): return list(message.iter_attachments())

    def create_item_from_mail(self, mail, thread_id=None):
        """Creates a schema-item from an existing mail

        `mail` is the raw message as bytes. The returned EmailMessage gets
        'sender', 'receiver' and 'replyTo' edges for the addresses in the From,
        To and Reply-To headers and, when `thread_id` is given, a
        'messageChannel' edge to a MessageChannel with that external id.
        """
        message = email.message_from_bytes(mail, policy=policy.SMTP)

        message_id, subject = message["message-id"], message["subject"]
        timestamp = self.get_timestamp_from_message(message)
        content = self.get_content(message)
        # Attachments are not stored on the item yet; see get_attachments().

        email_item = EmailMessage(externalId=message_id, subject=subject, dateSent=timestamp, content=content)

        for a in self.get_accounts(message, 'from'): email_item.add_edge('sender', a)
        for a in self.get_accounts(message, 'to'): email_item.add_edge('receiver', a)
        for a in self.get_accounts(message, 'reply-to'): email_item.add_edge('replyTo', a)

        if thread_id is not None:
            email_item.add_edge('messageChannel', MessageChannel(externalId=thread_id))

        return email_item

    def get_mails(self, mail_ids, batch_size=5, importer_run=None, verbose=True, pod_client=None):
        """Gets mails from a list of mail uids. You can pass an importer run and podclient
        to update the progress of the process

        Mails are fetched batch by batch and progress is reported after every
        batch. With a pod client, only items whose external id does not exist
        yet are created in the pod and returned; without one, all fetched items
        are returned.
        """
        mails = []
        n_batches = math.ceil(len(mail_ids) / batch_size)

        # NOTE(review): `batch` is assumed to yield consecutive chunks of at most
        # `n` ids, which is what the n_batches computation above relies on.
        for i, batch_ids in enumerate(batch(mail_ids, n=batch_size)):

            # Fetch only the uids of this batch (not the complete uid list again)
            for mail, thread_id in self.imap_client.get_mails(batch_ids):
                item = self.create_item_from_mail(mail, thread_id=thread_id)
                if pod_client is not None:
                    if not pod_client.external_id_exists(item):
                        pod_client.create(item)
                        mails.append(item)
                else:
                    mails.append(item)

            progress = (i + 1) / n_batches * 1.0
            self.update_progress(pod_client, importer_run, progress, total=len(mail_ids))

        return mails

    def run(self, importer_run, pod_client=None, verbose=True):
        """This is the main function of the Email importer. It runs the importer given information
        provided in the importer run. if you pass a pod client it will add the new items to the graph.

        At most `max_number` mails (generic attribute of the run, 10 when unset)
        are downloaded.
        """
        self.set_imap_client(importer_run)
        self.update_run_status(pod_client, importer_run, "running")

        stop_early_at = get_g_attr(importer_run, 'max_number', 'int', 10)

        self.update_progress_message(pod_client, importer_run, "downloading emails", verbose=verbose)
        mail_ids = self.imap_client.get_all_mail_uids()
        all_mails = self.get_mails(mail_ids[:int(stop_early_at)],
                                   importer_run=importer_run,
                                   pod_client=pod_client)

        # TODO: create better way to do this
        self.update_progress_message(pod_client, importer_run, "merging duplicate items", verbose=verbose)
        all_accounts = get_unique_accounts(all_mails)

        self.update_progress_message(pod_client, importer_run, "creating accounts", verbose=verbose)
        # The pod client is optional (see signature); only write when one was given
        if pod_client is not None:
            for item in all_accounts: pod_client.create(item)

        self.update_progress_message(pod_client, importer_run, "creating threads", verbose=verbose)
        if pod_client is not None:
            for email_item in all_mails: pod_client.create_edges(email_item.get_all_edges())

        print(f"Finished running {self}")

        self.update_run_status(pod_client, importer_run, "done")


The email importer has the following parameters

- **username** Your email address
- **password** Your email password. In case you're using gmail, use your application password
- _generic attributes_
- **host** The URL of the host (defaults to imap.gmail.com)
- **port** The port of the server (defaults to 993 for gmail)
- **max_number** Max number of emails to download (defaults to 10 when unset)

In [None]:
show_doc(EmailImporter.get_content)

<h4 id="EmailImporter.get_content" class="doc_header"><code>EmailImporter.get_content</code><a href="__main__.py#L44" class="source_link" style="float:right">[source]</a></h4>

> <code>EmailImporter.get_content</code>(**`message`**)

Extracts content from a python email message

In [None]:
show_doc(EmailImporter.create_item_from_mail)

<h4 id="EmailImporter.create_item_from_mail" class="doc_header"><code>EmailImporter.create_item_from_mail</code><a href="__main__.py#L67" class="source_link" style="float:right">[source]</a></h4>

> <code>EmailImporter.create_item_from_mail</code>(**`mail`**, **`thread_id`**=*`None`*)

Creates a schema-item from an existing mail

In [None]:
show_doc(EmailImporter.run)

<h4 id="EmailImporter.run" class="doc_header"><code>EmailImporter.run</code><a href="__main__.py#L110" class="source_link" style="float:right">[source]</a></h4>

> <code>EmailImporter.run</code>(**`importer_run`**, **`pod_client`**=*`None`*, **`verbose`**=*`True`*)

This is the main function of the Email importer. It runs the importer given information
provided in the importer run. if you pass a pod client it will add the new items to the graph.

## Usage

### Download all mails from your account

In [None]:
# hide
def get_importer_run(imap_user, imap_pw):
    """Builds an ImporterRun for the given credentials, configured for the
    default Gmail host, port 993 and a maximum of 10 mails."""
    run = ImporterRun.from_data(progress=0, username=imap_user, password=imap_pw)
    settings = [
        GenericAttribute(name='host', stringValue=DEFAULT_GMAIL_HOST),
        GenericAttribute(name='port', intValue=993),
        GenericAttribute(name='max_number', intValue=10),
    ]
    # The importer reads its settings from 'genericAttribute' edges of the run
    for setting in settings:
        run.add_edge('genericAttribute', setting)
    return run

In [None]:
# Pod client shared by the usage cells below
pod_client = PodClient()

In [None]:
# slow
# This cell is meant to be able to test the importer locally
def get_gmail_creds():
    # Returns the first two newline-separated entries of the credentials file;
    # they are unpacked below as (email address, app password).
    return read_file(HOME_DIR / '.memri' / 'credentials_gmail.txt').split("\n")[:2]

# Build an importer and a run that carries the credentials and settings,
# link them, and register the run in the pod.
imap_user, imap_pw = get_gmail_creds()
importer           = EmailImporter.from_data()
importer_run       = get_importer_run(imap_user, imap_pw)
importer_run.add_edge('importer', importer)
pod_client.create(importer_run)

# Runs the actual import against the live mail server
importer.run(importer_run=importer_run, pod_client=pod_client)

# The run must have completed fully and been marked as done
assert importer_run.progress == 1.0
assert importer_run.runStatus == "done"
# Clean up everything this test wrote to the pod
pod_client.delete_all()

Using, HOST: imap.gmail.com, PORT: 993
RUN STATUS: running
PROGRESS MESSAGE: downloading emails
PROGRESS: Importing 50.0% of 10 
PROGRESS: Importing 100.0% of 10 
PROGRESS MESSAGE: merging duplicate items
PROGRESS MESSAGE: creating accounts
PROGRESS MESSAGE: creating threads
Finished running EmailImporter (#None)
RUN STATUS: done


In [None]:
# hide
# TODO: Test incremental updates

### Parse emails

In [None]:
# Minimal raw mail: six headers followed by a one-line body
test = b"""\
Message-id: 1234\r
From: user1 <a@gmail.com>\r
To: user1 <b@gmail.com>\r
Reply-to: user1 <c@gmail.com>\r
Subject: the subject\r
Date: Mon, 04 May 2020 00:37:44 -0700\r

This is content"""

# Parsing needs no IMAP connection, so a bare importer is enough
email_importer = EmailImporter()
mail_item = email_importer.create_item_from_mail(test, 'message_channel_id')

# Every header must end up on the matching property or edge of the item
assert mail_item.externalId == '1234'
assert mail_item.sender[0].externalId == 'a@gmail.com'
assert mail_item.receiver[0].externalId == 'b@gmail.com'
assert mail_item.replyTo[0].externalId == 'c@gmail.com'
assert mail_item.subject == 'the subject'
assert mail_item.content == 'This is content'
assert mail_item.dateSent == email_importer.get_timestamp_from_message(email.message_from_bytes(test))
# The thread id passed in becomes the external id of the linked message channel
assert mail_item.messageChannel[0].externalId == 'message_channel_id'

### Attachments

In [None]:
# Test attachment parsing (basic support)
email_importer = EmailImporter()
# Build a message with a text body and two binary attachments
message = email.message.EmailMessage()
message.set_content('aa')
message.add_attachment(b'bb', maintype='image', subtype='jpeg', filename='sample.jpg')
message.add_attachment(b'cc', maintype='image', subtype='jpeg', filename='sample2.jpg')
content = email_importer.get_content(message)
attachments = email_importer.get_attachments(message)

# The body is returned as content; the attachments keep their order and bytes
assert content == 'aa\n'
assert attachments[0].get_content() == b'bb'
assert attachments[1].get_content() == b'cc'

In [None]:
# hide
### Calling the importer from the pod

In [None]:
# hide
#importer

In [None]:
# hide
# slow

# This cell is meant to be able to call the importer locally (simulating the front-end)

# pod_client = PodClient(url='http://0.0.0.0:3030')
# pod_client.create(importer_run)
# pod_client.create(importer)
# pod_client.create(host_item)
# pod_client.create(port_item)
# pod_client.create(max_number_item)
# pod_client.create_edges(importer_run.get_all_edges())

# json = {
#             'databaseKey':pod_client.database_key,
#             'payload':{
#                  'uid':importer_run.uid,
#                  'servicePayload': {
#                      'databaseKey': pod_client.database_key,
#                      'ownerKey': pod_client.owner_key
#                  }
#             }
#        }
# print(importer_run.uid)
# print(requests.post(f'http://0.0.0.0:3030/v2/{pod_client.owner_key}/run_importer',
#                    json=json).content)

In [None]:
# hide
from nbdev.export import *
# Export the notebooks to Python modules (see the "Converted ..." output below)
notebook2script()

Converted basic.ipynb.
Converted importers.EmailImporter.ipynb.
Converted importers.Importer.ipynb.
Converted importers.util.ipynb.
Converted index.ipynb.
Converted indexers.FaceClusteringIndexer.Models.ipynb.
Converted indexers.FaceClusteringIndexer.Utils.ipynb.
Converted indexers.FaceClusteringIndexer.indexer.ipynb.
Converted indexers.FaceRecognitionModel.ipynb.
Converted indexers.FacerecognitionIndexer.Photo.ipynb.
Converted indexers.GeoIndexer.ipynb.
Converted indexers.NoteListIndexer.NoteList.ipynb.
Converted indexers.NoteListIndexer.Parser.ipynb.
Converted indexers.NoteListIndexer.ipynb.
Converted indexers.NoteListIndexer.util.ipynb.
Converted indexers.indexer.ipynb.
Converted itembase.ipynb.
Converted pod.client.ipynb.
