In [None]:
import imaplib, email
from integrators.data.schema import Account, EmailMessage

class IMAPClient():
    """Small helper around an ``imaplib.IMAP4_SSL`` connection.

    On construction the client connects, logs in and selects the
    ``"[Gmail]/All Mail"`` mailbox; all UID-based methods operate on that
    selected mailbox.
    """

    def __init__(self, username, app_pw, address='imap.gmail.com'):
        """Connect to ``address`` over SSL, log in and select "All Mail".

        Args:
            username: Login name passed to ``IMAP4_SSL.login``.
            app_pw: Password passed to ``IMAP4_SSL.login``.
            address: IMAP host to connect to.
        """
        # Honour the caller-supplied host instead of a hard-coded one.
        self.client = imaplib.IMAP4_SSL(address)
        self.client.login(username, app_pw)
        self.client.select('"[Gmail]/All Mail"')

    def list_mailboxes(self):
        """Return the raw ``(status, data)`` result of the IMAP LIST command."""
        return self.client.list()

    def get_all_mail_uids(self):
        """Return the UIDs of every message in the selected mailbox.

        Returns:
            The first item of the ``UID SEARCH ALL`` response, split on
            whitespace (a list of ``bytes`` UIDs).
        """
        # UID SEARCH returns stable UIDs rather than sequence numbers.
        result, data = self.client.uid('search', None, "ALL")
        return data[0].split()

    def get_mail(self, uid):
        """Fetch one message by UID and return its raw RFC822 payload."""
        result, data = self.client.uid('fetch', uid, '(RFC822)')
        # data[0] is an (envelope, body) pair; the body is the raw message.
        return data[0][1]

    def get_all_mails(self, uids):
        """Fetch every UID in ``uids`` and return the raw messages in order."""
        # Plain iteration: tqdm is not imported anywhere in this file, so
        # wrapping the loop in it raised NameError.
        return [self.get_mail(uid) for uid in uids]

    @staticmethod
    def part_to_str(part):
        """Decode a leaf MIME part to text.

        The transfer encoding is undone via ``get_payload(decode=True)`` and
        the bytes are decoded with the part's declared charset, defaulting to
        ISO-8859-1. Undecodable bytes are replaced rather than raising.
        """
        bytes_ = part.get_payload(decode=True)
        if bytes_ is None:
            # get_payload(decode=True) yields None for multipart containers.
            return ""
        charset = part.get_content_charset('iso-8859-1')
        try:
            return bytes_.decode(charset, 'replace')
        except LookupError:
            # The declared charset is unknown to Python; fall back to the
            # same default used when no charset is declared at all.
            return bytes_.decode('iso-8859-1', 'replace')

    def get_html(self, email_message_instance):
        """Return the HTML body of a message, or a best-effort fallback.

        * multipart: the first ``text/html`` leaf part, decoded; a warning is
          printed if there are several. Without any HTML part, the payload of
          the first leaf part is returned as-is (not transfer-decoded).
        * text: the message payload as-is (not transfer-decoded).
        * anything else, or a multipart message without leaf parts: ``None``.
        """
        maintype = email_message_instance.get_content_maintype()
        if maintype == 'multipart':
            parts = self._get_all_parts(email_message_instance)
            html_parts = [self.part_to_str(part) for part in parts
                          if part.get_content_type() == "text/html"]
            if html_parts:
                if len(html_parts) > 1:
                    error_msg = "\n AND \n".join(html_parts)
                    print(f"WARNING: FOUND MULTIPLE HTML PARTS IN ONE MESSAGE {error_msg}")
                return html_parts[0]
            if not parts:
                # An empty multipart container has nothing to fall back to.
                return None
            return parts[0].get_payload()

        if maintype == 'text':
            return email_message_instance.get_payload()

        return None

    def _get_all_parts(self, part):
        """Recursively flatten ``part`` into the list of its leaf parts."""
        payload = part.get_payload()
        if isinstance(payload, list):
            return [x for p in payload for x in self._get_all_parts(p)]
        return [part]

    def get_x_gm_thrid(self, uid):
        """Fetch ``X-GM-THRID``/``X-GM-MSGID`` for ``uid`` and return one token.

        Returns the third space-separated token of the first response item;
        presumably that is the X-GM-THRID value (depends on the server's
        response layout -- TODO confirm).
        """
        result, data = self.client.uid('fetch', uid, '(X-GM-THRID X-GM-MSGID)')
        return data[0].decode("utf-8").split(" ")[2]

    def decode_header(self, header):
        """Decode an RFC 2047 encoded header value into a plain ``str``."""
        # Import the submodule explicitly: a bare ``import email`` does not
        # guarantee that ``email.header`` has been loaded.
        from email.header import decode_header, make_header
        return str(make_header(decode_header(header)))

In [None]:
# from tqdm import tqdm
from pathlib import Path
import email, time, json, os, argparse


# Default output directory: "data" under the current working directory.
OUT_DIR = Path.cwd() / "data"

def write_json(obj, fname):
    """Serialize ``obj`` as JSON into the file ``fname``, overwriting it."""
    with open(fname, mode='w') as sink:
        json.dump(obj, fp=sink)

def write_object(obj, out_dir=OUT_DIR):
    """Write ``obj`` as JSON to ``<out_dir>/<gmailUid>.json``.

    The file name is taken from ``obj["gmailUid"]``.
    """
    target = out_dir / (obj["gmailUid"] + ".json")
    write_json(obj, target)

def get_env_var(name):
    """Return the value of environment variable ``name``.

    Raises:
        ValueError: If the variable is not set. The message shows the shell
            command needed to set it.
    """
    res = os.environ.get(name)
    if res is None:
        # Shell builtins are case-sensitive: the hint must say `export`.
        raise ValueError(
            f'ENVIRONMENT VARIABLE {name} not provided. Set by:\n'
            f'export {name}="examplevalue"'
        )
    return res

def run_gmail_downloader(username, app_pw, n=None, out_dir=None):
    """Download mails over IMAP and store each one as ``<uid>.json``.

    Mails whose output file already exists are skipped, as are mails whose
    raw bytes cannot be decoded as UTF-8.

    Args:
        username: IMAP login name.
        app_pw: IMAP password.
        n: If given, stop once the UID at index ``n`` is reached.
        out_dir: Target directory; defaults to ``OUT_DIR``. It is created if
            it does not exist.

    Returns:
        The list of dicts written during this run (skipped mails excluded).

    Raises:
        TypeError, ValueError: If a mail has a missing or unparseable
            ``Date`` header (which of the two depends on the Python version).
    """
    from datetime import timezone
    from email.utils import parseaddr, parsedate_to_datetime

    # Fall back to the module default so `out_dir / fname` never sees None,
    # and make sure the directory exists before the first write.
    out_dir = Path(out_dir) if out_dir is not None else OUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)

    imap_client = IMAPClient(username=username, app_pw=app_pw)
    uids = imap_client.get_all_mail_uids()

    objects = []

    # Plain iteration and print(): tqdm is not imported in this file.
    for i, uid in enumerate(uids):
        if n is not None and i >= n:
            print(f"stopped early at {n}")
            break

        uid_str = uid.decode("utf-8")
        out_path = out_dir / (uid_str + ".json")
        if out_path.exists():
            print(f"skipping uid {uid}, already exists")
            continue

        m = imap_client.get_mail(uid)
        try:
            raw = m.decode("utf-8")
        except (UnicodeDecodeError, AttributeError):
            print(f"Skipping email {uid}, not utf-8 encoded")
            continue

        message = email.message_from_string(raw)

        # A missing Message-ID header yields None; treat it as empty.
        message_id = (message["message-id"] or "").replace("/", "")
        subject = message["subject"]

        dt = parsedate_to_datetime(message["date"])
        if dt.tzinfo is None:
            # A "-0000" zone gives a naive datetime that represents UTC;
            # without this, timestamp() would apply the local timezone.
            dt = dt.replace(tzinfo=timezone.utc)
        timestamp = int(dt.timestamp() * 1000)

        _, from_mail = parseaddr(message["from"])
        _, to_mail = parseaddr(message["to"])
        _, reply_to_mail = parseaddr(message["reply-to"])

        content = imap_client.get_html(message)
        if content is not None:
            # get_html returns None for non-text, non-multipart messages.
            content = content.replace("=3D", "=")

        thread_id = imap_client.get_x_gm_thrid(uid)

        obj = {
            "gmailUid": uid_str,
            "externalId": message_id,
            "subject": subject,
            "date": timestamp,
            "fromAddresses": [from_mail] if from_mail != "" else [],
            "toAddresses": [to_mail] if to_mail != "" else [],
            "replyToAddresses": [reply_to_mail] if reply_to_mail != "" else [],
            "content": content,
            "importJson": raw,
            "xGmThrid": thread_id,
        }

        objects.append(obj)
        write_object(obj, out_dir)
    return objects



In [None]:
# Store your credentials in this file (line 1: username, line 2: password):
with open('tmp/credentials_gmail.txt', 'r') as file:
    # readline() keeps the trailing newline; strip it so the login call
    # receives the bare values. The `with` block closes the file afterwards.
    imap_user = file.readline().strip()
    imap_pw = file.readline().rstrip('\r\n')
imap_host = 'imap.gmail.com'

imap_client = IMAPClient(username=imap_user, app_pw=imap_pw, address=imap_host)
gmail_ids = imap_client.get_all_mail_uids()

all_mails = []
stop_at = 10

def get_timestamp_from_message(message):
    """Return the message's ``Date`` header as epoch milliseconds.

    Args:
        message: A parsed message supporting ``message["date"]``.

    Returns:
        Integer milliseconds since the Unix epoch.

    Raises:
        TypeError, ValueError: If the header is missing or cannot be parsed
            (which of the two depends on the Python version).
    """
    from datetime import timezone
    from email.utils import parsedate_to_datetime

    dt = parsedate_to_datetime(message["date"])
    if dt.tzinfo is None:
        # A "-0000" zone gives a naive datetime that represents UTC; without
        # this, timestamp() would interpret it in the local timezone.
        dt = dt.replace(tzinfo=timezone.utc)
    return int(dt.timestamp() * 1000)

def create_item_from_mail(mail_utf8, gmail_uid=None):
    """Build an ``EmailMessage`` item with account edges from raw mail text.

    Uses the module-level ``imap_client`` to extract the body and to look up
    the thread id.

    Args:
        mail_utf8: The raw message as a ``str``.
        gmail_uid: UID of the message, used for the thread-id lookup. When
            omitted, the module-level ``gmail_id`` is used, which is how
            existing callers pass it.

    Returns:
        The created ``EmailMessage`` with ``sender``/``receiver``/``replyTo``
        edges for every non-empty address.
    """
    if gmail_uid is None:
        # Backward compatibility: callers that pass only the mail text rely
        # on the global loop variable identifying the current message.
        gmail_uid = gmail_id

    message = email.message_from_string(mail_utf8)

    # A missing Message-ID header yields None; treat it as empty.
    message_id = (message["message-id"] or "").replace("/", "")
    subject = message["subject"]
    timestamp = get_timestamp_from_message(message)

    _, from_mail = email.utils.parseaddr(message["from"])
    _, to_mail = email.utils.parseaddr(message["to"])
    _, reply_to_mail = email.utils.parseaddr(message["reply-to"])

    content = imap_client.get_html(message)
    if content is not None:
        # get_html returns None for non-text, non-multipart messages.
        content = content.replace("=3D", "=")

    thread_id = imap_client.get_x_gm_thrid(gmail_uid)

    # NOTE(review): this metadata is gathered but not yet attached to the
    # item; only `content` is passed to EmailMessage below. Confirm which
    # fields the schema accepts before wiring it in.
    metadata = {"externalId": message_id, "subject": subject, "date": timestamp,
                "xGmThrid": thread_id}
    email_item = EmailMessage(content=content)

    # Create edges to accounts, skipping headers that had no address.
    edges = (("sender", from_mail), ("receiver", to_mail), ("replyTo", reply_to_mail))
    for edge_name, address in edges:
        if address != "":
            email_item.add_edge(edge_name, Account(externalId=address))

    print(f'{email_item}')
    return email_item
    
for i, gmail_id in enumerate(gmail_ids):
    if stop_at is not None and i >= stop_at:
        print(f"stopped early at {stop_at}")
        break

    # Keep this name: create_item_from_mail reads the module-level
    # `gmail_id` to look up the thread id of the current message.
    gmail_id = gmail_id.decode("utf-8")

    mail = imap_client.get_mail(gmail_id)
    try:
        mail_utf8 = mail.decode("utf-8")
    except (UnicodeDecodeError, AttributeError):
        # Report the UID being processed; the name used here before was
        # undefined, so the skip path itself raised NameError.
        print(f"Skipping email {gmail_id}, not utf-8 encoded")
        continue

    item = create_item_from_mail(mail_utf8)
    all_mails.append(item)

# All emails are now created, upload them to the pod here
for email_item in all_mails:
    print('-----')
    for account in email_item.receiver:
        print(f'{account.externalId}')
        
# All emails are now in the pod, create their thread edges here:
