In [16]:
#| default_exp collect_emails

# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
#| export

import os
from ibmcloudant.cloudant_v1 import CloudantV1
from ibm_cloud_sdk_core.authenticators import BasicAuthenticator
from dataclasses import asdict
from dotenv import load_dotenv
import sys
import base64
import json
import nltk
from nltk.tokenize import sent_tokenize
import unicodedata
from bs4 import BeautifulSoup
import re
from typing import List, Optional

sys.path.append("..")  # Adds the parent directory to sys path

from data_types.email import Email, MessageType, json_encoder

# Load variables from a local .env file into the process environment; the
# COUCHDB_* settings read via os.getenv below are expected to come from there.
load_dotenv()

# Download the Punkt tokenizer data that nltk's sent_tokenize relies on.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /Users/igor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/igor/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
#| export

# Connection settings are read from the environment (COUCHDB_DBNAME,
# COUCHDB_USER, COUCHDB_PASSWORD, COUCHDB_URL).
DB_NAME = os.getenv("COUCHDB_DBNAME")
auth = BasicAuthenticator(os.getenv("COUCHDB_USER"), os.getenv("COUCHDB_PASSWORD"))
client = CloudantV1(authenticator=auth)
client.set_service_url(os.getenv("COUCHDB_URL"))
# NOTE(review): TLS certificate verification is switched off for every request
# made through this client; confirm this is intended outside local development.
client.set_disable_ssl_verification(True)

# db_info is not read anywhere else in the visible notebook; presumably this
# call is a connectivity/credentials check — TODO confirm. Because the cell is
# exported, the request also runs when the generated module is imported.
db_info = client.get_database_information(db=DB_NAME)

In [19]:
#| export

def split_into_sentences(text):
    """
    Split a block of text into sentences.

    The work is delegated to nltk's `sent_tokenize`; its result (a list of
    sentence strings) is returned as-is.
    """
    return sent_tokenize(text)

def cleanup_sentence(sentence: Optional[str]) -> Optional[str]:
    """
    Normalize a sentence to single-spaced, ASCII-only text.

    Whitespace runs (newlines, tabs, Unicode spaces, ...) are collapsed to a
    single space, every non-ASCII character is dropped, and the result is
    stripped of leading/trailing whitespace.

    Args:
        sentence: The text to clean, or None.

    Returns:
        The cleaned text, or None when `sentence` is None.
    """
    if sentence is None:
        return None
    # Collapse first so that Unicode spaces (e.g. NBSP) become a plain space
    # instead of being removed by the ASCII filter below.
    collapsed = re.sub(r'\s+', ' ', sentence)
    ascii_only = collapsed.encode('ascii', 'ignore').decode()
    # Dropping a non-ASCII token that sat between two spaces leaves a double
    # space behind, so collapse once more before trimming.
    return re.sub(r' {2,}', ' ', ascii_only).strip()

def message_to_sentences(message_type: MessageType, message: Optional[str]) -> List[str]:
    """
    Convert an email body into a list of cleaned-up sentences.

    For HTML bodies every <a> and <img> element is removed (together with
    its contents) before the remaining text is extracted. The text is then
    NFD-normalized, split with `split_into_sentences`, and each sentence is
    passed through `cleanup_sentence`.

    Args:
        message_type: MessageType.HTML selects the HTML path; any other
            value is treated as plain text.
        message: The raw body, or None.

    Returns:
        The cleaned sentences; an empty list when `message` is None.
    """
    if message is None:
        return []

    if message_type == MessageType.HTML:
        soup = BeautifulSoup(message, 'html.parser')

        # remove all links (the tag and everything inside it)
        for a_tag in soup.find_all("a"):
            a_tag.decompose()

        # remove all images
        for img_tag in soup.find_all("img"):
            img_tag.decompose()

        raw_text = soup.get_text()
    else:
        raw_text = message

    # NFD splits accented letters into base letter + combining mark, so the
    # ASCII filter in cleanup_sentence keeps the base letter.
    text = unicodedata.normalize('NFD', raw_text)
    return [cleanup_sentence(sentence) for sentence in split_into_sentences(text)]

In [23]:
#| export

def create_query(folder:str):
    """
    Build the find-query pieces for listing the emails of one folder.

    Returns a dict with three entries:
      - "selector": documents whose "folder" equals `folder` and whose
        "created" value is greater than 0
      - "sort": "folder" descending, then "created" descending
      - "use_index": the name of the index to use
    """
    selector = {
        "folder": {"$eq": f"{folder}"},
        "created": {"$gt": 0},
    }
    sort_order = [
        {"folder": "desc"},
        {"created": "desc"},
    ]
    return {
        "selector": selector,
        "sort": sort_order,
        "use_index": "client-folder-created-desc-index",
    }

def get_decoded_email_data(email, key) -> Optional[str]:
    """
    Decode the JSON payload carried by an email document and return one field.

    The payload is read from email["didCommMessage"]["plainBodyBase64"],
    base64-decoded, interpreted as UTF-8 and parsed as JSON.

    Args:
        email: Email document (dict-like).
        key: Name of the field to read from the decoded payload.

    Returns:
        The value stored under `key` (whatever JSON type it has), or None
        when the document has no "didCommMessage", no "plainBodyBase64"
        payload, or the payload does not contain `key`.

    Raises:
        binascii.Error, UnicodeDecodeError, json.JSONDecodeError: if a
            payload is present but malformed.
    """
    did_comm = email.get("didCommMessage")
    if not did_comm:
        return None

    encoded_body = did_comm.get("plainBodyBase64")
    # A missing or empty payload would otherwise reach json.loads("") and fail
    # with JSONDecodeError; treat it like a document without a body instead.
    if not encoded_body:
        return None

    plain_body = base64.b64decode(encoded_body).decode("utf-8")
    email_data = json.loads(plain_body)
    return email_data.get(key)

def extract_html(email):
    """
    Return the "bodyHtml" field of the email's decoded payload.
    The result is None when the email carries no HTML body.
    """
    html_body = get_decoded_email_data(email, "bodyHtml")
    return html_body

def extract_text(email):
    """
    Return the "bodyText" field of the email's decoded payload.
    The result is None when the email carries no plain-text body.
    """
    text_body = get_decoded_email_data(email, "bodyText")
    return text_body

def extract_subject(email):
    """
    Return the "subject" field of the email's decoded payload.
    The result is None when the email carries no subject.
    """
    subject = get_decoded_email_data(email, "subject")
    return subject

def extract_sender(email):
    """
    Extract the sender's display name and address from the email.

    Returns:
        A (name, address) tuple read from the "Name" and "Address" keys of
        the payload's "from" field. Either element is None when its key is
        absent; (None, None) is returned when there is no "from" mapping.
    """
    from_sender = get_decoded_email_data(email, "from")
    # Guard on dict: a truthy non-dict value (e.g. a bare string) has no .get().
    if not isinstance(from_sender, dict):
        return None, None
    return from_sender.get("Name"), from_sender.get("Address")

def extract_message_id(email):
    """
    Return the document's "_id" value, used as the message ID.
    The result is None when the document has no "_id".
    """
    message_id = email.get("_id")
    return message_id

def extract_folder(email):
    """
    Return the document's "folder" value.
    The result is None when the document has no "folder".
    """
    return email.get("folder")

def extract_created(email):
    """
    Return the document's "created" timestamp.
    The result is None when the document has no "created" value.
    """
    return email.get("created")

def extract_message_type(email):
    """
    Return the "type" of the document's "didCommMessage" entry.
    The result is None when that entry is missing/empty or has no "type".
    """
    envelope = email.get("didCommMessage")
    return envelope.get("type") if envelope else None

def _doc_to_email(doc):
    """
    Convert one database document into an Email.
    Returns None for documents that are not SMTP emails (e.g. encrypted ones).
    """
    # skip encrypted emails, keep only SMTP emails
    if extract_message_type(doc) != "application/mailio-smtp+json":
        return None

    # Prefer the HTML body and fall back to the plain-text one.
    message = extract_html(doc)
    message_type = MessageType.HTML
    if message is None:
        message = extract_text(doc)
        message_type = MessageType.TEXT

    subject = extract_subject(doc)
    if isinstance(subject, list):
        # A list-valued subject is joined with "."; blank parts are dropped.
        subject = ".".join(part for part in subject if part.strip())

    message_id = extract_message_id(doc)
    if isinstance(message_id, list):
        raise ValueError(f"Message ID is a list: {message_id}")

    sender_name, sender_email = extract_sender(doc)
    return Email(
        message_type=message_type,
        sentences=message_to_sentences(message_type, message),
        subject=subject,
        sender_name=sender_name,
        sender_email=sender_email,
        message_id=message_id,
        folder=extract_folder(doc),
        created=extract_created(doc),
    )


def list_emails(folder:str, bookmark="", limit=10):
    """
    Fetch the next page of SMTP emails for `folder` from the database.

    Args:
        folder: Folder whose emails are listed.
        bookmark: Pagination bookmark returned by a previous call
            ("" starts from the beginning).
        limit: Maximum number of documents requested per database query.

    Yields:
        Exactly one (emails, bookmark) tuple: the Email objects built from
        the fetched documents and the bookmark to pass to the next call.
        `emails` is empty only when the database returned no more documents.
        Pages whose documents are all skipped (non-SMTP) are stepped over,
        so a caller that stops on an empty list does not stop early.

    Raises:
        ValueError: if a document's message ID is a list.
    """
    query = create_query(folder)
    emails = []
    while True:
        page = client.post_find(
            db=DB_NAME,
            selector=query["selector"],
            sort=query["sort"],
            use_index=query["use_index"],
            bookmark=bookmark,
            limit=limit,
        )
        docs = page.result["docs"]
        next_bookmark = page.result["bookmark"]

        for doc in docs:
            email = _doc_to_email(doc)
            if email is not None:
                emails.append(email)

        # Stop on an empty page, or if the bookmark did not advance (guards
        # against looping forever on the same page).
        exhausted = not docs or next_bookmark == bookmark
        bookmark = next_bookmark
        if emails or exhausted:
            break

    yield emails, bookmark

In [24]:
#| export

def save_emails_to_jsonl(file_path, folder, limit):
    """
    Save the emails of `folder` to a newline-delimited JSON file.

    Pages are requested from `list_emails` (with `limit` as the page size)
    until a page comes back empty or without a bookmark. Each email is
    written to `file_path` as one JSON object per line; the file is
    overwritten. The number of written emails is printed at the end.
    """
    processed = 0
    bm = ""
    with open(file_path, "w") as out:
        keep_going = True
        while keep_going:
            for batch, next_bm in list_emails(folder, bm, limit=limit):
                if not batch:
                    # An empty batch ends the export.
                    bm = None
                    break

                for item in batch:
                    out.write(json.dumps(asdict(item), default=json_encoder) + "\n")
                    processed += 1

                bm = next_bm

            keep_going = bool(bm)

    print(f"Processed {processed} emails for folder {folder}")

In [27]:
# Export the "goodreads" folder, requesting up to 500 documents per page.
save_emails_to_jsonl("../data/emails_goodreads.jsonl", "goodreads", 500)

Processed 4024 emails for folder goodreads


In [28]:
#| hide

# Write the cells marked `#| export` to the module named by `#| default_exp`.
import nbdev; nbdev.nbdev_export()