# Get CSV from suspicious mailbox

Use this Python notebook to create a CSV file from the data in the suspicious mailbox.

To label your data, create folders named after the labels you want to use in your Suspicious mailbox. Then, sort your emails into these folders.

## Import libraries

In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
import imaplib
import email
from email import policy
from bs4 import BeautifulSoup
import chardet

## Set parameters

Set the parameters for the .csv dataset created from the suspicious mailbox.

- **FOLDERS**: List of folder names in your Suspicious mailbox to use as labels. These should match the labels you want to use for the classification.
- **OUTPUT_CSV**: The name of the output CSV file.
- **EMAIL_SERVER**: The IMAP server address of your email provider.
- **EMAIL_PORT**: The IMAP server port of your email provider.
- **EMAIL_USERNAME**: The username for your email account.
- **EMAIL_PASSWORD**: The password for your email account.
- **EMAIL_INBOX**: The name of the inbox folder to read emails from.

In [None]:
# Folder names in the Suspicious mailbox; each name is also used as the label
# written to the CSV for every email found in that folder.
FOLDERS = [

]
# Path of the CSV file to write.
OUTPUT_CSV = ''

# IMAP connection settings.
EMAIL_SERVER = ''
# 993 is the standard IMAP-over-SSL port and the default of imaplib.IMAP4_SSL;
# change it only if your provider uses a different port.
EMAIL_PORT = 993
EMAIL_USERNAME = ''
# NOTE: do not commit this notebook with a real password filled in.
EMAIL_PASSWORD = ''
EMAIL_INBOX = ''

## Utils

In [None]:
def get_header_dict_list(msg):
    """Group the headers of *msg* by header name.

    Returns a ``defaultdict(list)`` that maps each header name, exactly as
    yielded by ``msg.items()``, to the list of its values in order of
    appearance, so a header that occurs several times keeps every value.
    """
    grouped = defaultdict(list)
    for name, content in msg.items():
        values_for_name = grouped[name]
        values_for_name.append(content)
    return grouped

## Fetch mails

In [None]:
# Empty table that receives one row per extracted email:
# "body" holds the text of the email, "label" the folder it was found in.
df_mailbox = pd.DataFrame(
    columns=[
        "body",
        "label",
    ],
)

In [None]:
def _quote_mailbox(name):
    """Return *name* in a form that is safe to pass to imaplib as a mailbox.

    imaplib sends mailbox arguments verbatim, so a name containing whitespace
    must be wrapped in double quotes or the server reads it as several
    arguments. Names the user already quoted are returned unchanged.
    """
    if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
        return name
    if any(ch.isspace() for ch in name):
        escaped = name.replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'
    return name


def _decode_part(part):
    """Return the text of a MIME *part*, or None when it has no payload.

    The character set is guessed with chardet; if the guess is unusable the
    bytes are decoded as UTF-8 with replacement characters instead.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return None
    if not isinstance(raw, bytes):
        return str(raw)

    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    try:
        return raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # LookupError: chardet may name a codec Python does not know.
        return raw.decode('utf-8', errors='replace')


def _extract_body(message):
    """Return the plain-text body of *message*.

    The text/plain body is preferred; when it is missing or empty, the
    text/html body is converted to text with BeautifulSoup. Returns None (or
    an empty string) when the message has no usable body.
    """
    body_plain = None

    # Compare with None explicitly: a message object is falsy when it has no
    # headers, which is legal for a MIME body part.
    part_plain = message.get_body(preferencelist=('plain',))
    if part_plain is not None:
        body_plain = _decode_part(part_plain)
    if body_plain:
        return body_plain

    part_html = message.get_body(preferencelist=('html',))
    if part_html is not None:
        body_html = _decode_part(part_html)
        if body_html:
            return BeautifulSoup(body_html, 'html.parser').get_text()

    return body_plain


def _iter_attached_messages(msg):
    """Yield every email attached to *msg* as a message/rfc822 attachment."""
    for part in msg.iter_attachments():
        if part.get_content_type() != 'message/rfc822':
            continue
        payload = part.get_payload()
        if not isinstance(payload, list):
            continue
        for item in payload:
            if isinstance(item, email.message.EmailMessage):
                yield item


# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating inside the loop copies the whole table for every email.
rows = []

with imaplib.IMAP4_SSL(EMAIL_SERVER, EMAIL_PORT) as imap_server:
    imap_server.login(EMAIL_USERNAME, EMAIL_PASSWORD)
    # Mailboxes are opened read-only so that exporting the dataset does not
    # mark the messages as seen.
    imap_server.select(_quote_mailbox(EMAIL_INBOX), readonly=True)

    print('Connected to IMAP server')

    for folder in FOLDERS:
        status, _ = imap_server.select(_quote_mailbox(folder), readonly=True)
        if status != "OK":
            print(f"Failed to open folder: {folder}")
            continue

        print(f"\nFolder: {folder}")

        # Fetch email IDs
        status, search_data = imap_server.search(None, 'ALL')
        if status != "OK":
            print(f"Failed to search folder: {folder}")
            continue

        email_ids = search_data[0].split()
        if not email_ids:
            print("No emails found.")
            continue

        for email_id in email_ids:
            print(f"Email ID: {email_id}")
            status, msg_data = imap_server.fetch(email_id, '(RFC822)')
            # A successful fetch yields a (header, raw bytes) tuple first; a
            # missing message yields None instead.
            if status != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
                print(f"Failed to fetch email: {email_id}")
                continue

            msg = email.message_from_bytes(msg_data[0][1], policy=policy.default)

            # The reported emails are the .eml files attached to each message
            # in the mailbox, not the carrier messages themselves.
            for attached in _iter_attached_messages(msg):
                rows.append({
                    "body": _extract_body(attached),
                    "label": folder,
                })

if rows:
    fetched = pd.DataFrame(rows, columns=["body", "label"])
    # Skip the concat while df_mailbox is still empty: concatenating with an
    # empty frame is deprecated in recent pandas versions.
    if df_mailbox.empty:
        df_mailbox = fetched
    else:
        df_mailbox = pd.concat([df_mailbox, fetched], ignore_index=True)

In [None]:
# Number of collected emails per label, drawn as a bar chart.
label_counts = df_mailbox['label'].value_counts()
label_counts.plot(kind='bar', title='Email Labels Distribution')

In [None]:
# Write the labelled emails to OUTPUT_CSV without the DataFrame index column.
df_mailbox.to_csv(path_or_buf=OUTPUT_CSV, index=False)