In [68]:
import pandas as pd
import numpy as np
import glob
import email.parser
import os
import math

def remove_wacka(s):
    """Return ``s`` with every angle bracket ('<' and '>') removed."""
    for bracket in ('<', '>'):
        s = s.replace(bracket, '')
    return s

def generate_emails_df(pattern='../../raw_emails/*.mail'):
    """Parse raw e-mail files into a DataFrame sorted by the ``Date`` header.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern selecting the message files to read. Defaults to the
        location this function has always read from.

    Returns
    -------
    pandas.DataFrame
        One row per file, columns in alphabetical order: ``Body``, ``Date``,
        ``From``, ``From email``, ``In-Reply-To``, ``Message-ID``,
        ``Subject``, ``To``, ``To email``. ``Message-ID`` / ``In-Reply-To``
        hold the part of the header before the ``@`` with angle brackets
        removed (``In-Reply-To`` is NaN when the header is absent) and
        ``To email`` is a list of bare addresses. Rows are ordered by
        ``Date`` with a fresh 0..n-1 index. When no file matches, an empty
        frame with the same columns is returned.

    Raises
    ------
    AttributeError
        If a message has no ``Message-ID`` header.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    from email.utils import getaddresses, parseaddr

    # Alphabetical, matching the column order the previous
    # ``df.append(..., sort=True)`` implementation produced.
    columns = [
        'Body', 'Date', 'From', 'From email', 'In-Reply-To',
        'Message-ID', 'Subject', 'To', 'To email',
    ]

    # Collect plain dicts and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    for f in glob.glob(pattern):
        with open(f) as payload:
            full_content = email.parser.Parser().parse(payload)

        in_reply_to = full_content['In-Reply-To']
        e_from = full_content['From']
        e_to = full_content['To']

        rows.append({
            'Message-ID': remove_wacka(full_content['Message-ID'].split('@')[0]),
            # np.nan (not the np.NaN alias, which NumPy 2.0 removed).
            'In-Reply-To': (
                remove_wacka(in_reply_to.split('@')[0]) if in_reply_to else np.nan),
            'Date': pd.to_datetime(full_content['Date']),
            'From': e_from,
            # parseaddr/getaddresses cope with bare addresses (no angle
            # brackets), quoted display names containing commas, and
            # missing headers, all of which broke the manual split('<').
            'From email': parseaddr(e_from or '')[1],
            'To': e_to,
            'To email': [addr for _, addr in getaddresses([e_to or '']) if addr],
            'Subject': full_content['Subject'],
            'Body': full_content.get_payload(),
        })

    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(['Date']).reset_index(drop=True)

# Intended grouping of the sample emails described below (by Message-ID), one group per line:
# 1, 2, 3, 6, 7, 8
# 4, 9, 10, 11
# 12, 13, 14
# 5, 15, 16
# 17

# Meeting #1 conversation
# 1 - to 2 recipients
# 2 - reply 1
# 3 - reply 1
# 4 - separate email about same meeting - new sender
# 5 - spam
# 6 - reply 2, 1 random other person - excited
# 7 - reply 6, 2, 1 meeting location
# 8 - reply 7, 6, 2, 1 follow-up from meeting
#---
# 9 - separate email about same meeting - follow-up from random other person
# 10 - reply 9 - accidentally removed from thread, response
# 11 - Fw - follow-up from promise in 10

# Having some issues conversation
# 12
# 13 - reply 12
# 14 - re-re reply 13, 12
# 15 - spam
# 16 - spam

# Meeting #2 conversation
# 17

def conversationalize(df):
    """Flag spam and conversation starters on ``df`` and group messages.

    Two boolean columns are added to ``df`` in place:

    * ``Spam``    - the sender address contains ``'promo'`` (case-insensitive).
    * ``Starter`` - the subject contains neither ``'re:'`` nor ``'fw:'``
      (case-insensitive) and ``In-Reply-To`` is missing.

    Every starter then opens exactly one conversation. In row order, a
    message joins a starter's conversation when it either

    * names the starter in ``In-Reply-To`` (direct reply), or
    * looks like a follow-up: its subject contains the starter's subject and
      the substring ``'re'`` or ``'fw'``, the starter's sender appears in its
      ``To email`` or ``From email``, it is not itself a starter, and its
      numeric ``Message-ID`` is greater than the starter's.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Message-ID``, ``In-Reply-To``, ``Subject``,
        ``From email`` and ``To email``. ``Message-ID`` values must be
        convertible with ``int()`` for the follow-up heuristic.

    Returns
    -------
    list of list
        One list of ``Message-ID`` values per starter, starter first, in the
        order the starters appear in ``df``. The list is also printed.
    """
    # identify spam
    df['Spam'] = df['From email'].map(lambda addr: 'promo' in addr.lower())

    # identify starts of conversations
    def _starts_conversation(row):
        subject = row['Subject'].lower()
        if 're:' in subject or 'fw:' in subject:
            return False
        # pd.isnull handles NaN/None without the float() round-trip, which
        # raised ValueError for non-numeric reply ids.
        return bool(pd.isnull(row['In-Reply-To']))

    df['Starter'] = [_starts_conversation(row) for _, row in df.iterrows()]

    rows = [row for _, row in df.iterrows()]
    starter_rows = [row for row in rows if row['Starter']]
    starter_ids = set(row['Message-ID'] for row in starter_rows)

    convos = []
    for starter in starter_rows:
        # Read the starter's own details up front. Picking them up while
        # scanning meant rows ahead of the starter were compared against the
        # previous starter's subject/sender (or nothing at all).
        x = starter['Message-ID']
        this_subj = starter['Subject']
        sender = starter['From email']

        # Exactly one list per starter, so conversations stay aligned with
        # starters however many direct replies exist and whatever order the
        # replies arrive in.
        convo = [x]
        for row in rows:
            msg_id = row['Message-ID']
            if msg_id in convo:
                continue

            is_direct_reply = row['In-Reply-To'] == x

            subject = row['Subject']
            lowered = subject.lower()
            # NOTE(review): 're'/'fw' are loose substring tests (no colon),
            # unlike the starter check above; kept as-is to preserve the
            # existing grouping results.
            looks_like_follow_up = (
                this_subj in subject
                and ('re' in lowered or 'fw' in lowered)
                and (sender in row['To email'] or sender in row['From email'])
                and msg_id not in starter_ids
                and int(msg_id) > int(x)
            )

            if is_direct_reply or looks_like_follow_up:
                convo.append(msg_id)
        convos.append(convo)

    # Parenthesised form prints identically under Python 2 and is valid
    # Python 3.
    print(convos)
    return convos
 
# Build the e-mail DataFrame from the raw message files.
df = generate_emails_df()

# Add the 'Spam' and 'Starter' columns to df (in place) and group Message-IDs
# into conversations; conversationalize also prints the resulting list.
convos = conversationalize(df)

# Inspect the first 17 rows (presumably every sample e-mail listed in the
# comments above -- confirm against the raw_emails directory).
df.head(17)

# df.to_csv('../../emails_dataframe.csv', index=False)


[['1', '2', '3', '6', '7', '8', '11'], ['4'], ['5'], ['9', '10', '11'], ['12', '13', '14'], ['15'], ['16'], ['17']]


Unnamed: 0,Body,Date,From,From email,In-Reply-To,Message-ID,Subject,To,To email,Spam,Starter
0,"Hi both, can you let me know ideal times for m...",2017-11-02 18:58:15,Carmella Draeger <carmella@example.com>,carmella@example.com,,1,Meeting,"Celia Prince <celia@example.com>, Alisson Silv...","[celia@example.com, alisson@example.com]",False,True
1,"Hey Carmella,\n\nTuesday would be best for me....",2017-11-02 20:38:12,Celia Prince <celia@example.com>,celia@example.com,1.0,2,RE: Meeting,Carmella Draeger <carmella@example.com>,[carmella@example.com],False,False
2,"Wide open, literally any day works!\n",2017-11-02 20:42:55,Alisson Silva <alisson@example.com>,alisson@example.com,,3,Re: Meeting,Carmella Draeger <carmella@example.com>,[carmella@example.com],False,False
3,We should chat about the meeting you're planni...,2017-11-03 07:12:34,Andy Chavez <andy@example.com>,andy@example.com,,4,Meeting,Carmella Draeger <carmella@example.com>,[carmella@example.com],False,True
4,Really excited to see where you three get to o...,2017-11-03 12:13:01,Iyana Novak <iyana@example.com>,iyana@example.com,2.0,6,RE: Meeting,"Carmella Draeger <carmella@example.com>, Aliss...","[carmella@example.com, alisson@example.com, ce...",False,False
5,[-Iyana]\n\nOk we're all free today so let's g...,2017-11-03 14:28:11,Carmella Draeger <carmella@example.com>,carmella@example.com,6.0,7,RE: Meeting,"Alisson Silva <alisson@example.com>, Celia Pri...","[alisson@example.com, celia@example.com]",False,False
6,Great meeting!\n\nSummary:\n- Alisson will cir...,2017-11-03 20:28:17,Carmella Draeger <carmella@example.com>,carmella@example.com,7.0,8,RE: Meeting,"Alisson Silva <alisson@example.com>, Celia Pri...","[alisson@example.com, celia@example.com]",False,False
7,Check out this deal - available today only!\n\...,2017-11-04 11:22:00,Red Hot Deals!!! <promotions@example.com>,promotions@example.com,,5,Fidget spinners REDUCED TO CLEAR,Carmella Draeger <carmella@example.com>,[carmella@example.com],True,True
8,Did it happen yet?\n,2017-11-05 07:44:59,Iyana Novak <iyana@example.com>,iyana@example.com,,9,Meeting,Carmella Draeger <carmella@example.com>,[carmella@example.com],False,True
9,"Sorry! Yes. I'd removed you from the thread, w...",2017-11-05 07:49:12,Carmella Draeger <carmella@example.com>,carmella@example.com,9.0,10,Meeting,Iyana Novak <iyana@example.com>,[iyana@example.com],False,False
