In [1]:
import datetime


#load file 
from email import message_from_file
import os
import email
# Path to directory where attachments will be stored:
path = "./msgfiles"

# To have attachments extracted into memory, change behaviour of 2 following functions:

def file_exists (f):
    """Checks whether extracted file was extracted before."""
    return os.path.exists(os.path.join(path, f))

def save_file (fn, cont):
    """Saves cont to a file fn"""
    file = open(os.path.join(path, fn), "wb")
    file.write(cont)
    file.close()

def construct_name (id, fn):
    """Constructs a file name out of messages ID and packed file name"""
    id = id.split(".")
    id = id[0]+id[1]
    return id+"."+fn

def disqo (s):
    """Removes double or single quotations."""
    s = s.strip()
    if s.startswith("'") and s.endswith("'"): return s[1:-1]
    if s.startswith('"') and s.endswith('"'): return s[1:-1]
    return s

def disgra (s):
    """Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
    s = s.strip()
    if s.startswith("<") and s.endswith(">"): return s[1:-1]
    return s

def pullout (m, key):
    """Extracts content from an e-mail message.
    This works for multipart and nested multipart messages too.
    m   -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string)
    Returns tuple(Text, Html, Files, Parts)
    Text  -- All text from all parts.
    Html  -- All HTMLs from all parts
    Files -- Dictionary mapping extracted file to message ID it belongs to.
    Parts -- Number of parts in original message.
    """
    Html = ""
    Text = ""
    Files = {}
    Parts = 0
    if not m.is_multipart():
        if m.get_filename(): # It's an attachment
            fn = m.get_filename()
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, None)
            if file_exists(cfn): return Text, Html, Files, 1
            return Text, Html, Files, 1
        # Not an attachment!
        # See where this belongs. Text, Html or some other data:
        cp = m.get_content_type()
        print(Html)
        if cp=="text/plain": Text += str(m.get_payload(decode=True))
        elif cp=="text/html": Html += str(m.get_payload(decode=True))
        else:
            # Something else!
            # Extract a message ID and a file name if there is one:
            # This is some packed file and name is contained in content-type header
            # instead of content-disposition header explicitly
            cp = m.get("content-type")
            try: id = disgra(m.get("content-id"))
            except: id = None
            # Find file name:
            o = cp.find("name=")
            if o==-1: return Text, Html, Files, 1
            ox = cp.find(";", o)
            if ox==-1: ox = None
            o += 5; fn = cp[o:ox]
            fn = disqo(fn)
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, id)
            if file_exists(cfn): return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
        return Text, Html, Files, 1
    # This IS a multipart message.
    # So, we iterate over it and call pullout() recursively for each part.
    y = 0
    while 1:
        # If we cannot get the payload, it means we hit the end:
        try:
            pl = m.get_payload(y)
        except: break
        # pl is a new Message object which goes back to pullout
        t, h, f, p = pullout(pl, key)
        Text += t; Html += h; Files.update(f); Parts += p
        y += 1
    return Text, Html, Files, Parts

def extract (msgfile, key):
    """Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
    msgfile -- A file-like readable object
    key     -- Some ID string for that particular Message. Can be a file name or anything.
    Returns dict()
    Keys: from, to, subject, date, text, html, parts[, files]
    Key files will be present only when message contained binary files.
    For more see __doc__ for pullout() and caption() functions.
    """
    m = message_from_file(msgfile)
    From, To, Subject, Date = caption(m)
    Text, Html, Files, Parts = pullout(m, key)
    Text = Text.strip(); Html = Html.strip()
    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
        "text": Text, "html": Html, "parts": Parts}
    return msg

def caption (origin):
    """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
    origin -- Message() object
    Returns tuple(From, To, Subject, Date)
    If message doesn't contain one/more of them, the empty strings will be returned.
    """
    Date = ""
    if "Date" in origin.keys(): Date = origin["date"].strip()
    From = ""
    if "From" in origin.keys(): From = origin["from"].strip()
    To = ""
    if "To" in origin.keys(): To = origin["to"].strip()
    Subject = ""
    if "Subject" in origin.keys(): Subject = origin["subject"].strip()
    return From, To, Subject, Date
    
    
import os

path="phishingMails"
dir_list = os.listdir(path)
dir_list

['1.eml',
 '10.eml',
 '100.eml',
 '1000.eml',
 '1001.eml',
 '1002.eml',
 '1003.eml',
 '1004.eml',
 '1005.eml',
 '1006.eml',
 '1007.eml',
 '1008.eml',
 '1009.eml',
 '101.eml',
 '1010.eml',
 '1011.eml',
 '1012.eml',
 '1013.eml',
 '1014.eml',
 '1015.eml',
 '1016.eml',
 '1017.eml',
 '1018.eml',
 '1019.eml',
 '102.eml',
 '1020.eml',
 '1021.eml',
 '1022.eml',
 '1023.eml',
 '1024.eml',
 '1025.eml',
 '1026.eml',
 '1027.eml',
 '1028.eml',
 '1029.eml',
 '103.eml',
 '1030.eml',
 '1031.eml',
 '1032.eml',
 '1033.eml',
 '1034.eml',
 '1035.eml',
 '1036.eml',
 '1037.eml',
 '1038.eml',
 '1039.eml',
 '104.eml',
 '1040.eml',
 '1041.eml',
 '1042.eml',
 '1043.eml',
 '1044.eml',
 '1045.eml',
 '1046.eml',
 '1047.eml',
 '1048.eml',
 '1049.eml',
 '105.eml',
 '1050.eml',
 '1051.eml',
 '1052.eml',
 '1053.eml',
 '1054.eml',
 '1055.eml',
 '1056.eml',
 '1057.eml',
 '1058.eml',
 '1059.eml',
 '106.eml',
 '1060.eml',
 '1061.eml',
 '1062.eml',
 '1063.eml',
 '1064.eml',
 '1065.eml',
 '1066.eml',
 '1067.eml',
 '1068.eml'

In [2]:
import bs4
import xml.etree.ElementTree as ET
import re

# as per recommendation from @freylis, compile once only
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
pattern = re.compile(r'"https?://[^ ]*"')


def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext
def remove_tags(text):
    
    return ''.join(ET.fromstring(text).itertext())

def listToString(s):
    # initialize an empty string
    str1 = ""
    # traverse in the string
    for ele in s:
        str1 += ele
        if s.index(ele)<len(s)-1:
            str1 += ','
        
    # return string
    return str1

url_list=[]
body_list=[]
from_list=[]
to_list=[]
subject_list=[]
date_list=[]
other_url_list=[]
for text in dir_list:
    f = open(r"phishingMails/"+text, "r",errors="ignore")
    print(f.name,"da")

    c=extract(f,f.name)
    #get_body
    html = c["html"]
    soup = bs4.BeautifulSoup(html)
    aTags = soup.find_all("body")
    if str(aTags)!= "[]" :
        aTags=str(aTags[0])
        body=cleanhtml(aTags)
        body=body.replace("\\n","")
        body=body.replace("\\t","")
        body=body.replace("\xa0","")
    #get_url
    html = c["html"]
    soup = bs4.BeautifulSoup(html)
    aTags = soup.find_all("a")
    aTags
    urls = [tag['href'] for tag in aTags if 'href' in tag.attrs ]
    k=[]
    for i in aTags:
        k.append(cleanhtml(str(i)))
    
    other_url=[]    
    other_url=re.findall(pattern, html)
    
    
    date_list.append(c["date"])
    from_list.append(c["from"])
    to_list.append(c["to"])
    subject_list.append(c["subject"])
    url=listToString(k)

    url_list.append(url)
    body_list.append(body)
    other_url_list.append(listToString(other_url)) 

    
    f.close()
    
    

phishingMails/1.eml da

phishingMails/10.eml da

phishingMails/100.eml da

phishingMails/1000.eml da

phishingMails/1001.eml da

phishingMails/1002.eml da

phishingMails/1003.eml da

phishingMails/1004.eml da

phishingMails/1005.eml da

phishingMails/1006.eml da

phishingMails/1007.eml da

phishingMails/1008.eml da

phishingMails/1009.eml da

phishingMails/101.eml da

phishingMails/1010.eml da

phishingMails/1011.eml da

phishingMails/1012.eml da

phishingMails/1013.eml da

phishingMails/1014.eml da

phishingMails/1015.eml da

phishingMails/1016.eml da

phishingMails/1017.eml da

phishingMails/1018.eml da

phishingMails/1019.eml da

phishingMails/102.eml da

phishingMails/1020.eml da

phishingMails/1021.eml da

phishingMails/1022.eml da

phishingMails/1023.eml da

phishingMails/1024.eml da

phishingMails/1025.eml da

phishingMails/1026.eml da

phishingMails/1027.eml da

phishingMails/1028.eml da

phishingMails/1029.eml da

phishingMails/103.eml da

phishingMails/1030.eml da

phishingMa

phishingMails/1283.eml da

phishingMails/1284.eml da

phishingMails/1285.eml da

phishingMails/1286.eml da

phishingMails/1287.eml da

phishingMails/1288.eml da

phishingMails/1289.eml da

phishingMails/1290.eml da

phishingMails/1291.eml da

phishingMails/1292.eml da

phishingMails/1293.eml da

phishingMails/1294.eml da

phishingMails/1295.eml da

phishingMails/1296.eml da

phishingMails/1297.eml da

phishingMails/1298.eml da

phishingMails/1299.eml da

phishingMails/13.eml da

phishingMails/1300.eml da


phishingMails/1301.eml da

phishingMails/1302.eml da

phishingMails/1303.eml da

phishingMails/1304.eml da

phishingMails/1305.eml da

phishingMails/1306.eml da

phishingMails/1307.eml da

phishingMails/1308.eml da

phishingMails/1309.eml da

phishingMails/131.eml da

phishingMails/1310.eml da

phishingMails/1311.eml da

phishingMails/1312.eml da

phishingMails/1313.eml da

phishingMails/1314.eml da

phishingMails/1315.eml da

phishingMails/1316.eml da

phishingMails/1317.eml da

phi



phishingMails/1446.eml da

phishingMails/1447.eml da

phishingMails/1448.eml da

phishingMails/1449.eml da

phishingMails/145.eml da

phishingMails/1450.eml da

phishingMails/1451.eml da

phishingMails/1452.eml da

phishingMails/1453.eml da

phishingMails/1454.eml da

phishingMails/1456.eml da

phishingMails/1457.eml da

phishingMails/1458.eml da

phishingMails/1459.eml da

phishingMails/146.eml da

phishingMails/1460.eml da

phishingMails/1461.eml da

phishingMails/1462.eml da

phishingMails/1463.eml da

phishingMails/1464.eml da

phishingMails/1465.eml da

phishingMails/1466.eml da

phishingMails/1467.eml da


phishingMails/1468.eml da

phishingMails/1469.eml da

phishingMails/147.eml da

phishingMails/1470.eml da

phishingMails/1471.eml da

phishingMails/1472.eml da

phishingMails/1473.eml da

phishingMails/1474.eml da

phishingMails/1475.eml da

phishingMails/1476.eml da

phishingMails/1477.eml da

phishingMails/1478.eml da

phishingMails/1479.eml da

phishingMails/1480.eml da

phi

phishingMails/1728.eml da

phishingMails/1729.eml da

phishingMails/173.eml da

phishingMails/1730.eml da

phishingMails/1731.eml da

phishingMails/1732.eml da

phishingMails/1733.eml da

phishingMails/1734.eml da

phishingMails/1735.eml da

phishingMails/1736.eml da

phishingMails/1737.eml da

phishingMails/1738.eml da

phishingMails/1739.eml da

phishingMails/174.eml da

phishingMails/1740.eml da

phishingMails/1741.eml da

phishingMails/1742.eml da

phishingMails/1743.eml da

phishingMails/1744.eml da

phishingMails/1745.eml da

phishingMails/1746.eml da

phishingMails/1747.eml da

phishingMails/1748.eml da

phishingMails/1749.eml da

phishingMails/175.eml da

phishingMails/1750.eml da

phishingMails/1751.eml da

phishingMails/1752.eml da

phishingMails/1753.eml da

phishingMails/1754.eml da

phishingMails/1755.eml da

phishingMails/1756.eml da

phishingMails/1757.eml da

phishingMails/1758.eml da

phishingMails/1759.eml da

phishingMails/176.eml da

phishingMails/1760.eml da

phish

phishingMails/2007.eml da

phishingMails/2008.eml da

phishingMails/2009.eml da

phishingMails/201.eml da

phishingMails/2010.eml da

phishingMails/2011.eml da

phishingMails/2012.eml da

phishingMails/2013.eml da

phishingMails/2014.eml da

phishingMails/2015.eml da

phishingMails/2016.eml da

phishingMails/2017.eml da

phishingMails/2018.eml da

phishingMails/2019.eml da


phishingMails/202.eml da

phishingMails/2020.eml da

phishingMails/2021.eml da

phishingMails/2022.eml da

phishingMails/2023.eml da

phishingMails/2024.eml da

phishingMails/2025.eml da

phishingMails/2026.eml da

phishingMails/2027.eml da

phishingMails/2028.eml da

phishingMails/2029.eml da

phishingMails/203.eml da

phishingMails/2030.eml da

phishingMails/2031.eml da

phishingMails/2032.eml da

phishingMails/2033.eml da

phishingMails/2034.eml da

phishingMails/2035.eml da

phishingMails/2036.eml da

phishingMails/2037.eml da

phishingMails/2038.eml da

phishingMails/2039.eml da

phishingMails/204.eml da

phis


phishingMails/243.eml da

phishingMails/244.eml da

phishingMails/245.eml da

phishingMails/246.eml da

phishingMails/247.eml da

phishingMails/248.eml da

phishingMails/249.eml da

phishingMails/25.eml da

phishingMails/250.eml da

phishingMails/251.eml da

phishingMails/252.eml da

phishingMails/253.eml da

phishingMails/254.eml da

phishingMails/257.eml da

phishingMails/26.eml da

phishingMails/260.eml da

phishingMails/261.eml da

phishingMails/263.eml da

phishingMails/264.eml da

phishingMails/265.eml da

phishingMails/266.eml da

phishingMails/267.eml da

phishingMails/268.eml da

phishingMails/269.eml da

phishingMails/270.eml da

phishingMails/272.eml da

phishingMails/273.eml da

phishingMails/274.eml da

phishingMails/275.eml da

phishingMails/276.eml da

phishingMails/277.eml da

phishingMails/278.eml da

phishingMails/279.eml da

phishingMails/281.eml da

phishingMails/282.eml da

phishingMails/283.eml da

phishingMails/284.eml da

phishingMails/286.eml da

phishingMails

phishingMails/540.eml da

phishingMails/541.eml da

phishingMails/542.eml da

phishingMails/543.eml da

phishingMails/544.eml da

phishingMails/545.eml da

phishingMails/546.eml da

phishingMails/547.eml da

phishingMails/548.eml da

phishingMails/549.eml da

phishingMails/55.eml da

phishingMails/550.eml da

phishingMails/551.eml da

phishingMails/552.eml da

phishingMails/553.eml da

phishingMails/554.eml da

phishingMails/555.eml da

phishingMails/556.eml da

phishingMails/557.eml da

phishingMails/558.eml da

phishingMails/559.eml da

phishingMails/56.eml da

phishingMails/560.eml da

phishingMails/561.eml da

phishingMails/562.eml da

phishingMails/563.eml da


phishingMails/564.eml da

phishingMails/565.eml da

phishingMails/566.eml da

phishingMails/567.eml da

phishingMails/568.eml da

phishingMails/569.eml da

phishingMails/57.eml da

phishingMails/570.eml da

phishingMails/571.eml da

phishingMails/572.eml da

phishingMails/573.eml da

phishingMails/574.eml da

phishingMails/


phishingMails/845.eml da

phishingMails/846.eml da

phishingMails/847.eml da


phishingMails/848.eml da

phishingMails/849.eml da

phishingMails/85.eml da


phishingMails/850.eml da

phishingMails/851.eml da

phishingMails/852.eml da

phishingMails/853.eml da

phishingMails/854.eml da

phishingMails/855.eml da

phishingMails/856.eml da

phishingMails/857.eml da

phishingMails/858.eml da

phishingMails/859.eml da

phishingMails/86.eml da


phishingMails/860.eml da

phishingMails/861.eml da

phishingMails/862.eml da

phishingMails/863.eml da

phishingMails/864.eml da

phishingMails/865.eml da

phishingMails/866.eml da

phishingMails/867.eml da

phishingMails/868.eml da

phishingMails/869.eml da

phishingMails/87.eml da

phishingMails/870.eml da

phishingMails/871.eml da

phishingMails/872.eml da

phishingMails/873.eml da

phishingMails/874.eml da


phishingMails/875.eml da

phishingMails/876.eml da

phishingMails/877.eml da

phishingMails/878.eml da

phishingMails/879.eml da

phishingMa

In [3]:
import pandas as pd
dic={"date":date_list,"from":from_list,"to":to_list,"subject":subject_list,"body":body_list,"url":url_list,"other_url":other_url_list}
dic=pd.DataFrame(dic)
dic

Unnamed: 0,date,from,to,subject,body,url,other_url
0,"Thu, 30 Oct 2003 08:52:16 +0200","""emailconfirm@ebay.com"" <emailconfirm@ebay.com>",,eBay account verification!,"b'Dear eBay User ,After fraud complaints from ...",http://signin.ebay.com/aw-cgi/eBayISAPI.dll?Si...,"""http://hform.com/form.cgi?10081481"""
1,"Fri, 1 Jul 2005 17:57:54 -0700","""Ebay Team"" <aw45-confirm@ebay.com>",undisclosed-recipients: ;,eBay - verify your account information,b'WelcomeWelcome to a community of sellers tha...,,"""http://pics.ebaystatic.com/aw/pics/powerselle..."
2,"Wed, 17 Aug 2005 18:28:02 +0000 (GMT)",PayPal <service@paypal.com>,user@example.com,Notification of limited account access,b' Dear valuedPayPal\xaemember:PayPal\xae is c...,"\n,https://www.paypal.com/cgi-bin/webscr?cmd=_...","""http://www.paypal.com/cgi-bin/webscr?cmd=_hom..."
3,"Thu, 21 Sep 2006 04:46:36 +0200 (CEST)",service@intl.paypal.com <service@intl.paypal.com>,user@example.com,..PayPal Notification..: Update your information,"b'PayPal#message .dummy {}#message, #message T...","\n ,Click here to activate your account...","""https://www.paypal.com/us"",""http://images.pay..."
4,"Thu, 21 Sep 2006 05:52:53 +0200","""service@paypal.com"" <service@paypal.com>",undisclosed-recipients:;,We recently noticed one or more attempts to lo...,b'We recently noticed one or more attempts to ...,\n\t\t\t\thttps://www.paypal.com/cgi-bin/websc...,"""http://www.maes.idv.tw/phpnuke/modules/copper..."
...,...,...,...,...,...,...,...
2234,"Wed, 20 Sep 2006 05:54:42 -0400 (EDT)","""FIFTH THIRD BANK 2006"" <custservice-ref-45811...",user <user@example.com>,"Customer notification: data confirmation [Tue,...","b'""No! atone bootleg """"I am!She w...",\n,
2235,"Wed, 20 Sep 2006 09:09:24 -0500","""eBay"" <aw-confirm@ebay.com>",undisclosed-recipients:;,Win the best car Viper SRT-10 offered by eBay.com,b' ...,"\n ,Click Here,\n ,Your account\...","""http://pages.ebay.com/"",""http://pics.ebaystat..."
2236,"Wed, 20 Sep 2006 18:44:26 -0100","""PayPal service"" <security@intl.paypal.com>",user@example.com,IMPORTANT : Notification Of Limited Account Ac...,"b'Dear PayPal Member,During our regularly sche...",https://www.paypalcom/cgi-bin/webscr?cmd=_logi...,"""http://adsl-072-149-046-139.sip.bct.bellsouth..."
2237,"Wed, 20 Sep 2006 14:56:32 -0700","""Fifth Third Bank, 2006"" <operator-393568id@53...",user <user@example.com>,Fifth Third Bank customer notification: data c...,b'The rules for this part of the game were Ann...,\n,


In [4]:
dic['other_url'].iloc[152]


'"http://pics.ebaystatic.com/aw/pics/uk/logos/ebay_95x39.gif","http://pics.ebaystatic.com/aw/pics/globalAssets/ltCurve.gif","http://pics.ebaystatic.com/aw/pics/globalAssets/rtCurve.gif","http://pics.ebaystatic.com/aw/pics/s.gif","http://pics.ebaystatic.com/aw/pics/s.gif","http://pics.ebaystatic.com/aw/pics/s.gif","http://pics.ebaystatic.com/aw/pics/s.gif","http://pics.ebaystatic.com/aw/pics/s.gif","http://pics.ebaystatic.com/aw/pics/s.gif","http://contact-ebay-com-wsebay-isapidl.smtp.ru/SignInco_partnerId_2pUserId_siteid_3pageType_pa1_i1_bshowgif_UsingSSL_ru_pp_pa2_errmsg_runame_ruparams_ruproduct_sid_favoritenav_confirm_ebxPageType_existingEmail_isCheckout_migrateVisitor","http://pics.ebaystatic.com/aw/pics/s.gif","http://contact-ebay-com-wsebay-isapidl.smtp.ru/SignInco_partnerId_2pUserId_siteid_3pageType_pa1_i1_bshowgif_UsingSSL_ru_pp_pa2_errmsg_runame_ruparams_ruproduct_sid_favoritenav_confirm_ebxPageType_existingEmail_isCheckout_migrateVisitor","http://pics.ebaystatic.com/aw/pics/i

In [5]:
file_name="first_data.csv"
dic.to_csv(file_name, sep='\t', encoding='utf-8',index=False)

In [6]:
a=pd.read_csv("first_data.csv",sep='\t')
a.head()

Unnamed: 0,date,from,to,subject,body,url,other_url
0,"Thu, 30 Oct 2003 08:52:16 +0200","""emailconfirm@ebay.com"" <emailconfirm@ebay.com>",,eBay account verification!,"b'Dear eBay User ,After fraud complaints from ...",http://signin.ebay.com/aw-cgi/eBayISAPI.dll?Si...,"""http://hform.com/form.cgi?10081481"""
1,"Fri, 1 Jul 2005 17:57:54 -0700","""Ebay Team"" <aw45-confirm@ebay.com>",undisclosed-recipients: ;,eBay - verify your account information,b'WelcomeWelcome to a community of sellers tha...,,"""http://pics.ebaystatic.com/aw/pics/powerselle..."
2,"Wed, 17 Aug 2005 18:28:02 +0000 (GMT)",PayPal <service@paypal.com>,user@example.com,Notification of limited account access,b' Dear valuedPayPal\xaemember:PayPal\xae is c...,"\n,https://www.paypal.com/cgi-bin/webscr?cmd=_...","""http://www.paypal.com/cgi-bin/webscr?cmd=_hom..."
3,"Thu, 21 Sep 2006 04:46:36 +0200 (CEST)",service@intl.paypal.com <service@intl.paypal.com>,user@example.com,..PayPal Notification..: Update your information,"b'PayPal#message .dummy {}#message, #message T...","\n ,Click here to activate your account...","""https://www.paypal.com/us"",""http://images.pay..."
4,"Thu, 21 Sep 2006 05:52:53 +0200","""service@paypal.com"" <service@paypal.com>",undisclosed-recipients:;,We recently noticed one or more attempts to lo...,b'We recently noticed one or more attempts to ...,\n\t\t\t\thttps://www.paypal.com/cgi-bin/websc...,"""http://www.maes.idv.tw/phpnuke/modules/copper..."


In [7]:
a.shape

(2239, 7)