In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

import spacy

In [2]:
mails = pd.read_csv('CEAS_08.csv')
print(mails.head(10))

sender  \
0                   Young Esposito <Young@iworld.de>   
1                       Mok <ipline's1983@icable.ph>   
2  Daily Top 10 <Karmandeep-opengevl@universalnet...   
3                 Michael Parker <ivqrnai@pobox.com>   
4  Gretchen Suggs <externalsep1@loanofficertool.com>   
5  Caroline Aragon <dwthaidomainnamesm@thaidomain...   
6    Replica Watches <jhorton@thebakercompanies.com>   
7             Daily Top 10 <acidirev_1972@tcwpg.com>   
8                  qydlqcws-iacfym@issues.apache.org   
9      Daily Top 10 <orn|dent_1973@musicaedischi.it>   

                                         receiver  \
0                     user4@gvc.ceas-challenge.cc   
1                   user2.2@gvc.ceas-challenge.cc   
2                   user2.9@gvc.ceas-challenge.cc   
3  SpamAssassin Dev <xrh@spamassassin.apache.org>   
4                   user2.2@gvc.ceas-challenge.cc   
5                user7-ext5@gvc.ceas-challenge.cc   
6                  user2.10@gvc.ceas-challenge.cc   
7    

In [3]:
duplicates = mails.duplicated(subset=['subject', 'sender'], keep=False)

In [4]:
print(mails.isnull().sum().sort_values())
mails.dropna(subset=['subject'], inplace=True)

sender        0
date          0
body          0
label         0
urls          0
subject      28
receiver    462
dtype: int64


In [5]:
print(mails['sender'].values)

['Young Esposito <Young@iworld.de>' "Mok <ipline's1983@icable.ph>"
 'Daily Top 10 <Karmandeep-opengevl@universalnet.psi.br>' ...
 'Abhijit Vyas <xpojhbz@gmail.com>'
 'Joseph Brennan <vupzesm@columbia.edu>'
 'Christian Heimes <wluhe@cheimes.de>']


In [6]:
def strip_for_mail(sender):
    start_symbol = '<'
    end_symbol = '>'
    if start_symbol in sender:
        start_index = sender.index(start_symbol) + 1
        end_index = sender.index(end_symbol)
        output = sender[start_index:end_index]
        return output
    else:
        return sender

print(strip_for_mail('Young Esposito <Young@iworld.de>'))

mails['sender_mail'] = mails['sender'].apply(strip_for_mail)
print(mails['sender_mail'].head(100))




Young@iworld.de
0                             Young@iworld.de
1                      ipline's1983@icable.ph
2     Karmandeep-opengevl@universalnet.psi.br
3                           ivqrnai@pobox.com
4            externalsep1@loanofficertool.com
                       ...                   
95                           liivp@python.org
96                       ppcwedbyff@gmail.com
97                        xycn-vtnhz@perl.org
98                   qgzon.djsmosok@gmail.com
99      vqznddhdkax_04446187@yourvoice.net.nz
Name: sender_mail, Length: 100, dtype: object


In [9]:
mails = mails[['sender_mail', 'subject', 'body', 'urls', 'label']]
mails.columns = ['sender_mail', 'subject', 'body', 'contain_urls', 'is_phishing']

In [16]:
mails_small = mails.head(100)
print(mails_small)

sender_mail  \
0                           Young@iworld.de   
1                    ipline's1983@icable.ph   
2   Karmandeep-opengevl@universalnet.psi.br   
3                         ivqrnai@pobox.com   
4          externalsep1@loanofficertool.com   
..                                      ...   
95                         liivp@python.org   
96                     ppcwedbyff@gmail.com   
97                      xycn-vtnhz@perl.org   
98                 qgzon.djsmosok@gmail.com   
99    vqznddhdkax_04446187@yourvoice.net.nz   

                                              subject  \
0                           Never agree to be a loser   
1                              Befriend Jenna Jameson   
2                                CNN.com Daily Top 10   
3   Re: svn commit: r619753 - in /spamassassin/tru...   
4                          SpecialPricesPharmMoreinfo   
..                                                ...   
95  Re: [Python-Dev] Python-Dev Summary Draft (Apr...   
96  Re: [Py

In [10]:
print(mails.head())

sender_mail  \
0                          Young@iworld.de   
1                   ipline's1983@icable.ph   
2  Karmandeep-opengevl@universalnet.psi.br   
3                        ivqrnai@pobox.com   
4         externalsep1@loanofficertool.com   

                                             subject  \
0                          Never agree to be a loser   
1                             Befriend Jenna Jameson   
2                               CNN.com Daily Top 10   
3  Re: svn commit: r619753 - in /spamassassin/tru...   
4                         SpecialPricesPharmMoreinfo   

                                                body  contain_urls  \
0  Buck up, your troubles caused by small dimensi...             1   
1  \nUpgrade your sex and pleasures with these te...             1   
2  >+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...             1   
3  Would anyone object to removing .so from this ...             1   
4  \nWelcomeFastShippingCustomerSupport\nhttp://7...             1 

In [11]:
text_example = mails.iloc[2]['body']
print(text_example)



In [12]:
nlp = spacy.load('en_core_web_lg')

stopwords = spacy.lang.en.stop_words.STOP_WORDS

doc = nlp(text_example)

def not_stopword_and_is_alpha(s):
    return s.isalpha() and s not in stopwords

lemmas = [token.lemma_ for token in doc
                        if not_stopword_and_is_alpha(token.lemma_)]

entities = [(ent.text, ent.label_) for ent in doc.ents
                                    if not_stopword_and_is_alpha(ent.text)]
pos = [(token.text, token.pos_) for token in doc
                                    if not_stopword_and_is_alpha(token.text)]

In [13]:
print("LEMMAS")
for lemma in lemmas:
    print(lemma)

LEMMAS
DAILY
TOP
video
story
Aug
PM
EDT
VIDEOS
MONTAUK
monster
devil
dog
turtle
Montauk
Monster
CNN
Jeanne
Moos
ask
thing
racy
photo
TODDLER
mom
news
ABSURD
EPISODE
POLICE
beat
DISPUTE
MOM
PLEADS
GIRL
RETURN
defendant
fake
HEART
ATTACK
KILLER
carry
VICTIM
head
murder
confession
RECANTED
ANTHRAX
SUSPECT
home
heckler
interrupt
obama
TALK
STORIES
suspect
BEHEADING
IDENTIFIED
canadian
police
Vince
Weiguang
Li
Edmonton
charge
second
degree
murder
beheading
man
bus
judge
TAKEN
OFF
jena
case
PEOPLE
MAG
GETS
PITT
JOLIE
PIX
attack
TORONTO
RACIAL
ireporter
unusual
names
MOTHER
PLEADS
child
RETURN
KARADZIC
I
deal
SUSPECT
ARRESTED
SWIM
KILLINGS
ANTHRAX
SUSPECT
APPARENT
SUICIDE
MCCAIN
OBAMA
CRITICISM
FAIR
CNN
trusted
News
Cable
News
Network
LP
LLLP
CNN
Center
Atlanta
Georgia
Cable
News
Network
LP
LLLP
Time
Warner
Company
Rights
reserve
send
comment
suggestion
read
privacy
guideline
agree
receive
email
result
preference
setting
manage
setting
unsubscribe
Daily
Top


In [14]:
print("ENTITIES")
for name, type_ in entities:
    print(name + ": " + type_)

ENTITIES
CNN: ORG
OBAMA: PERSON
Canadian: NORP
Edmonton: GPE
second: ORDINAL
MAG: ORG
PITT: ORG
SWIM: GPE
MCCAIN: ORG
OBAMA: PERSON
CNN: ORG
Atlanta: GPE
Georgia: GPE


In [15]:
print("POS")
for word, pos_ in pos:
    print(word + ": " + pos_)

POS
THE: DET
DAILY: PROPN
TOP: PROPN
Top: ADJ
videos: NOUN
stories: NOUN
Aug: PROPN
PM: PROPN
EDT: PROPN
TOP: ADV
VIDEOS: PROPN
MONTAUK: PROPN
MONSTER: NOUN
Is: AUX
devil: NOUN
dog: NOUN
Is: AUX
turtle: NOUN
Is: AUX
Montauk: PROPN
Monster: PROPN
CNN: PROPN
Jeanne: PROPN
Moos: PROPN
asks: VERB
thing: NOUN
RACY: NOUN
PHOTOS: NOUN
OF: ADP
TODDLER: PROPN
MOM: NOUN
NEWS: NOUN
OF: ADP
THE: DET
ABSURD: PROPN
EPISODE: VERB
POLICE: NOUN
BEATING: VERB
DISPUTE: VERB
MOM: PROPN
PLEADS: PROPN
FOR: ADP
GIRL: PROPN
RETURN: PROPN
DEFENDANT: VERB
FAKES: VERB
HEART: PROPN
ATTACK: PROPN
KILLER: PROPN
CARRIED: VERB
VICTIM: NOUN
HEAD: NOUN
MURDER: NOUN
CONFESSION: NOUN
RECANTED: PROPN
ANTHRAX: PROPN
SUSPECT: PROPN
HOME: NOUN
HECKLERS: NOUN
INTERRUPT: VERB
OBAMA: NOUN
TALK: PROPN
TOP: ADV
STORIES: PROPN
SUSPECT: ADJ
IN: ADP
BEHEADING: PROPN
IDENTIFIED: PROPN
Canadian: ADJ
police: NOUN
Vince: PROPN
Weiguang: PROPN
Li: PROPN
Edmonton: PROPN
charged: VERB
second: ADJ
degree: NOUN
murder: NOUN
beheading: NOUN
m