In [1]:
import pandas as pd
import re
from tqdm import tqdm_notebook

# Base directory for the input CSV and the exported TSV files.
# Paths are built by plain string concatenation, so the trailing slash is significant.
prefix = '../data/'

In [2]:
# Load the raw dataset; the first CSV column is used as the row index.
# `prefix` already ends with a path separator, so the file name is appended without a
# leading slash (avoids the doubled '//' and matches how the output paths are built).
df_all = pd.read_csv(prefix + "spam_dataset_with_subject.csv", index_col=0)


In [3]:
# Remove exact duplicate rows, then drop rows that have neither image text nor HTML text
# (how='all' drops a row only when both listed columns are NaN).
df_all = (
    df_all
    .drop_duplicates()
    .dropna(subset=['text_in_img', 'text_in_html'], how='all')
)

In [4]:
# (rows, columns) left after de-duplication and dropping rows with no image text and no HTML text.
df_all.shape

(11182, 10)

In [5]:
# Replace every remaining NaN (in all columns, not only the text ones) with a single space,
# so the string concatenation that later builds the combined text sees a string in every row.
df_all = df_all.fillna(" ")


In [6]:
# Preview the first rows of the cleaned frame.
df_all.head()

Unnamed: 0,index,account_id,agent_id,sender,text_in_html,text_in_img,img_text_ratio,label,id_pair,subject
0,0,1129379786086,1132572860140,Wayside Recovery Center,RSVP Today and join us for a morning of empowe...,,0.0,0,11293797860861132572860140,Only 3 weeks away! Empower ME Breakfast
1,1,1119086518675,1132430885433,Hemophilia Foundation of Greater Florida,Walk As Your Favorite Superhero CAN'T MAKE IT ...,,0.0,0,11190865186751132430885433,Tampa Sunday Funday
2,2,1129108515095,1132943659666,Post and Courier,"Good morning, here are your Clemson sports hea...",,0.0,0,11291085150951132943659666,Clemson basketball team wins gold medal for US...
3,3,1129108515095,1132938328084,Post and Courier Advertising Partners,Carmella's advanced in 5 Charleston's Choice c...,,0.0,0,11291085150951132938328084,Indulge your sweet tooth. Vote for Carmella's
4,4,1103447982417,1132525777889,Bravo Troy Ohio,Easter Brunch Santa Claus doesn't hold a candl...,,0.0,0,11034479824171132525777889,"The GOOD NEWS of Troy, Ohio - 4/21/2019"


In [7]:
# Inspect the subject lines; they become the leading part of the combined text column.
df_all['subject'] 

0                  Only 3 weeks away! Empower ME Breakfast
1                                      Tampa Sunday Funday
2        Clemson basketball team wins gold medal for US...
3            Indulge your sweet tooth. Vote for Carmella's
4                  The GOOD NEWS of Troy, Ohio - 4/21/2019
5                  MORE GOOD NEWS of Troy, Ohio - 7/7/2019
6           AUCTION-Register Now For Litz Crane & Rigging!
7        How to Lower Your Risk of Having More Breakout...
8        Psyche Institute, Rainbow Reiki Training, Pet ...
9                               July - Employee Newsletter
12                                                Reminder
13       The latest news for you about Field Trip & Kon...
14             One big boy moves out, a big girl moves in!
15            2019 Mid-Year Market Outlook Conference Call
16            Alcatel-Lucent 8058s IP Phone Price Decrease
17                Email Exclusive: Christmas in July Sale!
18       Is dirt bike insurance the same as motorcycle .

In [8]:

# One document per email: subject, HTML body text and image text joined by single spaces.
row_count = len(df_all)
combined_text = (
    df_all['subject']
    + ' ' + df_all['text_in_html']
    + ' ' + df_all['text_in_img']
)

# Column layout is id / label / constant 'a' / text.
# NOTE(review): presumably the layout expected by the downstream BERT data loader - confirm.
df_bert = pd.DataFrame({
    'id': range(row_count),
    'label': df_all['label'],
    'alpha': ['a'] * row_count,
    'text': combined_text,
})

In [9]:
def string_preprocess(r,c):
    """Return the string stored at ``r[c]`` with its whitespace normalized.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space, and leading/trailing whitespace is removed.

    Args:
        r: Record indexable by ``c`` (e.g. a DataFrame row or a dict).
        c: Key / column name of the string field to clean.

    Returns:
        The cleaned string; an empty string if the field contains only whitespace.

    Raises:
        TypeError: If ``r[c]`` is not a string (propagated from ``re.sub``).
    """
    # Raw string literal: '\s' in a plain literal is an invalid escape sequence
    # and triggers a Deprecation/SyntaxWarning on current Python versions.
    return re.sub(r'\s+', ' ', r[c]).strip()

# Normalize whitespace for the whole column with the vectorized string accessor
# (collapse whitespace runs to one space, then trim) instead of a per-row Python callback.
df_bert['text'] = df_bert['text'].str.replace(r'\s+', ' ', regex=True).str.strip()


In [10]:
# Per-column NaN counts of df_bert after building and cleaning the text column.
df_bert.isna().sum()

id       0
label    0
alpha    0
text     0
dtype: int64

In [11]:
# (rows, columns) of df_bert; it is built with one row per df_all row and four columns.
df_bert.shape

(11182, 4)

In [12]:
# Widen the console representation and keep wide frames on one line instead of wrapping.
pd.options.display.width = 1000
pd.options.display.expand_frame_repr = False


In [13]:
# Inspect the combined text column after whitespace normalization.
df_bert['text']


0        Only 3 weeks away! Empower ME Breakfast RSVP T...
1        Tampa Sunday Funday Walk As Your Favorite Supe...
2        Clemson basketball team wins gold medal for US...
3        Indulge your sweet tooth. Vote for Carmella's ...
4        The GOOD NEWS of Troy, Ohio - 4/21/2019 Easter...
5        MORE GOOD NEWS of Troy, Ohio - 7/7/2019 Corky ...
6        AUCTION-Register Now For Litz Crane & Rigging!...
7        How to Lower Your Risk of Having More Breakout...
8        Psyche Institute, Rainbow Reiki Training, Pet ...
9        July - Employee Newsletter Employee of the Mon...
12       Reminder Camp Hours Reminder : Summer Camp Hou...
13       The latest news for you about Field Trip & Kon...
14       One big boy moves out, a big girl moves in! Mo...
15       2019 Mid-Year Market Outlook Conference Call M...
16       Alcatel-Lucent 8058s IP Phone Price Decrease 8...
17       Email Exclusive: Christmas in July Sale! Super...
18       Is dirt bike insurance the same as motorcycle .

In [14]:
# Features: everything except the label, in id / alpha / text order.
X = df_bert.loc[:, ['id', 'alpha', 'text']]
# Target: the label column.
y = df_bert.loc[:, 'label']

In [15]:
# Preview the feature frame (id, alpha, text).
X.head()

Unnamed: 0,id,alpha,text
0,0,a,Only 3 weeks away! Empower ME Breakfast RSVP T...
1,1,a,Tampa Sunday Funday Walk As Your Favorite Supe...
2,2,a,Clemson basketball team wins gold medal for US...
3,3,a,Indulge your sweet tooth. Vote for Carmella's ...
4,4,a,"The GOOD NEWS of Troy, Ohio - 4/21/2019 Easter..."


In [16]:
# Preview the label series.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and therefore the exported files, are identical on every re-run.
SPLIT_SEED = 42


def _bert_frame(features, labels):
    """Assemble an id / label / alpha / text frame from one split.

    Args:
        features: DataFrame with 'id', 'alpha' and 'text' columns.
        labels: Series of labels aligned with ``features`` by index.

    Returns:
        DataFrame with columns in the order id, label, alpha, text.
    """
    return pd.DataFrame({
        'id': features['id'],
        'label': labels,
        'alpha': features['alpha'],
        'text': features['text'],
    })


# 80/20 split, stratified on the label so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)

train_df = _bert_frame(X_train, y_train)
test_df = _bert_frame(X_test, y_test)

In [18]:
# (rows, columns) of the held-out split (test_size=0.2 of the rows).
test_df.shape

(2237, 4)

In [19]:
# (rows, columns) of the training split (the remaining rows).
train_df.shape

(8945, 4)

In [20]:
# Write both splits under `prefix` as tab-separated files with no header row and no index column.
for split_frame, file_name in ((train_df, 'train.tsv'), (test_df, 'dev.tsv')):
    split_frame.to_csv(prefix + file_name, sep='\t', index=False, header=False)