# Feature Preprocessing: subsample the spam dataset, clean the text fields, build bag-of-words count features, and save the processed frame

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw ham/spam e-mail table from the CSV (path is relative to the working directory).
df = pd.read_csv('spam_x.csv')

In [6]:
# Work on a 20% subsample of the whole data.
from sklearn.utils import resample

size = round(len(df) * 0.2)
# replace=False draws distinct rows: the default (replace=True) is a bootstrap
# that duplicates e-mails, which would later leak between train and test splits.
# random_state pins the draw so a rerun on a fresh kernel gives the same sample.
# NOTE: this cell rebinds `df`; running it twice shrinks the data again.
df = resample(df, n_samples=size, replace=False, random_state=42)
df

Unnamed: 0,file_name,volume-dflt,subject,body,spm-lbl-trgt
52267,data/SH/HP/prodmsg.2.429179.2005621,4858,onlin sale low dont enough visitor,multipart messag mime format nextpartcdabf...,1
3861,data/easy_ham/1335.57112a05c5a5ba633707ed3fdec...,1516,alsaredhat compat,upon time brian wrote fresh new instal ...,0
50734,data/SH/HP/prodmsg.2.437127.200572,6620,s comput product sale,multipart messag mime format nextpartrfkindys...,1
9227,data/farmer-d/tufco/551,52,revis enron hpl actual august,teco tap hpl iferc enron,0
56789,data/SH/HP/prodmsg.2.431777.2005624,3072,new health plan month,multipart messag mime format nextpartrfkindys...,1
...,...,...,...,...,...
54985,data/SH/HP/prodmsg.2.446019.2005716,4810,hershey vs ghirardelli get gift take survey,mime encod messag febbcabddc contenttyp tex...,1
1043,data/spam/01243.0676aa0a6a02e5a0373d387b89af0e07,328,meatdemon,come check meatdemon young starlet think fut...,1
8774,data/beck-s/congratulations/44,117,congratul,salli read email announc promot congratul jo...,0
49857,data/SH/HP/prodmsg.2.434873.2005628,3598,make rival envi,multipart messag mime format nextpartceadb...,1


In [8]:
# 'file_name' only served to verify that files were loaded correctly, so it is
# removed before model training.
# NOTE(review): 'volume-dflt' is removed here as well, yet the saved frame
# displayed at the end of the notebook still contains it -- confirm which is intended.
unused_columns = ['file_name', 'volume-dflt']
df = df.drop(labels=unused_columns, axis='columns')

In [9]:
# Give the target column a name that cannot collide with a vocabulary token
# (no-op when the column is already called 'spm-lbl-trgt').
df = df.rename(columns={'ham/spam': 'spm-lbl-trgt'})  # choose a unique name
# Encode the label as ham -> 0, spam -> 1. The mapping is applied to the label
# column only: a frame-wide replace would also turn a subject/body that is
# exactly "ham" or "spam" into an integer and break the later string join.
df['spm-lbl-trgt'] = df['spm-lbl-trgt'].replace({'ham': 0, 'spam': 1})

In [10]:
# Show the frame after dropping the unused columns and encoding the label.
df

Unnamed: 0,subject,body,spm-lbl-trgt
52267,onlin sale low dont enough visitor,multipart messag mime format nextpartcdabf...,1
3861,alsaredhat compat,upon time brian wrote fresh new instal ...,0
50734,s comput product sale,multipart messag mime format nextpartrfkindys...,1
9227,revis enron hpl actual august,teco tap hpl iferc enron,0
56789,new health plan month,multipart messag mime format nextpartrfkindys...,1
...,...,...,...
54985,hershey vs ghirardelli get gift take survey,mime encod messag febbcabddc contenttyp tex...,1
1043,meatdemon,come check meatdemon young starlet think fut...,1
8774,congratul,salli read email announc promot congratul jo...,0
49857,make rival envi,multipart messag mime format nextpartceadb...,1


In [12]:
# fill missing values with 'unknown'
# Done as a frame-level fillna with a per-column mapping and rebinding `df`:
# `df[col].fillna(..., inplace=True)` operates on a temporary Series, which
# emits a FutureWarning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df = df.fillna({'subject': 'unknown', 'body': 'unknown'})

In [13]:
# concat body and subject (body first, single space separator)
# Vectorized string concatenation replaces the row-by-row `apply(" ".join, axis=1)`,
# which runs a Python call per row. Both text columns are expected to hold only
# strings at this point (missing values already filled).
df["body-subject"] = df["body"] + " " + df["subject"]
df = df.drop(columns=['body', 'subject'])

In [20]:
def feature_reduction(df, protected=('volume-dflt', 'spm-lbl-trgt')):
    """Keep only the feature columns whose column sum is above the median sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the protected columns plus numeric feature columns.
    protected : iterable of str, optional
        Column names that are never dropped and are excluded from the median
        computation. Names that are not present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the protected columns and every feature column whose
        sum is strictly greater than the median of all feature-column sums.
        Column order is preserved; the input frame is not modified.
    """
    # for feature reduction
    print("Applying Feature Reduction...")

    # Identify protected columns by name rather than assuming they sit at
    # positions 0 and 1; a positional rule silently treats the first feature
    # as protected when 'volume-dflt' is absent.
    is_protected = df.columns.isin(list(protected))

    # get the column sums of the feature columns only
    feature_sums = df.loc[:, ~is_protected].sum(axis=0)

    # keep only the features that are above the median
    median = feature_sums.median()

    # Boolean mask over column positions, so duplicate column labels are
    # handled one column at a time.
    keep = is_protected.copy()
    keep[~is_protected] = (feature_sums > median).to_numpy()

    df = df.loc[:, keep]
    print("Applying Feature Reduction DONE!")
    return df

In [21]:
# apply count vectorizer on body and subject
from sklearn.feature_extraction.text import CountVectorizer

list_of_text_sub = df['body-subject'].values.tolist()

print("Applying Count Vectorizer...")
vectorizer = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one pass.
term_doc_matrix = vectorizer.fit_transform(list_of_text_sub)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on versions that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df_bow = pd.DataFrame(term_doc_matrix.toarray(), columns=feature_names)

# Drop the raw text and discard the old row labels *before* attaching the
# bag-of-words columns. Materialising the old index as an 'index' column and
# dropping it by name afterwards would also delete the vocabulary column for
# the token "index".
df_meta = df.drop(columns=['body-subject']).reset_index(drop=True)
df = pd.concat([df_meta, df_bow], axis=1)
print("Applying Count Vectorizer DONE!")

Applying Count Vectorizer...
Applying Count Vectorizer DONE!


In [22]:
# Prune the feature columns with the reduction step defined above.
df = df.pipe(feature_reduction)

Applying Feature Reduction...
Applying Feature Reduction DONE!


In [23]:
print("Saving dataframe to file...")
# mode='w' overwrites any existing file; the frame is stored under the key 'df'.
df.to_hdf(path_or_buf='spam_processed_cv.h5', mode='w', key='df')
print("Saving dataframe to file DONE!")

Saving dataframe to file...
Saving dataframe to file DONE!


In [24]:
# Round-trip check: reload the file that was just written (stored under the
# key 'df') and display it to confirm that everything was saved correctly.
dd = pd.read_hdf('spam_processed_cv.h5', key='df')
dd

Unnamed: 0,volume-dflt,spm-lbl-trgt,aa,aaa,aaaa,aaaaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaa,...,þàº,þàç,þàì,þàïà,þàïàº,þàïàô,þàïáö¼ò,þîñæ,þüg,ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿó
0,430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3247,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2201,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2707,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1185,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11779,2525,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11780,3784,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11781,17801,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11782,2503,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
