# Initialization

### Load Original Datafiles

In [5]:
# Root folder of the corpus (Windows-style path ending with a separator) and the names
# of its three sub-folders. The first entry ("spam") is the folder whose emails receive
# the flag 1 when the subjects are loaded further down.
p = "D:\\data\\Spam Assassin\\"
d = ["spam", "hard_ham", "easy_ham"]

3-element Array{ASCIIString,1}:
 "spam"    
 "hard_ham"
 "easy_ham"

In [2]:
# Gather the file names found in each of the three folders and count them all.
N = 0                    # total number of emails across the folders
files = Array(Array, 3)  # files[i] holds the directory listing of folder d[i]

for i = 1:3
    files[i] = readdir(string(p, d[i], "\\"))
    N += length(files[i])
end

In [3]:
# Build the raw dataset: one row per email, with the subject line in column 1 and the
# class flag in column 2 (1 for the first folder, i.e. spam, and 0 for the other two).
data = Array(Any, N, 2)
c = 0  # index of the row filled last

for i = 1:3
    flag = (i == 1) ? 1 : 0

    for file in files[i]
        # The do-block closes the file even when reading it throws, and eachline stops
        # pulling lines from the file as soon as the subject has been found.
        text = open(string(p, d[i], "\\", file)) do f
            subject = ""  # stays empty when no line starts with "Subject"

            for line in eachline(f)
                # note: any line beginning with "Subject" matches, not only "Subject: "
                if startswith(line, "Subject")
                    subject = line[10:(end-1)] # omit the "Subject: " part along with the last character (line break)
                    break # no need to examine the rest of the lines of the email
                end
            end

            subject
        end

        c += 1
        data[c,1] = text
        data[c,2] = flag
    end
end

In [4]:
# Sanity check: one row per email and two columns (subject, flag).
size(data)

(3301,2)

### Parity Check

In [5]:
# Indices of the rows whose subject is not stored as a plain ASCIIString.
bad_records = []

for i = 1:N
    # text contains some strange characters (probably in a different language)
    isa(data[i,1], ASCIIString) || push!(bad_records, i)
end

In [6]:
# Display the row indices that were flagged as non-ASCII.
bad_records

17-element Array{Any,1}:
   37
   38
   79
  253
  254
  298
  323
  341
  347
  418
  448
 2882
 2996
 3074
 3130
 3134
 3201

In [7]:
# Every row index from 1 to N that was not flagged as a bad record.
good_records = setdiff(1:N, bad_records)

3284-element Array{Any,1}:
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
    ⋮
 3290
 3291
 3292
 3293
 3294
 3295
 3296
 3297
 3298
 3299
 3300
 3301

In [8]:
# Keep only the rows with a plain ASCII subject (both columns).
data = data[good_records,:];

In [9]:
# N now counts the rows that survived the filtering above.
N = size(data,1)

3284

### Save New Datafile (Summary)

In [10]:
# Full path of the summary file, placed in the corpus root folder.
fn = string(p, "titles_only.csv")

"D:\\data\\Spam Assassin\\titles_only.csv"

In [11]:
# Write one "<subject>,<flag>" row per email.
# NOTE(review): subjects may themselves contain commas; presumably that is why the file
# is parsed by position rather than by delimiter when it is loaded again - confirm.
writedlm(fn, data, ",")

### Load Datafile (in case you don't want to go through the first steps)

In [6]:
# although readdlm() and readcsv() should also work, since this dataset mixes different types of elements, it is much safer to load everything manually

# Each row of the file has the form "<subject>,<flag>", the flag being a single
# character, so the row is split by position counted from its right-hand end.
fn = string(p, "titles_only.csv")
lines = open(readlines, fn, "r")  # the file is closed even if reading it fails
N = length(lines)
data = Array(Any, N, 2)

for i = 1:N
    row = chomp(lines[i])          # drop the line break, if there is one
    data[i,1] = row[1:(end-2)]     # subject only: the separating comma is left out
    data[i,2] = string(row[end])   # the flag, kept as a one-character string
end

### Auxiliary Functions

In [7]:
include("KFCV.jl")
include("kNN.jl")

apply_kNN (generic function with 1 method)

In [8]:
# Contingency table of two binary vectors of equal length. Returns the tuple
# (a, b, c, d), where
#   a = number of positions in which both x and y are false
#   b = number of positions in which x is false and y is true
#   c = number of positions in which x is true and y is false
#   d = number of positions in which both x and y are true
# Throws a DimensionMismatch when the two vectors differ in length.
function cont_table(x::BitArray{1}, y::BitArray{1})
    length(x) == length(y) || throw(DimensionMismatch("x and y must have the same length"))
    a = b = c = d = 0

    for (xi, yi) in zip(x, y)
        if xi
            yi ? (d += 1) : (c += 1)
        else
            yi ? (b += 1) : (a += 1)
        end
    end

    return a, b, c, d
end

# Symmetric Jaccard Similarity: the larger of the Jaccard similarity computed on the
# false entries, a / (a + b + c), and the one computed on the true entries,
# d / (b + c + d). A ratio whose denominator is zero counts as 0.0, so the result is
# always a number between 0 and 1 (two empty vectors yield 0.0).
function SJS(x::BitArray{1}, y::BitArray{1}) # Symmetric Jaccard Similarity
    a, b, c, d = cont_table(x,y)
    mismatches = b + c  # positions in which the two vectors disagree
    J1 = (a + mismatches == 0) ? 0.0 : a / (a + mismatches)
    J2 = (d + mismatches == 0) ? 0.0 : d / (d + mismatches)
    return max(J1, J2)
end

SJS (generic function with 1 method)

In [9]:
function sig(x::Float64, a::Float64 = 0.0, b::Float64 = 1.0, c::Float64 = 1.0)
    # sigmoid function: 1 / (1 + c*e^(a - b*x)); with the default parameters it
    # equals 0.5 at x = 0
    denominator = 1 + c*exp(a - b*x)
    return 1 / denominator
end

function word_length_index{T <: AbstractString}(x::T)
    # feature taking values between 0 and 1, with higher values corresponding to words of 10 characters or more
    # (the sigmoid is centred on a length of 10, where it equals 0.5)
    return sig(Float64(length(x)), 10.0)
end

function word_length_index{T <: AbstractString}(x::Array{T,1})
    # vector version: one index per word, in the order of the input
    return Float64[word_length_index(word) for word in x]
end

word_length_index (generic function with 2 methods)

In [10]:
function number_of_digits_index{T <: AbstractString}(X::T)
    # feature that takes a value of 0.5 or higher for cases where there are 9 or more digits in the text
    digits_found = 0

    for ch in X
        if '0' <= ch <= '9'
            digits_found += 1
        end
    end

    return sig(Float64(digits_found), 9.0)
end

number_of_digits_index (generic function with 1 method)

In [11]:
function caps_proportion{T <: AbstractString}(X::T)
    # proportion of the characters of X that are capital letters A-Z (0.0 for empty text)
    total = length(X)
    total == 0 && return 0.0
    capitals = count(ch -> 'A' <= ch <= 'Z', X)
    return capitals / total
end

caps_proportion (generic function with 1 method)

In [12]:
function spaces_proportion{T <: AbstractString}(X::T)
    # proportion of the characters of X that are blanks (0.0 for empty text)
    total = length(X)
    total == 0 && return 0.0
    blanks = count(ch -> ch == ' ', X)
    return blanks / total
end

spaces_proportion (generic function with 1 method)

In [13]:
function est_prob{T <: Real}(y::Array{Float64, 1}, Q::Array{T, 1}, th::Float64 = 0.5)
    # probabilities for continuous predictions (assuming presence of two classes only), applicable on ELMs, RRFs, etc.
    # Note: this is a heuristic and is not backed by any theory
    #
    # y  : continuous outputs of the model
    # Q  : the class values
    # th : relative position, between the smallest and the largest class value, of the
    #      point that separates the classes
    # Returns one value per element of y: 1 minus the share of the smallest scaled
    # distance in the sum of all scaled distances to the class values.

    levels = map(Float64, Q)
    lowest = minimum(levels)
    middle = (maximum(levels) - lowest)*th + lowest  # middle point
    spread = Float64[abs(middle - v) for v in levels]  # scale of each class value

    probs = Array(Float64, length(y))

    for (i, yi) in enumerate(y)
        dist = Float64[abs(yi - levels[j]) / spread[j] for j = 1:length(levels)]
        probs[i] = 1 - minimum(dist) / sum(dist)
    end

    return probs
end

est_prob (generic function with 2 methods)

In [14]:
function CM(pred::Array, gt::Array)
    # Confusion matrix of the predictions `pred` against the ground truth `gt`.
    # Rows follow the ground-truth class and columns the predicted class, both in
    # sorted class order. Throws a DimensionMismatch when the inputs differ in size.
    length(pred) == length(gt) ||
        throw(DimensionMismatch("pred and gt must have the same number of elements"))

    # unique classes (sorted); labels that only the classifier produced are included
    # too, so an unexpected prediction gets its own column instead of an invalid index
    C = sort(unique(vcat(vec(gt), vec(pred))))
    n = length(C) # number of classes
    M = zeros(Int64, n, n) # confusion matrix

    position = Dict{Any,Int}() # class label => row/column index
    for (k, label) in enumerate(C)
        position[label] = k
    end

    for i in 1:length(gt)
        M[position[gt[i]], position[pred[i]]] += 1
    end

    return M
end

CM (generic function with 1 method)

In [15]:
# F1 score of class index c, read off a square matrix of counts: twice the diagonal
# entry divided by the sum of column c plus the sum of row c.
function F1(CM::Array{Int64, 2}, c::Int64)
    hits = CM[c,c]
    column_plus_row = sum(CM[:,c]) + sum(CM[c,:])
    return 2*hits / column_plus_row
end

F1 (generic function with 1 method)

### Packages to Be Used

In [16]:
using MultivariateStats
using DecisionTree
using ELM

 in depwarn at deprecated.jl:73
 in call at deprecated.jl:50
 in include at boot.jl:261
 in include_from_node1 at loading.jl:320
 in include at boot.jl:261
 in include_from_node1 at loading.jl:320
 in require at loading.jl:259
 in include_string at loading.jl:282
 in execute_request at C:\Users\Zacharias\.julia\v0.4\IJulia\src\execute_request.jl:164
 in eventloop at C:\Users\Zacharias\.julia\v0.4\IJulia\src\IJulia.jl:138
 in anonymous at task.jl:447
while loading C:\Users\Zacharias\.julia\v0.4\ELM\src\base.jl, in expression starting on line 77
 in depwarn at deprecated.jl:73
 in call at deprecated.jl:50
 in include at boot.jl:261
 in include_from_node1 at loading.jl:320
 in include at boot.jl:261
 in include_from_node1 at loading.jl:320
 in require at loading.jl:259
 in include_string at loading.jl:282
 in execute_request at C:\Users\Zacharias\.julia\v0.4\IJulia\src\execute_request.jl:164
 in eventloop at C:\Users\Zacharias\.julia\v0.4\IJulia\src\IJulia.jl:138
 in anonymous at task.jl:

# Data Engineering

### Data Preparation and Representation

In [17]:
# Uninitialised N-by-1 containers, filled by the cleaning loop that follows:
# S for the cleaned subject of every email, O for its 0/1 class label.
S = Array(ASCIIString, N, 1);
O = Array(Int8, N, 1);

In [18]:
# Whitelist used when cleaning the subjects: digits, lower- and upper-case letters,
# and the blank at the end of the string.
an = "1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM " # all alphanumeric characters, plus the space

"1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM "

In [19]:
# Clean every subject line and encode its class flag.
for i = 1:N
    temp = data[i,1]

    # every character outside the whitelist `an` is replaced by a single blank; the
    # pieces are joined once instead of growing a string one character at a time
    clean = join([(ch in an ? string(ch) : " ") for ch in temp])

    S[i] = convert(ASCIIString, clean)

    # the flag is an integer when `data` was built in memory and a one-character
    # string when it was read back from the CSV file, so its text form is compared
    O[i] = (string(data[i,2]) == "1" ? 1 : 0)
end

In [20]:
# O was allocated as an N-by-1 matrix; indexing with [:] turns it into a vector.
O = O[:];  # make sure that the outputs variable is a vector

3284-element Array{Int8,1}:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

### Feature Development

In [21]:
# Lower-case words and phrases; each one becomes a 0/1 feature that tells whether it
# occurs in the (lower-cased) subject. Matching is by substring, so some entries overlap
# with others (e.g. "invest"/"investment", "per"/"perl", "zzzz"/"zzzzteana").
phrases = [
    "sale",
    "guaranteed",
    "low price",
    "zzzzteana",
    "fortune",
    "money",
    "entrepreneurs",
    "perl",
    "bug",
    "investment",
    "spam",
    "mortgage",
    "deserve",
    "account",
    "loss",
    "hiring",
    "wrong",
    "invest",
    "discount",
    "adult",
    "per",
    "cost",
    "make",
    "nothing",
    "rates",
    "need",
    "computer",
    "ringing",
    "home",
    "dude",
    "congratulations",
    "cartridges",
    "never pay",
    "clearance",
    "survey",
    "information",
    "mime",
    "linux",
    "only",
    "quotes",
    "cheap",
    "needed",
    "partnership",
    "protect",
    "join",
    "attn",
    "singles",
    "zzzz",
    "warranties",
    "urgent",
    "try",
    "poker",
    "you want",
    "adv",
    "company",
    "prescription",
    "future",
    "back to the",
    "ouch",
    "satalk",
    "java",
    "wealth",
    "better",
    "ilug social",
    "best product",
    "prices",
    "looking for",
    "ilug",
    "order",
    "protected",
    "marketing"
];

In [22]:
# Number of phrase-based features.
n = length(phrases)

71

In [23]:
# Feature matrix: one row per email, n phrase columns followed by 4 numeric columns.
I = Array(Float64, N, n + 4);

In [24]:
# Phrase-based features
# Column j of row i is 1.0 when phrase j occurs in the lower-cased subject i, else 0.0.

for i = 1:N
    title = lowercase(S[i])

    for j = 1:n
        I[i,j] = contains(title, phrases[j]) ? 1.0 : 0.0
    end
end

In [25]:
# Numeric features
# The four columns after the phrase features, computed from the cleaned subject.

for j = 1:N
    title = S[j]
    words = split(title, " ")
    I[j, n + 1] = maximum(word_length_index(words))  # maximum word length feature
    I[j, n + 2] = number_of_digits_index(title)  # number of digits index feature
    I[j, n + 3] = caps_proportion(title)  # proportion of capital letters feature
    I[j, end] = spaces_proportion(title)  # proportion of empty spaces feature
end

### Save the feature-based dataset, just in case

In [26]:
# Full path of the features file, placed in the corpus root folder.
fn = string(p, "features.csv")

"D:\\data\\Spam Assassin\\features.csv"

In [27]:
# One row per email: all the feature columns followed by the class label as last column.
writedlm(fn, hcat(I,O), ",")

### Load the feature-based dataset (in case you need to revert to the features dataset)

In [28]:
# Read the comma-separated features file back into a single matrix.
# NOTE(review): the name `IO` presumably collides with Base's IO type - confirm that
# rebinding it here does not affect later I/O calls.
IO = readdlm(fn, ',');

In [29]:
# Split the loaded matrix again: every column but the last holds features, the last
# column holds the class labels.
I = IO[:,1:(end-1)];
O = IO[:,end];

### K-Fold Cross Validation Setup

In [30]:
# Number of folds used for cross validation.
K = 10

10

In [31]:
# Split the features and labels into K folds (KFCV comes from KFCV.jl).
# Judging by how they are used below, P[i]/T[i] are presumably the training inputs and
# targets of fold i, and PT[i]/TT[i] the test inputs and targets - confirm in KFCV.jl.
P, T, PT, TT = KFCV(I, O, K);

### Dimensionality Reduction Prep

In [32]:
function apply_PCA(Itr::Array{Float64, 2}, Ite::Array{Float64, 2}, nd::Int64 = 5)
    # Fit a PCA model (at most nd output dimensions) on the training matrix only and
    # apply it to both the training and the test matrix. The inputs are transposed
    # before every MultivariateStats call and the results are transposed back.
    model = MultivariateStats.fit(PCA, Itr'; maxoutdim = nd)
    project(X) = MultivariateStats.transform(model, X')'
    return project(Itr), project(Ite)
end

apply_PCA (generic function with 2 methods)

# Data Modeling

### Parameter Definition

In [33]:
# One cell per (fold, classifier) pair, filled by the training loop.
# NOTE(review): `p` held the corpus path until this point; after this assignment the
# earlier cells that build file names with string(p, ...) no longer get a path.
Y = Array(Array, K, 3); # outputs for the random forest (RF), the ELM, and the MSTC systems, respectively
p = Array(Array, K, 3); # corresponding probabilities

10x3 Array{Array{T,N},2}:
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef
 #undef  #undef  #undef

In [34]:
# Parameters for RF
nrf = 2 # number of random features
nt = 10 # number of trees in forest
ps = 0.5 # proportion of samples in every tree
Q = sort(unique(O)) # sorted distinct label values, handed to est_prob as the class values

# Parameters for ELM
nn = 15 # number of nodes (in the hidden layer)
th = 0.2 # threshold beyond which a data point is classified as spam

# Parameter for kNN
k = 5 # passed as the last argument of apply_kNN

5

### Model Training and Application

In [35]:
# For every fold: reduce the inputs with PCA, fit three classifiers on the training
# part, and store their test predictions in Y[i, :] and their probability estimates
# in p[i, :] (column 1: random forest, column 2: ELM, column 3: kNN).
for i = 1:K
    println("Validation round: ", i)
    
    # Applying PCA...
    Itr, Ite = apply_PCA(P[i], PT[i], 15) # get the 15 first meta-features
    
    # Planting random forest...
    model = build_forest(T[i], Itr, nrf, nt, ps)
    y = apply_forest(model, Ite)
    Y[i, 1] = round(Int8, y) # the forest's outputs are rounded to integer labels
    p[i, 1] = est_prob(y, Q) # est_prob's default threshold (0.5) applies here
    
    # Building ELM...
    elm = ExtremeLearningMachine(nn)
    ELM.fit!(elm, Itr, T[i])
    y = ELM.predict(elm, Ite)
    # NOTE(review): `n` held the number of phrases earlier; this assignment presumably
    # overwrites that value when the cell runs at top level - confirm before re-running
    # the feature cells.
    n = length(y)
    temp = zeros(Int8, n)
    temp[y .>= th] = 1 # outputs of at least th are labelled 1, the rest stay 0
    Y[i, 2] = temp
    p[i, 2] = est_prob(y, Q, th)
    
    # Applying kNN classifier...
    y, p[i, 3] = apply_kNN(Itr, T[i], Ite, k)
    Y[i, 3] = round(Int8, y)
end

println("\nReady!")

Validation round: 1
Validation round: 2
Validation round: 3
Validation round: 4
Validation round: 5
Validation round: 6
Validation round: 7
Validation round: 8
Validation round: 9
Validation round: 10

Ready!


# Evaluating Results

In [36]:
FS = Array(Float64, K, 3)  # F1 Scores, one row per fold and one column per classifier

for j = 1:3
    for i = 1:K
        # CM is declared as CM(pred, gt): the predictions go first and the test targets
        # second, so the classes of the matrix follow the ground truth and its rows are
        # the true classes
        cm = CM(Y[i,j], map(Int8, TT[i]))
        FS[i,j] = F1(cm, 2) # F1 score for class of interest (spam, i.e. the 2nd one)
    end
end

In [37]:
# Column-wise mean: one average F1 score per classifier.
fs = mean(FS, 1) # average F1 scores over K-fold cross validation

1x3 Array{Float64,2}:
 0.617884  0.451812  0.545949

In [38]:
# Column index of the highest average F1 score.
ind1 = indmax(fs) # best performing classifier, for predicting the spam class

1

In [39]:
# Fold in which the selected classifier reached its highest F1 score.
ind2 = indmax(FS[:, ind1]) # best validation iteration for this classifier

5

In [40]:
# Confusion matrix of the best classifier on its best fold.
# NOTE(review): CM is declared as CM(pred, gt) but receives the test targets first here,
# so the rows of the displayed matrix presumably correspond to the predicted classes and
# the columns to the true ones - confirm the intended orientation.
CM(map(Int8, TT[ind2]), Y[ind2,ind1])

2x2 Array{Int64,2}:
 275  20
   4  29