# Initialization

### Packages

In [1]:
Pkg.add("DecisionTree")

INFO: Nothing to be done
INFO: METADATA is out-of-date — you may not have the latest version of DecisionTree
INFO: Use `Pkg.update()` to get the latest versions of your packages


In [2]:
Pkg.add("BackpropNeuralNet")

INFO: Nothing to be done
INFO: METADATA is out-of-date — you may not have the latest version of BackpropNeuralNet
INFO: Use `Pkg.update()` to get the latest versions of your packages


In [None]:
Pkg.clone("git://github.com/lepisma/ELM.jl.git")

In [3]:
Pkg.add("GLM")

INFO: Nothing to be done
INFO: METADATA is out-of-date — you may not have the latest version of GLM
INFO: Use `Pkg.update()` to get the latest versions of your packages


In [4]:
Pkg.update()

INFO: Updating METADATA...
INFO: Updating cache of MbedTLS...
INFO: Updating cache of ZMQ...
INFO: Updating cache of URIParser...
INFO: Updating cache of Grid...
INFO: Updating cache of JuliaParser...
INFO: Updating cache of JuMP...
INFO: Updating cache of HDF5...
INFO: Updating cache of ReverseDiffSparse...
INFO: Updating cache of MathProgBase...
INFO: Updating GeneratedTypes...
INFO: Updating GeneratedTables...
INFO: Updating TSne...
INFO: Updating GraphPlot...
INFO: Updating ComputeFramework...
INFO: Updating ELM...
INFO: Updating PlotlyJS...
INFO: Updating ROC...
INFO: Computing changes...
INFO: Cloning cache of Primes from git://github.com/JuliaMath/Primes.jl.git
INFO: Upgrading ForwardDiff: v0.1.8 => v0.2.2
INFO: Upgrading Grid: v0.4.1 => v0.4.2
INFO: Upgrading HDF5: v0.6.4 => v0.6.5
INFO: Upgrading Images: v0.5.7 => v0.5.8
INFO: Upgrading JuMP: v0.13.2 => v0.14.0
INFO: Upgrading MathProgBase: v0.5.1 => v0.5.2
INFO: Upgrading MbedTLS: v0.2.5 => v0.2.6
INFO: Installing Primes v0.1

### Additional Scripts and Functions

In [5]:
include("sample.jl")

sample (generic function with 4 methods)

In [6]:
include("normalize.jl")

normalize (generic function with 4 methods)

In [7]:
function vector2ANN{T <: Any}(z::Array{T,1}, Q::Array{T,1})
    # turn a vector output into a matrix one for an ANN to work with
    noc = length(Q) # number of classes
    N = length(z) # number of data points
    Z = BitArray(N, noc)
    
    for i = 1:noc
        Z[:,i] = (z .== Q[i])
    end
    
    return float(Z)
end

vector2ANN (generic function with 1 method)

In [8]:
function ANN2vector{T <: Any}(Z::Array{Float64, 2}, Q::Array{T,1})
    # turn a matrix one from an ANN into a vector output
    N, noc = size(Z)
    z = Array(T, N)
    p = Array(Float64, N)
    
    for i = 1:N
        p[i], ind = findmax(Z[i,:])
        z[i] = Q[ind]
    end
    
    return z, p
end

ANN2vector (generic function with 1 method)

In [9]:
MSE(t::Array{Float64, 1}, y::Array{Float64, 1}) = mean((t-y).^2)

MSE (generic function with 1 method)

### Datasets and Sampling

In [10]:
magic = readcsv("D:\\data\\Magic\\magic04.csv")

19020x11 Array{Any,2}:
  28.7967   16.0021  2.6449  0.3918  …   -8.2027  40.092    81.8828  "g"
  31.6036   11.7235  2.5185  0.5303      -9.9574   6.3609  205.261   "g"
 162.052   136.031   4.0612  0.0374     -45.216   76.96    256.788   "g"
  23.8172    9.5728  2.3385  0.6147      -7.1513  10.449   116.737   "g"
  75.1362   30.9205  3.1611  0.3168      21.8393   4.648   356.462   "g"
  51.624    21.1502  2.9085  0.242   …    9.8145   3.613   238.098   "g"
  48.2468   17.3565  3.0332  0.2529      10.5868   4.792   219.087   "g"
  26.7897   13.7595  2.5521  0.4236      -2.9292   0.812   237.134   "g"
  96.2327   46.5165  4.154   0.0779      43.1844   4.854   248.226   "g"
  46.7619   15.1993  2.5786  0.3377      -6.6812   7.875   102.251   "g"
  62.7766   29.9104  3.3331  0.2475  …   23.771    9.9144  323.094   "g"
  18.8562   16.46    2.4385  0.5282     -16.9327  11.461   162.848   "g"
  45.6321   22.71    3.0441  0.2213     -14.3164   0.3822  178.255   "g"
   ⋮                        

In [11]:
N = size(magic, 1)
n = round(Int64, N / 10) # number of points in the (stratified) sample for the test set

1902

In [12]:
temp = sample(collect(1:N), magic[:,11], n) # indexes of test set & corresponding labels
ind = temp[1][:] # indexes of test set only, in a 1-dim array format
ind_ = setdiff(collect(1:N), ind) # indexes of training set

17118-element Array{Int64,1}:
     1
     2
     3
     4
     5
     6
     7
     8
    10
    11
    12
    13
    14
     ⋮
 19007
 19008
 19010
 19011
 19012
 19013
 19014
 19015
 19017
 19018
 19019
 19020

In [13]:
F_magic = map(float, magic[:,1:(end-1)]) # all features of the dataset
T_magic = magic[:,end] # all targets (labels) of the dataset

19020-element Array{Any,1}:
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 ⋮  
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"

In [14]:
F_magic = normalize(F_magic, "stat")

19020x10 Array{Float64,2}:
 -0.577211   -0.336795   -0.38112      …  -0.405831   0.476803  -1.49783  
 -0.510955   -0.570012   -0.648578        -0.490081  -0.815397   0.153121 
  2.56821     6.2057      2.61571         -2.18297    1.88917    0.842613 
 -0.694749   -0.687241   -1.02945         -0.35535   -0.658786  -1.03144  
  0.516609    0.476371    0.711138         1.03659   -0.881016   2.17637  
 -0.0383845  -0.0561846   0.176647     …   0.459239  -0.920666   0.592519 
 -0.118102   -0.26297     0.440507         0.49632   -0.875499   0.338129 
 -0.624585   -0.459034   -0.577481        -0.152632  -1.02797    0.579619 
  1.01458     1.32647     2.81208          2.06145   -0.873124   0.728043 
 -0.153152   -0.380554   -0.521409        -0.332779  -0.757393  -1.22528  
  0.224867    0.421313    1.07508      …   1.12934   -0.679266   1.72987  
 -0.811851   -0.311836   -0.817855        -0.82499   -0.620018  -0.414416 
 -0.17982     0.0288364   0.463571        -0.699372  -1.04443   -0.208252

In [15]:
Q = sort(unique(T_magic)) # all classes of the dataset, sorted in ascending order

2-element Array{Any,1}:
 "g"
 "h"

In [16]:
# training and testing set for all classification experiments
P1 = F_magic[ind_,:]
T1 = T_magic[ind_]
PT1 = F_magic[ind,:]
TT1 = T_magic[ind]

1902-element Array{Any,1}:
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 ⋮  
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"

In [17]:
ONP = readcsv("D:\\data\\OnlineNewsPopularity\\OnlineNewsPopularity.csv")

39645x61 Array{Any,2}:
 "url"                                                                                    …      " shares"
 "http://mashable.com/2013/01/07/amazon-instant-video-browser/"                               593         
 "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/"                                711         
 "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/"                            1500         
 "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/"                                  1200         
 "http://mashable.com/2013/01/07/att-u-verse-apps/"                                       …   505         
 "http://mashable.com/2013/01/07/beewi-smart-toys/"                                           855         
 "http://mashable.com/2013/01/07/bodymedia-armbandgets-update/"                               556         
 "http://mashable.com/2013/01/07/canon-poweshot-n/"                                           891         
 "http://masha

In [18]:
var_names = ONP[1,2:end]

1x60 Array{Any,2}:
 " timedelta"  " n_tokens_title"  " n_tokens_content"  …  " shares"

In [19]:
N = size(ONP, 1) - 1 # number of data points in ONP dataset (without the header row)
n = round(Int64, N / 10) # number of points in the (basic) sample for the test set
nd = size(ONP, 2) - 2 # number of dimensions (without the target variable or the index)

59

In [20]:
R = randperm(N)
ind = R[1:n][:] # indexes of test set only, in a 1-dim array format
ind_ = R[(n+1):end][:] # indexes of training set only, in a 1-dim array format

35680-element Array{Int64,1}:
 29232
 25552
 36775
 22201
 37819
 30583
 15546
 18663
 13987
 30997
 29378
  2546
 19056
     ⋮
 39062
 33314
 17527
 24802
 26719
 10525
 17656
  9816
  8849
 23435
  2520
 10241

In [21]:
F_ONP = map(Float64, ONP[2:end,2:(end-1)])
T_ONP = map(Float64,ONP[2:end,end])

39644-element Array{Float64,1}:
  593.0
  711.0
 1500.0
 1200.0
  505.0
  855.0
  556.0
  891.0
 3600.0
  710.0
 2200.0
 1900.0
  823.0
    ⋮  
 1700.0
 1500.0
 1000.0
 1300.0
 1700.0
 1400.0
 1200.0
 1800.0
 1900.0
 1900.0
 1100.0
 1300.0

In [22]:
F_ONP = normalize(F_ONP, "stat")

39644x59 Array{Float64,2}:
  1.75786   0.757438  -0.695202   …  -0.97542   -1.8107     0.138918 
  1.75786  -0.661648  -0.618786      -0.269073   0.837738  -0.689649 
  1.75786  -0.661648  -0.712183      -0.269073   0.837738  -0.689649 
  1.75786  -0.661648  -0.0329325     -0.269073   0.837738  -0.689649 
  1.75786   1.23047    1.11543        0.244634  -1.56993   -0.0870549
  1.75786  -0.18862   -0.37468    …   0.538181  -1.054      0.257285 
  1.75786  -1.13468    0.877688      -0.269073   0.837738  -0.689649 
  1.75786   0.757438   0.939245       1.61452    0.837738   1.51986  
  1.75786   0.284409  -0.954166      -0.269073   0.17563   -0.689649 
  1.75786  -0.18862   -0.66973       -0.269073   0.837738  -0.689649 
  1.75786  -0.661648   1.48901    …  -0.269073   0.837738  -0.689649 
  1.75786  -0.18862   -0.763127      -0.269073   0.837738  -0.689649 
  1.75786  -0.661648  -0.578456      -4.03626    0.837738   3.72938  
  ⋮                               ⋱                            

In [23]:
# training and testing set for all regression experiments
P2 = F_ONP[ind_,:]
T2 = T_ONP[ind_]
PT2 = F_ONP[ind,:]
TT2 = T_ONP[ind]

3964-element Array{Float64,1}:
   739.0
   746.0
 32700.0
  2200.0
 11400.0
  1300.0
  1800.0
  6200.0
  1300.0
  1500.0
  5800.0
  1200.0
   957.0
     ⋮  
   701.0
   671.0
  1000.0
  1300.0
  1100.0
  7800.0
  1300.0
   923.0
  6300.0
  1200.0
  2100.0
  2400.0

# Tree-Based Systems

In [24]:
using DecisionTree

INFO: Recompiling stale cache file C:\Users\Zacharias\.julia\lib\v0.4\ScikitLearnBase.ji for module ScikitLearnBase.


In [25]:
CP = 0.9 # combined purity parameter
K = 5 # number of cross-validations

5

## Decision Trees

In [26]:
td = 3 # tree depth for viewing purposes (single decision tree)
nl = 5 # number of leaves to be used for averaging in regression applications

5

In [27]:
model = build_tree(T1, P1)
model = prune_tree(model, CP)

Decision Tree
Leaves: 1512
Depth:  37

In [28]:
print_tree(model, td)

Feature 9, Threshold -0.3096661121768262
L-> Feature 1, Threshold 1.525274809976256
    L-> Feature 9, Threshold -0.6886748399536539
        L-> 
        R-> 
    R-> Feature 7, Threshold -2.662004522793831
        L-> h : 96/96
        R-> 
R-> Feature 1, Threshold -0.16381866376775667
    L-> Feature 3, Threshold -0.9316932291591371
        L-> 
        R-> 
    R-> Feature 1, Threshold 0.42999901916694716
        L-> 
        R-> 


In [29]:
pred = apply_tree(model, PT1)

1902-element Array{Any,1}:
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 ⋮  
 "g"
 "h"
 "h"
 "h"
 "h"
 "g"
 "h"
 "g"
 "h"
 "h"
 "h"
 "h"

In [30]:
prob = apply_tree_proba(model, PT1, Q)

1902x2 Array{Float64,2}:
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 1.0         0.0       
 0.997389    0.00261097
 ⋮                     
 0.909091    0.0909091 
 0.0         1.0       
 0.0         1.0       
 0.0         1.0       
 0.00275482  0.997245  
 1.0         0.0       
 0.0         1.0       
 0.967742    0.0322581 
 0.0         1.0       
 0.0         1.0       
 0.0         1.0       
 0.0         1.0       

In [31]:
validation = nfoldCV_tree(T_magic, F_magic, CP, K)


Fold 1


2x2 Array{Int64,2}:
 2154  346
  346  958

Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8180862250262881
Kappa:    0.5962625766871166


2x2 Array{Int64,2}:
 2107   322
  334  1041


Fold 2
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8275499474237644
Kappa:    0.6257108410820772


2x2 Array{Int64,2}:
 2107   338
  344  1015


Fold 3
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8207150368033649
Kappa:    0.6092287476084479


2x2 Array{Int64,2}:
 2134  319
  357  994


Fold 4
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8222923238696109
Kappa:    0.6095576613276689


2x2 Array{Int64,2}:
 2145  360
  324  975


Fold 5
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8201892744479495
Kappa:    0.6028431148977792

Mean Accuracy: 0.8217665615141957


5-element Array{Float64,1}:
 0.818086
 0.82755 
 0.820715
 0.822292
 0.820189

## Regression Trees

In [32]:
model = build_tree(T2, P2, nl)

Decision Tree
Leaves: 12247
Depth:  44

In [33]:
pred = apply_tree(model, PT2)

3964-element Array{Float64,1}:
   806.6 
  2366.67
  3525.0 
 72377.7 
  3200.0 
  2575.0 
  1800.0 
  4400.0 
  1269.4 
  2166.67
  2200.0 
  3033.33
   474.2 
     ⋮   
  1125.0 
  1800.0 
   947.25
  2133.33
 16066.7 
  1260.0 
  1050.0 
  1615.25
  1200.0 
   814.0 
  6400.0 
  2550.0 

In [34]:
MSE(pred, TT2)

5.1912139599959624e8

## Random Forests

In [35]:
nrf = 2 # number of random features
nt = 10 # number of trees in forest
ps = 0.5 # proportion of samples in every tree

0.5

In [36]:
model = build_forest(T1, P1, nrf, nt, ps)

Ensemble of Decision Trees
Trees:      10
Avg Leaves: 1023.8
Avg Depth:  30.0

In [37]:
pred = apply_forest(model, PT1)

1902-element Array{Any,1}:
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 "g"
 ⋮  
 "g"
 "h"
 "h"
 "h"
 "h"
 "g"
 "h"
 "h"
 "h"
 "h"
 "h"
 "h"

In [38]:
prob = apply_forest_proba(model, PT1, Q)

1902x2 Array{Float64,2}:
 0.8  0.2
 1.0  0.0
 0.8  0.2
 1.0  0.0
 0.9  0.1
 0.9  0.1
 1.0  0.0
 1.0  0.0
 0.9  0.1
 0.7  0.3
 1.0  0.0
 0.7  0.3
 1.0  0.0
 ⋮       
 0.8  0.2
 0.0  1.0
 0.0  1.0
 0.2  0.8
 0.0  1.0
 1.0  0.0
 0.1  0.9
 0.2  0.8
 0.0  1.0
 0.4  0.6
 0.2  0.8
 0.0  1.0

In [39]:
validation = nfoldCV_forest(T_magic, F_magic, nrf, nt, K, ps)

2x2 Array{Int64,2}:
 2307  133
  404  960


Fold 1
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8588328075709779
Kappa:    0.6790515975241032


2x2 Array{Int64,2}:
 2357  123
  392  932


Fold 2
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8646161934805467
Kappa:    0.6868548272111783


2x2 Array{Int64,2}:
 2319  110
  399  976


Fold 3
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8661934805467929
Kappa:    0.6962835663006202


2x2 Array{Int64,2}:
 2328  167
  391  918


Fold 4
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8533123028391167
Kappa:    0.6612593887951415


2x2 Array{Int64,2}:
 2336  152
  383  933


Fold 5
Classes:  Any["g","h"]
Matrix:   
Accuracy: 0.8593585699263933
Kappa:    0.675813564365457

Mean Accuracy: 0.8604626708727656


5-element Array{Float64,1}:
 0.858833
 0.864616
 0.866193
 0.853312
 0.859359

In [40]:
model = build_forest(T2, P2, nrf, nt, nl, ps)

Ensemble of Decision Trees
Trees:      10
Avg Leaves: 5652.2
Avg Depth:  37.8

In [41]:
pred = apply_forest(model, PT2)

3964-element Array{Float64,1}:
 6384.18 
 2504.51 
 2830.67 
 3159.65 
 2479.29 
 2587.19 
 2772.36 
 1314.98 
 2704.85 
 1598.88 
 2097.03 
 4256.26 
 3316.14 
    ⋮    
  930.389
 2520.79 
 4332.51 
 1959.95 
 6498.45 
 2294.46 
 2637.75 
 9635.65 
 3955.04 
 2074.69 
 5369.12 
 1257.9  

In [42]:
validation = nfoldCV_forest(T_ONP, F_ONP,  nrf, nt, K, nl, ps)


Fold 1
Mean Squared Error:     2.0536091199879307e8
Correlation Coeff:      0.06139627058722553
Coeff of Determination: -0.03201768027005869

Fold 2
Mean Squared Error:     1.3939776799347305e8
Correlation Coeff:      0.06160318369830016
Coeff of Determination: -0.04281211412546537

Fold 3
Mean Squared Error:     1.3486143540915173e8
Correlation Coeff:      0.10814284917909767
Coeff of Determination: -0.033410948715535405

Fold 4
Mean Squared Error:     1.5888834245165324e8
Correlation Coeff:      0.09772586699586175
Coeff of Determination: -0.05876718165288364

Fold 5
Mean Squared Error:     7.194823294213662e7
Correlation Coeff:      0.0766566110378445
Coeff of Determination: -0.14780943425051518

Mean Coeff of Determination: -0.06296347180289166


5-element Array{Float64,1}:
 -0.0320177
 -0.0428121
 -0.0334109
 -0.0587672
 -0.147809 

# Network-Based Systems

## Basic Neural Networks

In [43]:
nin = size(F_magic,2) # number of input nodes
nhln = 2*nin # number hidden layer nodes
non = length(Q) # number of output nodes
ne = 1000 # number of epochs for training

1000

In [44]:
N = length(T1)
n = size(PT1,1)
noc = length(Q)

2

In [45]:
T3 = vector2ANN(T1, Q)

17118x2 Array{Float64,2}:
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 ⋮       
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0

In [46]:
using BackpropNeuralNet

In [47]:
net = init_network([nin, nhln, non]);

In [48]:
for j = 1:ne
    if mod(j,20) == 0; println(j); end # this is just to show how the training progresses, since the whole process may take a while...
    
    for i = 1:N
        a = P1[i,:]
        b = T3[i,:]
        train(net, a[:], b[:])
    end
end

20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000


In [49]:
T4 = Array(Float64, n, noc)

for i = 1:n
    T4[i,:] = net_eval(net, PT1[i,:][:])
end

In [50]:
pred, prob = ANN2vector(T4, Q)

(Any["h","g","h","h","h","h","h","h","h","h"  …  "h","h","h","h","h","h","h","h","h","h"],[1.0,0.948883,1.0,0.999995,1.0,1.0,1.0,1.0,0.999288,1.0  …  1.0,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])

In [51]:
sum(pred .== TT1) / n # accuracy of ANN

0.41798107255520506

## Extreme Learning Machines

In [52]:
T5 = ones(N)
T6 = ones(n)

1902-element Array{Float64,1}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 ⋮  
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0

In [53]:
ind_h = (T1 .== "h")

17118-element BitArray{1}:
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
     ⋮
  true
  true
  true
  true
  true
  true
  true
  true
  true
  true
  true
  true

In [54]:
T5[ind_h] = 2.0

2.0

In [55]:
ind_h = (TT1 .== "h")

1902-element BitArray{1}:
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
 false
     ⋮
  true
  true
  true
  true
  true
  true
  true
  true
  true
  true
  true
  true

In [56]:
T6[ind_h] = 2.0

2.0

In [60]:
using ELM

In [58]:
elm = ExtremeLearningMachine(50);

In [59]:
ELM.fit!(elm, P1, T5)

1x50 Array{Float64,2}:
 -0.379202  0.0371626  0.196451  0.101317  …  0.488155  0.319478  0.348333

In [61]:
pred = round(ELM.predict(elm, PT1))

1902-element Array{Float64,1}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 ⋮  
 1.0
 2.0
 2.0
 2.0
 2.0
 1.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0

In [62]:
sum(pred .== T6) / n # accuracy of ELM

0.8196635120925342

# Regression

In [63]:
using DataFrames # the GLM package works only with data frames so our Array data is inapplicable

In [64]:
ONP = map(Float64, ONP[2:end,2:end]);

In [65]:
data = DataFrame(ONP)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36,x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49,x50,x51,x52,x53,x54,x55,x56,x57,x58,x59,x60
1,731.0,12.0,219.0,0.663594466988,0.999999992308,0.815384609112,4.0,2.0,1.0,0.0,4.6803652968,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,496.0,496.0,496.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331204081,0.378278929586,0.0400046751006,0.0412626477296,0.0401225435029,0.521617145481,0.0925619834711,0.0456621004566,0.013698630137,0.769230769231,0.230769230769,0.378636363636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593.0
2,731.0,9.0,255.0,0.604743080614,0.999999993289,0.79194630341,3.0,1.0,1.0,0.0,4.9137254902,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.799755687423,0.0500466753998,0.0500962518137,0.0501006734234,0.0500007119405,0.341245791246,0.148947811448,0.043137254902,0.0156862745098,0.733333333333,0.266666666667,0.286914600551,0.0333333333333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711.0
3,731.0,9.0,211.0,0.575129530699,0.999999991597,0.66386554064,3.0,1.0,1.0,0.0,4.39336492891,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,918.0,918.0,918.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217792288518,0.033334456999,0.0333514249339,0.0333335358046,0.682188293744,0.702222222222,0.323333333333,0.0568720379147,0.00947867298578,0.857142857143,0.142857142857,0.495833333333,0.1,1.0,-0.466666666667,-0.8,-0.133333333333,0.0,0.0,0.5,0.0,1500.0
4,731.0,9.0,531.0,0.503787877834,0.999999996904,0.665634672862,9.0,0.0,1.0,0.0,4.40489642185,7.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0285732164707,0.419299641782,0.49465082574,0.0289047184252,0.0285715975818,0.42984968735,0.100704665705,0.0414312617702,0.0207156308851,0.666666666667,0.333333333333,0.385965171192,0.136363636364,0.8,-0.369696969697,-0.6,-0.166666666667,0.0,0.0,0.5,0.0,1200.0
5,731.0,13.0,1072.0,0.41564561695,0.999999998565,0.540889525766,19.0,19.0,20.0,0.0,4.6828358209,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,545.0,16000.0,3151.15789474,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0286328101715,0.0287935517322,0.0285751849112,0.028571675324,0.885426777861,0.513502122877,0.281003475691,0.0746268656716,0.0121268656716,0.860215053763,0.139784946237,0.411127435065,0.0333333333333,1.0,-0.220192307692,-0.5,-0.05,0.454545454545,0.136363636364,0.0454545454545,0.136363636364,505.0
6,731.0,10.0,370.0,0.559888577828,0.999999995495,0.698198195053,2.0,2.0,0.0,0.0,4.35945945946,9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8500.0,8500.0,8500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0222452755449,0.306717575824,0.0222312775078,0.0222242903103,0.626581580813,0.437408648699,0.0711841921519,0.0297297297297,0.027027027027,0.52380952381,0.47619047619,0.350609996065,0.136363636364,0.6,-0.195,-0.4,-0.1,0.642857142857,0.214285714286,0.142857142857,0.214285714286,855.0
7,731.0,8.0,960.0,0.418162618355,0.999999998339,0.54983388613,21.0,20.0,20.0,0.0,4.65416666667,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,545.0,16000.0,3151.15789474,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0200816655822,0.114705387413,0.0200243688545,0.0200153281713,0.825173249979,0.514480300844,0.268302724212,0.0802083333333,0.0166666666667,0.827956989247,0.172043010753,0.402038567493,0.1,1.0,-0.224479166667,-0.5,-0.05,0.0,0.0,0.5,0.0,556.0
8,731.0,12.0,989.0,0.433573634981,0.999999998415,0.572107764545,20.0,20.0,20.0,0.0,4.61779575329,9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,545.0,16000.0,3151.15789474,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0222243573751,0.150732973266,0.243435475507,0.0222236031538,0.561383590699,0.543474234099,0.298613469863,0.0839231547017,0.0151668351871,0.84693877551,0.15306122449,0.427720492359,0.1,1.0,-0.242777777778,-0.5,-0.05,1.0,0.5,0.5,0.5,891.0
9,731.0,11.0,97.0,0.670103085875,0.999999979592,0.836734676801,2.0,0.0,0.0,0.0,4.85567010309,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458250415016,0.0289794309233,0.0286618834452,0.0296958587737,0.454412411842,0.538888888889,0.161111111111,0.0309278350515,0.020618556701,0.6,0.4,0.566666666667,0.4,0.8,-0.125,-0.125,-0.125,0.125,0.0,0.375,0.0,3600.0
10,731.0,10.0,231.0,0.636363633609,0.999999992754,0.797101443499,4.0,1.0,1.0,1.0,5.09090909091,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0400000958117,0.0400000263554,0.839997207202,0.0400006328035,0.0400020378274,0.313888888889,0.0518518518519,0.038961038961,0.030303030303,0.5625,0.4375,0.298412698413,0.1,0.5,-0.238095238095,-0.5,-0.1,0.0,0.0,0.5,0.0,710.0


In [66]:
for i = 1:(nd+1)
    rename!(data, names(data)[i], symbol(var_names[i][2:end]))
end

In [67]:
names(data)

60-element Array{Symbol,1}:
 :timedelta                   
 :n_tokens_title              
 :n_tokens_content            
 :n_unique_tokens             
 :n_non_stop_words            
 :n_non_stop_unique_tokens    
 :num_hrefs                   
 :num_self_hrefs              
 :num_imgs                    
 :num_videos                  
 :average_token_length        
 :num_keywords                
 :data_channel_is_lifestyle   
 ⋮                            
 :rate_negative_words         
 :avg_positive_polarity       
 :min_positive_polarity       
 :max_positive_polarity       
 :avg_negative_polarity       
 :min_negative_polarity       
 :max_negative_polarity       
 :title_subjectivity          
 :title_sentiment_polarity    
 :abs_title_subjectivity      
 :abs_title_sentiment_polarity
 :shares                      

In [68]:
using GLM

INFO: Recompiling stale cache file C:\Users\Zacharias\.julia\lib\v0.4\Distributions.ji for module Distributions.


In [69]:
join(var_names, " +")

" timedelta + n_tokens_title + n_tokens_content + n_unique_tokens + n_non_stop_words + n_non_stop_unique_tokens + num_hrefs + num_self_hrefs + num_imgs + num_videos + average_token_length + num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment + data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech + data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min + kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg + self_reference_min_shares + self_reference_max_shares + self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday + weekday_is_thursday + weekday_is_friday + weekday_is_saturday + weekday_is_sunday + is_weekend + LDA_00 + LDA_01 + LDA_02 + LDA_03 + LDA_04 + global_subjectivity + global_sentiment_polarity + global_rate_positive_words + global_rate_negative_words + rate_positive_words + rate_negative_words + avg_positive_polarity + min_positive_polarity + max_positive_polarity + avg_negat

In [70]:
model1 = fit(LinearModel, shares ~ timedelta + n_tokens_title + n_tokens_content + n_unique_tokens + n_non_stop_words + n_non_stop_unique_tokens + num_hrefs + num_self_hrefs + num_imgs + num_videos + average_token_length + num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment + data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech + data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min + kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg + self_reference_min_shares + self_reference_max_shares + self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday + weekday_is_thursday + weekday_is_friday + weekday_is_saturday + weekday_is_sunday + is_weekend + LDA_00 + LDA_01 + LDA_02 + LDA_03 + LDA_04 + global_subjectivity + global_sentiment_polarity + global_rate_positive_words + global_rate_negative_words + rate_positive_words + rate_negative_words + avg_positive_polarity + min_positive_polarity + max_positive_polarity + avg_negative_polarity + min_negative_polarity + max_negative_polarity + title_subjectivity + title_sentiment_polarity + abs_title_subjectivity + abs_title_sentiment_polarity, data[ind_,:])

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredQR{Float64}},Float64}

Formula: shares ~ 1 + timedelta + n_tokens_title + n_tokens_content + n_unique_tokens + n_non_stop_words + n_non_stop_unique_tokens + num_hrefs + num_self_hrefs + num_imgs + num_videos + average_token_length + num_keywords + data_channel_is_lifestyle + data_channel_is_entertainment + data_channel_is_bus + data_channel_is_socmed + data_channel_is_tech + data_channel_is_world + kw_min_min + kw_max_min + kw_avg_min + kw_min_max + kw_max_max + kw_avg_max + kw_min_avg + kw_max_avg + kw_avg_avg + self_reference_min_shares + self_reference_max_shares + self_reference_avg_sharess + weekday_is_monday + weekday_is_tuesday + weekday_is_wednesday + weekday_is_thursday + weekday_is_friday + weekday_is_saturday + weekday_is_sunday + is_weekend + LDA_00 + LDA_01 + LDA_02 + LDA_03 + LDA_04 + global_subjectivity + global_sentiment_polarity + global_rate_positive_words + global_rate_negat

In [71]:
model2 = fit(LinearModel, shares ~ timedelta + n_tokens_title + n_tokens_content + num_hrefs + num_self_hrefs + average_token_length + data_channel_is_lifestyle + data_channel_is_entertainment + data_channel_is_bus + kw_max_min + kw_min_max + kw_min_avg + self_reference_min_shares + global_subjectivity, data[ind_,:])

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredQR{Float64}},Float64}

Formula: shares ~ 1 + timedelta + n_tokens_title + n_tokens_content + num_hrefs + num_self_hrefs + average_token_length + data_channel_is_lifestyle + data_channel_is_entertainment + data_channel_is_bus + kw_max_min + kw_min_max + kw_min_avg + self_reference_min_shares + global_subjectivity

Coefficients:
                                  Estimate  Std.Error  t value Pr(>|t|)
(Intercept)                        2763.93    493.052  5.60575    <1e-7
timedelta                         0.937314   0.298315  3.14203   0.0017
n_tokens_title                     112.097    29.6459  3.78118   0.0002
n_tokens_content                 -0.150399    0.14529 -1.03516   0.3006
num_hrefs                          58.2279    6.26845  9.28904   <1e-19
num_self_hrefs                    -76.1849    17.2443 -4.41799    <1e-5
average_token_length              -930.849    91.0515 -10.2233   <1e-23
d

In [72]:
model3 = fit(LinearModel, shares ~ timedelta + n_tokens_title + num_hrefs + num_self_hrefs + average_token_length + data_channel_is_entertainment + kw_max_min + kw_min_avg + self_reference_min_shares + global_subjectivity, data[ind_,:])

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredQR{Float64}},Float64}

Formula: shares ~ 1 + timedelta + n_tokens_title + num_hrefs + num_self_hrefs + average_token_length + data_channel_is_entertainment + kw_max_min + kw_min_avg + self_reference_min_shares + global_subjectivity

Coefficients:
                                Estimate  Std.Error  t value Pr(>|t|)
(Intercept)                      2702.41    491.744  5.49557    <1e-7
timedelta                        0.93205   0.296252  3.14614   0.0017
n_tokens_title                   111.964    29.6119  3.78105   0.0002
num_hrefs                        56.3832    5.88885  9.57458   <1e-20
num_self_hrefs                  -76.0022    16.9134 -4.49362    <1e-5
average_token_length            -950.672    90.2949 -10.5285   <1e-25
data_channel_is_entertainment   -633.175    158.874 -3.98539    <1e-4
kw_max_min                     0.0708171  0.0155634  4.55024    <1e-5
kw_min_avg                   

In [73]:
w = coef(model3) # weights

11-element Array{Float64,1}:
 2702.41     
    0.93205  
  111.964    
   56.3832   
  -76.0022   
 -950.672    
 -633.175    
    0.0708171
    0.325455 
    0.0319376
 6046.36     

In [74]:
PT3 = convert(Array,data[ind, [:timedelta, :n_tokens_title, :num_hrefs, :num_self_hrefs, :average_token_length, :data_channel_is_entertainment, :kw_max_min, :kw_min_avg, :self_reference_min_shares, :global_subjectivity]])

3964x10 Array{Float64,2}:
 393.0  13.0  16.0   4.0  4.64626  1.0  5600.0     0.0     979.0  0.539005
 547.0  14.0  11.0   8.0  4.77535  1.0  1100.0  1150.0     790.0  0.446161
 689.0   9.0   7.0   2.0  4.71183  0.0   382.0  1227.38    382.0  0.525177
  53.0  12.0  45.0   5.0  5.05806  0.0   489.0  3062.03   3200.0  0.478977
 722.0   9.0   4.0   1.0  4.55634  0.0   400.0     0.0       0.0  0.418182
 209.0  10.0  21.0   8.0  4.68707  0.0   734.0     0.0     839.0  0.465273
  10.0  10.0   2.0   2.0  4.7765   0.0   561.0     0.0   10800.0  0.327381
  62.0  14.0   6.0   3.0  4.74595  0.0   444.0  1611.82   1300.0  0.4844  
 124.0  15.0   3.0   3.0  4.56752  0.0   649.0     0.0    2700.0  0.448271
 276.0  12.0   7.0   3.0  4.76037  0.0   441.0  1826.52    967.0  0.554713
 531.0   6.0   4.0   4.0  4.28507  0.0  1100.0  2440.0    1100.0  0.518265
 393.0  10.0   9.0   3.0  4.63946  0.0   784.0     0.0    1900.0  0.602778
 715.0  11.0  15.0   4.0  5.03823  0.0   300.0     0.0    1100.0  0.515444

In [75]:
pred = PT3 * w[2:end] + w[1]

3964-element Array{Float64,1}:
 3758.98
 2794.02
 3729.66
 5473.5 
 2757.78
 3029.0 
 1615.36
 3452.54
 2938.97
 3954.91
 3757.7 
 3818.0 
 3525.4 
    ⋮   
 1295.31
 2363.1 
 3450.91
 2466.66
 3616.83
 5387.49
 3147.47
 4798.95
 4050.64
 2861.23
 4464.78
 3251.69

In [76]:
MSE(pred, TT2)

1.8275045910710382e8