In [1]:
using DelimitedFiles

In [2]:
# Parse the data-set file into a matrix, one row per line of the file;
# text following a '#' is skipped as a comment.
data = readdlm("../Set2018/S2_Data_set.txt"; comments=true, comment_char='#');

In [3]:
#--------|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
# col_nu | column_tag              | column_description                                                                                                                                                                         |
#--------|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
#      1 | subset_type             | {test, valid, train}                                                                                                                                                                       |
#      2 | protein_count_in_subset | 1, 2 ... number_of_proteins_in_a_set                                                                                                                                                       |
#      3 | pdb_code_plus_chain_id  | pdb_code + chain_id, for example 1BC2 + H = 1BC2H                                                                                                                                          |
#      4 | complete_chain_length   | length of the complete chain sequence, for example 567                                                                                                                                     |
#      5 | complete_aa_seq         | single string of amino-acid sequence made of 21-letter alphabet (20 standard amino acids + X where X is any modified amino acid which cannot be represented as standard amino acid analog) |
#      6 | DSSP_8_label_ss         | single string of 8 secondary-structure labels: {H, E, C, T, G, S, B, I} + X(disordered) where no labels are changed                                                                        |
#      7 | Rule_1_3_label_ss       | single string of 3 secondary-structure labels: {H, E, C}                + X(disordered) where {C, S, T}          -> C, {H, I, G} -> H, {E, B} -> E                                         |
#      8 | Rule_2_3_label_ss       | single string of 3 secondary-structure labels: {H, E, C}                + X(disordered) where {C, S, B, T, I, G} -> C, {H}       -> H, {E}    -> E                                         |
#      9 | 4_label_ss              | single string of 4 secondary-structure labels: {H, E, C, T}             + X(disordered) where {C, S, B, G}       -> C, {H, I}    -> H, {E}    -> E, {T} -> T                               |
#     10 | 5_label_ss              | single string of 5 secondary-structure labels: {H, E, C, T, G}          + X(disordered) where {C, S, B}          -> C, {H, I}    -> H, {E}    -> E, {T} -> T, {G} -> G                     |
#--------|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

# Split the parsed table into one vector per column; column k of `data`
# corresponds to row `col_nu = k` of the column table above.
d_subset_type = data[:, 1]
# Columns 2 and 4 hold counts/lengths and are converted element-wise to Int.
d_protein_count_in_subset = Int.(data[:, 2])
d_pdb_code_plus_chain_id = data[:, 3]
d_complete_chain_length = Int.(data[:, 4])
# Column 5 is the amino-acid sequence; columns 6-10 are the secondary-structure
# label strings under the different labelling rules.
d_complete_aa_seq = data[:, 5]
d_dssp_8_label_ss = data[:, 6]
d_rule_1_3_label_ss = data[:, 7]
d_rule_2_3_label_ss = data[:, 8]
d_4_label_ss = data[:, 9]
d_5_label_ss = data[:, 10];

In [4]:
# 1-of-K (one-hot) coding of amino acid sequences.
# Column k of an encoded matrix corresponds to the k-th letter of
# `aa_single_letters`; the last letter is X.
aa_single_letters = "GPAVLIMCFYWHKRQNEDSTX"

window_size = 13
# Padding block of `window_size` rows, each one-hot encoded as the last letter (X).
virtual_vector = zeros(Float64, window_size, length(aa_single_letters))
virtual_vector[:, end] .= 1.0

# One (2 * window_size + sequence_length) x alphabet_size matrix per sequence.
d_complete_aa_vector = Matrix{Float64}[]
for sequence in d_complete_aa_seq
    onehot = zeros(Float64, length(sequence), length(aa_single_letters))
    for (i, residue) in enumerate(sequence)
        k = findfirst(isequal(residue), aa_single_letters)
        if k === nothing
            # A letter outside the alphabet means the input data is malformed.
            error("something wrong: unknown amino acid '$residue' at position $i")
        end
        onehot[i, k] = 1.0
    end
    # add virtual terminal residues "X" for using windows
    push!(d_complete_aa_vector, [virtual_vector; onehot; virtual_vector])
end

In [5]:
# Conversion from 1-of-K vectors to windowed vectors.
# Every residue becomes one row holding the one-hot rows of the residue and of
# its `window_size` neighbours on each side, concatenated in sequence order.

# Total number of residues over all sequences (`init=0` keeps an empty data set valid).
num_element = mapreduce(length, +, d_complete_aa_seq; init=0)

d_complete_aa_vector_windowed = Matrix{Float64}(
    undef, num_element, length(aa_single_letters) * (window_size * 2 + 1)
)

"""
    fill_windowed_rows!(dest, encoded_sequences, half_width)

Write one flattened window per residue into consecutive rows of `dest` and
return the number of rows written.

Each element of `encoded_sequences` is a matrix with one row per residue that
already carries `half_width` padding rows at both ends; only the unpadded
positions produce an output row.
"""
function fill_windowed_rows!(dest, encoded_sequences, half_width)
    row = 0
    for encoded in encoded_sequences
        for center in (half_width + 1):(size(encoded, 1) - half_width)
            row += 1
            window = view(encoded, (center - half_width):(center + half_width), :)
            # Transpose before flattening so the row is laid out residue by
            # residue (all letters of the first window position, then the next, ...).
            dest[row, :] .= vec(permutedims(window))
        end
    end
    return row
end

num_element = fill_windowed_rows!(
    d_complete_aa_vector_windowed, d_complete_aa_vector, window_size
)

In [6]:
# Conversion from label strings to label values.
# Encoding: H -> 1.0, E -> 2.0, C -> 3.0, X (disordered) -> 4.0.
# Entry n of `label_value` belongs to row n of `d_complete_aa_vector_windowed`.
label_value = zeros(Float64, size(d_complete_aa_vector_windowed, 1))

num_element = 0
for ss in d_rule_1_3_label_ss
    for x in ss
        num_element += 1
        if x == 'H'
            label_value[num_element] = 1.0
        elseif x == 'E'
            label_value[num_element] = 2.0
        elseif x == 'C'
            label_value[num_element] = 3.0
        elseif x == 'X'
            label_value[num_element] = 4.0
        else
            # Any other character is a data error; fail loudly instead of
            # silently assigning it to the disordered class.
            error("something wrong: unknown secondary-structure label '$x'")
        end
    end
end

In [7]:
# Display the per-residue label vector (the notebook output follows).
label_value

2217707-element Array{Float64,1}:
 3.0
 3.0
 1.0
 1.0
 1.0
 1.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 ⋮
 3.0
 3.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 3.0
 3.0

In [8]:
using Statistics, Printf, LIBSVM

In [9]:
# Fit an RBF-kernel SVM on the first 5000 windowed rows and their labels.
# The feature slice is transposed before being handed to `svmtrain`.
# Alternative that was tried (linear kernel, first 10000 rows):
#   svmtrain(d_complete_aa_vector_windowed[1:10000, :]', label_value[1:10000],
#            kernel=Kernel.Linear, gamma=0.10, cost=1.50)
model = svmtrain(
    d_complete_aa_vector_windowed[1:5000, :]',
    label_value[1:5000];
    kernel=Kernel.RadialBasis,
    gamma=0.10,
    cost=1.50,
)

LIBSVM.SVM{Float64}(SVC, LIBSVM.Kernel.RadialBasis, nothing, 567, 4, [3.0, 1.0, 2.0, 4.0], Int32[1, 2, 3, 4], Float64[], Int32[], LIBSVM.SupportVectors{Float64,Float64}(5000, Int32[1545, 2278, 750, 427], [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0  …  4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0], [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0], Int32[1, 2, 7, 8, 9, 10, 11, 12, 13, 14  …  4649, 4650, 4651, 4652, 4653, 4739, 4740, 4741, 4742, 4743], LIBSVM.SVMNode[LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0)  …  LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0), LIBSVM.SVMNode(1, 0.0)]), 0.0,

In [10]:
# Predict labels for the 100 rows that directly follow the training rows.
predicted_label_value, decision_value = svmpredict(
    model, d_complete_aa_vector_windowed[5001:5100, :]'
);

In [11]:
# Percentage of the 100 held-out rows whose predicted label equals the true label.
@printf("Accuracy: %.2f%%\n", mean(predicted_label_value .== label_value[5001:5100]) * 100)

Accuracy: 78.00%
