# Дерево решений

### Служебный код

In [545]:
using CSV
using DataFrames
using Statistics
using DecisionTree

In [546]:
# Split the column indices of `m` into numeric and non-numeric ones.
#
# Only the first row is inspected: a column index is reported as numeric when
# `m[1, col]` is a `Number`; every other column index goes to the second list.
# Returns the tuple `(numeric_indices, other_indices)`, both in ascending order.
function ty(m)
    columns = 1:size(m, 2)
    numeric = [col for col in columns if m[1, col] isa Number]
    other = [col for col in columns if !(m[1, col] isa Number)]

    return numeric, other
end

ty (generic function with 2 methods)

In [547]:
# Randomly split the rows of `X` and `Y` into a training and a test part.
#
# For every row index `i` in `1:size(Y, 1)` a fresh `rand()` value is drawn;
# the row goes to the training part when the draw is `<= prbl` and to the test
# part otherwise.
#
# Returns `(X_train, Y_train, X_test, Y_test)`. Every part is produced with
# two-index slicing (`[ids, :]`), so a vector `Y` comes back as one-column
# matrices.
function divide(X, Y, prbl)
    # Concretely typed index buffers (an untyped `[]` would be a Vector{Any}).
    trainID = Int[]
    testID = Int[]

    for i in 1:size(Y, 1)
        # One draw per row, in row order.
        push!(rand() <= prbl ? trainID : testID, i)
    end

    return X[trainID, :], Y[trainID, :], X[testID, :], Y[testID, :]
end

divide (generic function with 1 method)

In [548]:
# Min-max normalisation of every column of `data`.
#
# Each column is rescaled in place with `(x - min) / (max - min)`, using the
# minimum and maximum of that column. The same (mutated) array is returned.
#
# Edge cases:
# * a constant column (max == min) is set to zero instead of producing
#   0/0 = NaN in every cell;
# * an empty `data` is returned unchanged, because a minimum/maximum cannot be
#   taken over an empty column.
function normalize(data)
    isempty(data) && return data

    for j in axes(data, 2)
        column = view(data, :, j)
        lo, hi = extrema(column)
        span = hi - lo

        if iszero(span)
            # All values are equal: there is no range to scale by.
            fill!(column, zero(span))
        else
            column .= (column .- lo) ./ span
        end
    end

    return data
end

normalize (generic function with 1 method)

In [586]:
# Count confusion-matrix cells for thresholded predictions.
#
# For every row `i` of `Y` the first entry of `predict[i, :]` is turned into
# 1.0 when it is strictly greater than `prbl` and into 0.0 otherwise, then
# paired with the values of row `Y[i, :]`:
#   (1.0, 1.0) -> TP, (0.0, 0.0) -> TN, (1.0, 0.0) -> FN, anything else -> FP.
# Returns the counters as the tuple `(TP, TN, FP, FN)`.
function quality(Y, predict, prbl)
    TP, TN, FP, FN = 0, 0, 0, 0

    for row in 1:size(Y, 1)
        actual = Y[row, :]
        guess = predict[row, :][1] > prbl ? 1.0 : 0.0
        outcome = (actual..., guess)

        if outcome == (1.0, 1.0)
            TP += 1
        elseif outcome == (0.0, 0.0)
            TN += 1
        elseif outcome == (1.0, 0.0)
            FN += 1
        else
            FP += 1
        end
    end

    return TP, TN, FP, FN
end

quality (generic function with 2 methods)

### Подготовка данных

In [550]:
# Load the data set from "./data.csv" into an untyped matrix.
# `typemap` maps Int columns to Float64, and the eight MMR price columns are
# explicitly requested as Float64 through `types`.
m = Matrix{Any}(
    CSV.read(
        "./data.csv",
        DataFrame;
        silencewarnings = true,
        typemap = Dict(Int => Float64),
        types = Dict(
            column => Float64 for column in (
                :MMRAcquisitionAuctionAveragePrice,
                :MMRAcquisitionAuctionCleanPrice,
                :MMRAcquisitionRetailAveragePrice,
                :MMRAcquisitonRetailCleanPrice,
                :MMRCurrentAuctionAveragePrice,
                :MMRCurrentAuctionCleanPrice,
                :MMRCurrentRetailAveragePrice,
                :MMRCurrentRetailCleanPrice,
            )
        ),
    ),
)

72983×34 Array{Any,2}:
     1.0  0.0  "12/7/2009"   "ADESA"  …  33619.0  "FL"   7100.0  0.0  1113.0
     2.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   7600.0  0.0  1053.0
     3.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4900.0  0.0  1389.0
     4.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4100.0  0.0   630.0
     5.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4000.0  0.0  1020.0
     6.0  0.0  "12/7/2009"   "ADESA"  …  33619.0  "FL"   5600.0  0.0   594.0
     7.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4200.0  0.0   533.0
     8.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4500.0  0.0   825.0
     9.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   5600.0  0.0   482.0
    10.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   7700.0  0.0  1633.0
    11.0  0.0  "12/14/2009"  "ADESA"  …  33619.0  "FL"   5500.0  0.0  1373.0
    12.0  0.0  "12/14/2009"  "ADESA"     33619.0  "FL"   5300.0  0.0   869.0
    13.0  1.0  "12/14/2009"  "ADESA"     33619.0  "FL

In [551]:
# Indices of the numeric and of the string columns
nu, st = ty(m)
for (label, indices) in (("Number: ", nu), ("String: ", st))
    println(label, indices)
end

Number: [1, 2, 5, 6, 15, 19, 20, 21, 22, 23, 24, 25, 26, 29, 30, 32, 33, 34]
String: [3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 27, 28, 31]


In [552]:
# Number of `missing` values in every column of `m`
# (iterates over the actual columns instead of a hard-coded column count).
println([count(ismissing, column) for column in eachcol(m)])

[0, 0, 0, 0, 0, 0, 0, 0, 2360, 0, 0, 1, 0, 0, 0, 0, 0, 0, 18, 18, 18, 18, 315, 315, 315, 315, 0, 0, 0, 0, 0, 0, 0, 0]


In [553]:
# Drop every row that contains at least one `missing` value.
# A single row mask is built and applied once, so the matrix is copied one
# time and the global `m` is not reassigned from inside a loop body.
m = m[[!any(ismissing, row) for row in eachrow(m)], :]
nothing  # keep the cell without a displayed value, as before

In [554]:
# Number of `missing` values in every column of `m`
# (iterates over the actual columns instead of a hard-coded column count).
println([count(ismissing, column) for column in eachcol(m)])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [555]:
# Convert the values of every string column to lowercase
for col in st
    m[:, col] = map(lowercase, m[:, col])
end

In [556]:
# Mean of every numeric column (ignoring `missing`); `nothing` for the columns
# whose first-row value is not a number.
meanval = map(1:size(m, 2)) do col
    if m[1, col] isa Number
        mean(filter(!ismissing, m[:, col]))
    else
        nothing
    end
end
println(meanval)

Union{Nothing, Float64}[36444.43948854343, 0.1218905118832582, nothing, nothing, 2005.3385199618826, 4.181982392012403, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, 71776.3720576313, nothing, nothing, nothing, 6146.529576583368, 7395.145045442262, 8519.762178383991, 9876.290602910012, 6150.024591446329, 7411.158244321495, 8794.852493990811, 10165.245658450554, nothing, nothing, 26440.454635964103, 58187.67026980899, nothing, 6741.92791520289, 0.02554438265371432, 1279.4481076391357]


In [557]:
# How many "null" strings does each string column contain?
# Afterwards drop the columns that are mostly "null" together with a few
# uninformative ones, and recompute the numeric/string column indices.
for col in st
    println(count(==("null"), m[:, col]), "\tcol: ", col)
end

m = m[:, setdiff(1:size(m, 2), [1, 3, 4, 13, 14, 27, 28])]
nu, st = ty(m)
1  # presumably a dummy last value so the cell does not display the index lists

0	col: 3
0	col: 4
0	col: 7
0	col: 8
0	col: 9
7	col: 10
7	col: 11
7	col: 12
3033	col: 13
3038	col: 14
1	col: 16
1	col: 17
1	col: 18
67011	col: 27
67011	col: 28
0	col: 31


1

In [558]:
# Drop every row that still holds the string "null" in any string column.
# A single row mask is built and applied once, so the matrix is copied one
# time and the global `m` is not reassigned from inside a loop body.
m = m[[all(el -> el != "null", view(m, row, st)) for row in axes(m, 1)], :]
nothing  # keep the cell without a displayed value, as before

In [559]:
# Number of distinct values in every string column
for col in st
    println(length(unique(m[:, col])), "\t:", col)
end

32	:4
962	:5
133	:6
834	:7
16	:8
2	:9
4	:11
12	:12
4	:13
37	:24


In [560]:
# One-hot encode the string columns that have at most 32 distinct values:
# every distinct value becomes one new 0.0/1.0 column appended to `m`.
# The indicator columns are collected first and attached with one final
# `hcat`, so the whole matrix is not copied once per category value.
m = let encoded = Vector{Float64}[]
    for col in st
        column = m[:, col]
        levels = unique(column)  # distinct values of this column
        length(levels) <= 32 || continue

        for level in levels
            push!(encoded, map(el -> el == level ? 1.0 : 0.0, column))
        end
    end

    isempty(encoded) ? m : hcat(m, reduce(hcat, encoded))
end

# Recompute the numeric/string column indices for the widened matrix
nu, st = ty(m)

([1, 2, 3, 10, 14, 15, 16, 17, 18, 19  …  88, 89, 90, 91, 92, 93, 94, 95, 96, 97], [4, 5, 6, 7, 8, 9, 11, 12, 13, 24])

In [569]:
# Target vector: column 1 of `m`
Y = Vector{Float64}(m[:, 1])
# Feature matrix: every numeric column except the first one listed in `nu`,
# min-max normalised
X = normalize(Matrix{Float64}(m[:, nu[2:end]]))

70301×86 Array{Float64,2}:
 0.555556  0.333333  0.759487  …  0.0  0.0  0.0  1.0  0.0  0.0  0.0
 0.333333  0.555556  0.800491     0.0  0.0  0.0  0.0  1.0  0.0  0.0
 0.444444  0.444444  0.622065     0.0  0.0  0.0  0.0  1.0  0.0  0.0
 0.333333  0.555556  0.548209     0.0  0.0  0.0  0.0  1.0  0.0  0.0
 0.444444  0.444444  0.582026     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.333333  0.555556  0.687417  …  0.0  0.0  0.0  1.0  0.0  0.0  0.0
 0.333333  0.555556  0.545603     0.0  0.0  0.0  1.0  0.0  0.0  0.0
 0.444444  0.444444  0.549904     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.666667  0.222222  0.406666     0.0  0.0  0.0  1.0  0.0  0.0  0.0
 0.666667  0.222222  0.721846     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.444444  0.444444  0.678633  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0       0.888889  0.636601     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.444444  0.444444  0.671735     0.0  0.0  0.0  0.0  1.0  0.0  0.0
 ⋮                             ⋱       ⋮                        ⋮
 0.666667  0.222222  0.

### Применение

In [577]:
# Split the samples into a training and a test set (train probability 0.5)
X_train, Y_train, X_test, Y_test = divide(X, Y, 0.5)
# `divide` returns the labels as one-column matrices; flatten them to vectors
Y_train = convert(Vector{Float64}, Y_train[:, 1])
Y_test = convert(Vector{Float64}, Y_test[:, 1])

34925-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 1.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [578]:
# Fit a decision tree on the training labels and training features.
# NOTE(review): the labels are Float64, so DecisionTree presumably builds a
# regression tree here (the predictions further down are fractional) — confirm.
model = build_tree(Y_train, X_train)

Decision Tree
Leaves: 2441
Depth:  38

In [579]:
# Prune the fitted tree and keep the pruned version.
# NOTE(review): 0.9 is presumably the purity threshold of `prune_tree` —
# confirm against the DecisionTree.jl documentation.
model = prune_tree(model, 0.9)

Decision Tree
Leaves: 1959
Depth:  38

In [581]:
# Print the top of the tree; the second argument (3) limits the printed depth
print_tree(model, 3)

Feature 2, Threshold 0.5
L-> Feature 14, Threshold 0.1382620744259699
    L-> Feature 12, Threshold 0.523724804399248
        L-> 
        R-> 
    R-> Feature 4, Threshold 0.5088180952914171
        L-> 
        R-> 
R-> Feature 14, Threshold 0.10142297879827572
    L-> Feature 2, Threshold 0.7222222222222222
        L-> 
        R-> 
    R-> Feature 3, Threshold 0.6207436063918046
        L-> 
        R-> 


In [582]:
# Predict a value for every test sample
predict = apply_tree(model, X_test)

34925-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.4
 0.0
 0.6666666666666666
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.16666666666666666
 0.8888888888888888
 0.0
 0.0
 0.0
 0.0
 0.8
 0.0
 0.0
 0.0

In [592]:
# Confusion counts on the test set; a prediction above 0.3 counts as positive
TP, TN, FP, FN = quality(Y_test, predict, 0.3)

(1024, 25697, 4957, 3247)

In [589]:
# Evaluation metrics derived from the confusion counts
_accuracy = (TP + TN) / sum((TP, TN, FP, FN))
_precision = TP / (TP + FP)
_error = 1 - _accuracy

println("Accuracy: ", _accuracy, "\nError: ", _error, "\nPrecision: ", _precision)

Accuracy: 0.7650966356478167
Error: 0.23490336435218329
Precision: 0.17120882795519143
