# Служебный код

In [148]:
using CSV
using DataFrames
using Statistics
using Distances

In [215]:
# Min-Max нормализация
function normalize(data)
    si = size(data,2)
    
    mini = [minimum(data[:,i]) for i=1:si]
    maxi = [maximum(data[:,i]) for i=1:si]
    
    for i in 1:size(data,1), j in 1:si
        data[i,j] = (data[i,j] - mini[j])/(maxi[j] - mini[j]) 
    end
    
    data
end

normalize (generic function with 2 methods)

In [236]:
# Разделить на тренировочный и тестовый
function divide(X, Y, prbl)
    trainID = []
    testID = []
    
    for i in 1:size(Y,1)
        if (rand() <= prbl)
            push!(trainID, i)
        else
            push!(testID, i)
        end
    end
    
    X[trainID,:], Y[trainID,:], X[testID,:], Y[testID,:]
end

divide (generic function with 2 methods)

In [278]:
# Получить k соседей
function neighs(el, X, Y, k, metric)
    si = size(X,1)
    
    distances = zeros(Float64, si)
    for i in 1:si
       distances[i] = metric(el, X[i,:]) 
    end
    
    sortslices(hcat(distances,Y), dims=1)[1:k,:]
end

neighs (generic function with 1 method)

In [389]:
# Получить класс
function decide(answers)
    si = size(answers, 1)
    h = maximum(answers[:,1])
    d = Dict{Array{Any}, Float64}([])
    
    # Треугольное окно используется
    for i in 1:si
        d[answers[i,2:end]] = 0
    end
    for i in 1:si
        d[answers[i,2:end]] += 1 - (answers[i,1]/h)
    end
    
    maxi = -Inf
    ans = Array{Float64}
    for k in keys(d)
        if (d[k] > maxi)
            maxi = d[k]
            ans = k
        end
    end
        
    ans
end

decide (generic function with 1 method)

In [426]:
struct knnmodel
    X
    Y
end

function (m::knnmodel)(data, k)
    si = size(data,1)
    data_y = Matrix{Float64}(undef,size(data,1),size(m.Y,2))
    
    for i in 1:si
        ns = neighs(data[i,:], m.X, m.Y, k, Euclidean()) 
        data_y[i,:] = decide(ns)
    end
    
    data_y
end

In [603]:
function quality(Y, predict)
    TP = TN = FP = FN = 0
    
    for i in 1:size(Y,1)
        comp = (Y[i,:]..., predict[i,:]...)
        
        if comp == (1.0,1.0)
            TP += 1
        elseif comp == (0.0,0.0)
            TN += 1
        elseif comp == (1.0,0.0)
            FN += 1
        else
            FP += 1
        end             
    end
    
    TP, TN, FP, FN
end

quality (generic function with 1 method)

# Применение

In [553]:
m = Matrix{Any}(CSV.read("./data.csv", DataFrame,
                                       silencewarnings=true,
                                       typemap=Dict(Int => Float64), 
                                       types=Dict(
                                            :MMRAcquisitionAuctionAveragePrice => Float64,
                                            :MMRAcquisitionAuctionCleanPrice => Float64,
                                            :MMRAcquisitionRetailAveragePrice => Float64,
                                            :MMRAcquisitonRetailCleanPrice => Float64,
                                            :MMRCurrentAuctionAveragePrice => Float64,
                                            :MMRCurrentAuctionCleanPrice => Float64,
                                            :MMRCurrentRetailAveragePrice => Float64,
                                            :MMRCurrentRetailCleanPrice => Float64,
                                       )))
m = map((el) -> ismissing(el) ? 0 : el, m)

72983×34 Array{Any,2}:
     1.0  0.0  "12/7/2009"   "ADESA"  …  33619.0  "FL"   7100.0  0.0  1113.0
     2.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   7600.0  0.0  1053.0
     3.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4900.0  0.0  1389.0
     4.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4100.0  0.0   630.0
     5.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4000.0  0.0  1020.0
     6.0  0.0  "12/7/2009"   "ADESA"  …  33619.0  "FL"   5600.0  0.0   594.0
     7.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4200.0  0.0   533.0
     8.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   4500.0  0.0   825.0
     9.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   5600.0  0.0   482.0
    10.0  0.0  "12/7/2009"   "ADESA"     33619.0  "FL"   7700.0  0.0  1633.0
    11.0  0.0  "12/14/2009"  "ADESA"  …  33619.0  "FL"   5500.0  0.0  1373.0
    12.0  0.0  "12/14/2009"  "ADESA"     33619.0  "FL"   5300.0  0.0   869.0
    13.0  1.0  "12/14/2009"  "ADESA"     33619.0  "FL

In [651]:
# Оставим только нужные данные
# 5,6,15,29,30,32,33,34
println(map(el -> typeof(m[1,el]) <: Number, 1:34))
Y = Matrix{Float64}(m[:,[2]])
X = Matrix{Float64}(m[:,[5,6,15,19,20,21,22,23,24,25,26,29,30,34]])

Bool[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1]


72983×14 Array{Float64,2}:
 2006.0  3.0  89046.0   8155.0   9829.0  …  12409.0  21973.0  33619.0  1113.0
 2004.0  5.0  93593.0   6854.0   8383.0     12791.0  19638.0  33619.0  1053.0
 2005.0  4.0  73807.0   3202.0   4760.0      8702.0  19638.0  33619.0  1389.0
 2004.0  5.0  65617.0   1893.0   2675.0      5518.0  19638.0  33619.0   630.0
 2005.0  4.0  69367.0   3913.0   5054.0      7911.0  19638.0  33619.0  1020.0
 2004.0  5.0  81054.0   3901.0   4908.0  …   9451.0  19638.0  33619.0   594.0
 2004.0  5.0  65328.0   2966.0   4038.0      8603.0  19638.0  33619.0   533.0
 2005.0  4.0  65805.0   3313.0   4342.0      8242.0  19638.0  33619.0   825.0
 2007.0  2.0  49921.0   6196.0   7274.0     10778.0  21973.0  33619.0   482.0
 2007.0  2.0  84872.0   7845.0   9752.0     14845.0  21973.0  33619.0  1633.0
 2005.0  4.0  80080.0   5243.0   6627.0  …  11663.0   5546.0  33619.0  1373.0
 2001.0  8.0  75419.0   3168.0   4320.0      6559.0   5546.0  33619.0   869.0
 2005.0  4.0  79315.0   4225.0   5380

In [652]:
# Нормализуем признаки
normalize(X)

72983×14 Array{Float64,2}:
 0.555556  0.333333  0.759487  0.228291   …  0.213675   0.319874  0.0925242
 0.333333  0.555556  0.800491  0.191871      0.190071   0.319874  0.0839966
 0.444444  0.444444  0.622065  0.0896366     0.190071   0.319874  0.131751
 0.333333  0.555556  0.548209  0.0529926     0.190071   0.319874  0.0238772
 0.444444  0.444444  0.582026  0.10954       0.190071   0.319874  0.0793064
 0.333333  0.555556  0.687417  0.109204   …  0.190071   0.319874  0.0187607
 0.333333  0.555556  0.545603  0.0830301     0.190071   0.319874  0.010091
 0.444444  0.444444  0.549904  0.092744      0.190071   0.319874  0.0515918
 0.666667  0.222222  0.406666  0.173451      0.213675   0.319874  0.00284252
 0.666667  0.222222  0.721846  0.219613      0.213675   0.319874  0.16643
 0.444444  0.444444  0.678633  0.146772   …  0.0476215  0.319874  0.129477
 0.0       0.888889  0.636601  0.0886848     0.0476215  0.319874  0.0578454
 0.444444  0.444444  0.671735  0.118274      0.190071   0.319874 

In [653]:
# Разделим множества на тренировочное и тестовое
X_train, Y_train, X_test, Y_test = divide(X, Y, 0.5)

([0.3333333333333333 0.5555555555555556 … 0.3198735227037114 0.08399658897100626; 0.4444444444444444 0.4444444444444444 … 0.3198735227037114 0.1317509948834565; … ; 0.5555555555555556 0.3333333333333333 … 0.28455318266639024 0.21489482660602616; 0.5555555555555556 0.3333333333333333 … 0.28455318266639024 0.12094940306992609], [0.0; 0.0; … ; 0.0; 0.0], [0.5555555555555556 0.3333333333333333 … 0.3198735227037114 0.09252416145537237; 0.3333333333333333 0.5555555555555556 … 0.3198735227037114 0.018760659465605456; … ; 0.6666666666666666 0.2222222222222222 … 0.28455318266639024 0.08186469584991472; 0.4444444444444444 0.4444444444444444 … 0.28455318266639024 0.20338260375213188], [0.0; 0.0; … ; 0.0; 0.0])

In [654]:
# Создадим модель над тренировочными данными
model = knnmodel(X_train,Y_train)

knnmodel([0.3333333333333333 0.5555555555555556 … 0.3198735227037114 0.08399658897100626; 0.4444444444444444 0.4444444444444444 … 0.3198735227037114 0.1317509948834565; … ; 0.5555555555555556 0.3333333333333333 … 0.28455318266639024 0.21489482660602616; 0.5555555555555556 0.3333333333333333 … 0.28455318266639024 0.12094940306992609], [0.0; 0.0; … ; 0.0; 0.0])

In [None]:
# Предсказание на тестовых данных
predict = model(X_test[1:100,:], 3)
TP, TN, FP, FN = quality(Y_test[1:100,:], predict)

In [648]:
# Оценки
_accuracy = (TP+TN)/(TP+TN+FP+FN)
_error = 1 - accuracy
_precision = TP/(TP+FP)

println("Accuracy: ", _accuracy, "\nError: ", _error, "\nPrecision: ", _precision)

Accuracy: 0.82
Error: 0.15000000000000002
Precision: 0.2857142857142857
