In [2]:
# Kaggle tutorial
# https://www.kaggle.com/c/street-view-getting-started-with-julia/overview/julia-tutorial
# https://www.kaggle.com/c/street-view-getting-started-with-julia/discussion/96926#latest-559805

In [80]:
# Load packages
using Images
using DataFrames
using CSV
using DecisionTree
using Statistics

# Helper: write the `show` representation of `x` to stdout and end the line.
function showln(x)
    show(x)
    println()
    return nothing
end

showln("Libraries loaded")

"Libraries loaded"


In [34]:
"""
dataset_type:
    "train" or "test"
labels_info:
    IDs of each image to read
image_size:
    amount of pixels in image
dataset_dir_path:
    path to directory with "trainResized" and "testResized" directories.
"""
function read_image(dataset_type, labels_info, image_size, dataset_dir_path)
    # Property access is used for the ID column: the older `labels_info[:ID]`
    # column indexing is deprecated and no longer accepted by DataFrames.jl 1.x.
    image_ids = labels_info.ID

    # One row per image, one column per pixel; `zeros` yields a Float64 matrix.
    X = zeros(size(labels_info, 1), image_size)
    showln(size(labels_info))

    # The directory only depends on the arguments, so build it once.
    image_dir = "$(dataset_dir_path)/$(dataset_type)Resized"
    for (idx, image_id) in enumerate(image_ids)
        # Load the image and reduce it to a single greyscale channel
        image = load("$(image_dir)/$(image_id).Bmp")
        image_greyscale = Gray.(image)
        # Flatten the pixel matrix (column-major) into row `idx` of X. An image
        # whose pixel count differs from `image_size` raises a dimension error.
        X[idx, :] = vec(image_greyscale)
    end
    return X
end

showln("Created read_image function")

"Created read_image function"


In [46]:
# The images in the trainResized and testResized directories are 20x20 pixels,
#     so image_size should be set to 400.
image_size = 20 * 20
# Directory that holds trainLabels.csv, sampleSubmission.csv and the resized
# image folders read by the cells below.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
dataset_dir_path = "/home/max/Documents/ai/julia/julia_kaggle/data"

"/home/max/Documents/ai/julia/julia_kaggle/data"

In [47]:
# Load labels (ID and Class) for train and test.
# `CSV.File(...) |> DataFrame` is used rather than `CSV.read(path)`: recent CSV.jl
# releases require an explicit sink argument for `CSV.read`, while this form
# works on both old and new releases.
train_labels_info = CSV.File("$(dataset_dir_path)/trainLabels.csv") |> DataFrame
showln(size(train_labels_info))
showln(names(train_labels_info))

# The sample submission lists the test image IDs (with placeholder classes).
test_labels_info = CSV.File("$(dataset_dir_path)/sampleSubmission.csv") |> DataFrame
showln(size(test_labels_info))
showln(names(test_labels_info))

# Display some info about these labels
showln(first(train_labels_info, 3))
showln(first(test_labels_info, 3))

(6283, 2)
Symbol[:ID, :Class]
(6220, 2)
Symbol[:ID, :Class]
3×2 DataFrame
│ Row │ ID    │ Class  │
│     │ Int64 │ String │
├─────┼───────┼────────┤
│ 1   │ 1     │ n      │
│ 2   │ 2     │ 8      │
│ 3   │ 3     │ T      │
3×2 DataFrame
│ Row │ ID    │ Class  │
│     │ Int64 │ String │
├─────┼───────┼────────┤
│ 1   │ 6284  │ A      │
│ 2   │ 6285  │ A      │
│ 3   │ 6286  │ A      │


In [44]:
# Build the feature matrices (one flattened image per row) for the training and
# testing sets, timing each load.
X_train = @time read_image("train", train_labels_info, image_size, dataset_dir_path)
X_test = @time read_image("test", test_labels_info, image_size, dataset_dir_path)

# Show the resulting (rows, columns) of each matrix.
for features in (X_train, X_test)
    display(size(features))
end

(6283, 2)
  3.144973 seconds (1.70 M allocations: 135.245 MiB, 0.95% gc time)
(6220, 2)
 

(6283, 400)

(6220, 400)

 3.136396 seconds (1.68 M allocations: 133.856 MiB, 3.43% gc time)


In [57]:
# Create Y vector from each image.
# Target variable == first character of 'Class' column, stored as its integer
# code point so the labels form a Vector{Int64}.

# `.Class` replaces the deprecated `train_labels_info[:Class]` column indexing,
# and a single broadcast replaces the separate map + convert steps.
y_train = Int64.(first.(train_labels_info.Class))

showln(size(y_train))
showln(typeof(y_train))
showln(y_train[1:10])

(6283,)
Array{Int64,1}
[110, 56, 84, 73, 82, 87, 76, 108, 75, 71]


In [71]:
# Create a training model

# Learn patterns in the images that identify the character in the label.
# Use Julia version of Random Forest algorithm.
# Parameters to set:
#     number of features to choose at each split
#         sqrt(number_of_features)
#     number of trees
#         larger is better, but it takes more time to train
#     ratio of sampling
#         usually 1.0

# The number of sub-features is a count, so keep it an Integer: `sqrt` alone
# returns a Float64, which is non-integral whenever the number of columns is
# not a perfect square.
num_of_features = round(Int, sqrt(size(X_train, 2)))
num_of_trees = 50
ratio_of_sampling = 1.0
model = build_forest(
    y_train, X_train,  # labels, features
    num_of_features,
    num_of_trees,
    ratio_of_sampling
)

# build_forest(labels::Array{T,1}, features::Array{S,2}, n_subfeatures, n_trees, partial_sampling)

Ensemble of Decision Trees
Trees:      50
Avg Leaves: 2204.12
Avg Depth:  18.72

In [73]:
# Use trained model: identify the characters in the test data.
# The predictions use the same encoding as `y_train` (integer labels), so they
# must be converted back to characters before being written out.
test_set_predictions = apply_forest(model, X_test)

6220-element Array{Int64,1}:
  72
  69
  97
  79
  84
  65
  48
  97
  66
  72
  78
 100
  75
   ⋮
  82
  65
  65
  67
  69
  84
  75
  79
  82
  82
  73
  79

In [74]:
# Convert predictions back to characters and store them as the Class column.
# Property assignment replaces the deprecated `test_labels_info[:Class] = ...`
# column indexing, which current DataFrames.jl releases no longer accept.
test_labels_info.Class = Char.(test_set_predictions)

6220-element Array{Char,1}:
 'H'
 'E'
 'a'
 'O'
 'T'
 'A'
 '0'
 'a'
 'B'
 'H'
 'N'
 'd'
 'K'
 ⋮  
 'R'
 'A'
 'A'
 'C'
 'E'
 'T'
 'K'
 'O'
 'R'
 'R'
 'I'
 'O'

In [86]:
# Print model accuracy, estimated with n-fold cross-validation on the training set
n_folds = 4

# `round(Int, ...)` accepts both an Integer and a Float64 sub-feature count and,
# unlike `convert(Int, ...)`, cannot throw InexactError on a non-integral value.
accuracy = nfoldCV_forest(
    y_train, X_train,  # labels, features
    n_folds,
    round(Int, num_of_features),
    num_of_trees,
    ratio_of_sampling,
)

# nfoldCV_forest(
#     labels::Array{T,1}, features::Array{S,2},
#     n_folds::Integer, n_subfeatures::Integer, n_trees::Integer, partial_sampling::Float64,
# )

# NOTE(review): the recorded run reports a mean accuracy of ~0.9998, which looks
# suspiciously high; confirm that each fold is evaluated on held-out rows.
mean_accuracy = mean(accuracy)
# Report the number of folds actually used (the message previously said "10 fold").
showln("$(n_folds) fold accuracy: $(mean_accuracy)")

62×62 Array{Int64,2}:
 24   0   0   0   0   0   0  0  0   0  …  0   0   0   0   0  0  0  0   0  0
  0  14   0   0   0   0   0  0  0   0     0   0   0   0   0  0  0  0   0  0
  0   0  10   0   0   0   0  0  0   0     0   0   0   0   0  0  0  0   0  0
  0   0   0  12   0   0   0  0  0   0     0   0   0   0   0  0  0  0   0  0
  0   0   0   0  10   0   0  0  0   0     0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0  11   0  0  0   0  …  0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0   0  16  0  0   0     0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0   0   0  6  0   0     0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0   0   0  0  3   0     0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0   0   0  0  0  12     0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0   0   0  0  0   0  …  0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0   0   0  0  0   0     0   0   0   0   0  0  0  0   0  0
  0   0   0   0   0   0   0  0  0   0     0   0   0   0   0  0  0 


Fold 1
Classes:  [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122]
Matrix:   
Accuracy: 1.0
Kappa:    1.0


62×62 Array{Int64,2}:
 16   0   0  0   0   0  0  0  0  0    0  …  0   0   0   0  0  0   0  0  0  0
  0  18   0  0   0   0  0  0  0  0    0     0   0   0   0  0  0   0  0  0  0
  0   0  16  0   0   0  0  0  0  0    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  7   0   0  0  0  0  0    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  0  10   0  0  0  0  0    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0  10  0  0  0  0    0  …  0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0   0  7  0  0  0    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0   0  0  5  0  0    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0   0  0  0  4  0    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0   0  0  0  0  4    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0   0  0  0  0  0  108  …  0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0   0  0  0  0  0    0     0   0   0   0  0  0   0  0  0  0
  0   0   0  0   0   0  0  0  0  0    0     0   0   0 


Fold 2
Classes:  [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122]
Matrix:   
Accuracy: 0.9993630573248408
Kappa:    0.9993427288224714


62×62 Array{Int64,2}:
 21   0   0  0   0  0  0   0  0  0  …  0  0   0   0   0  0  0  0  0   0  0
  0  18   0  0   0  0  0   0  0  0     0  0   0   0   0  0  0  0  0   0  0
  0   0  16  0   0  0  0   0  0  0     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  7   0  0  0   0  0  0     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0  10  0  0   0  0  0     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  8  0   0  0  0  …  0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  0  3   0  0  0     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  0  0  13  0  0     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  0  0   0  9  0     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  0  0   0  0  6     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  0  0   0  0  0  …  0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  0  0   0  0  0     0  0   0   0   0  0  0  0  0   0  0
  0   0   0  0   0  0  0   0  0  0     0  0   0   0   0  0  0  0  0   0  0
  ⋮


Fold 3
Classes:  [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122]
Matrix:   
Accuracy: 1.0
Kappa:    1.0


62×62 Array{Int64,2}:
 17   0   0   0  0   0   0  0  0   0  …   0   0   0   0  0  0  0  0  0  0
  0  14   0   0  0   0   0  0  0   0      0   0   0   0  0  0  0  0  0  0
  0   0  17   0  0   0   0  0  0   0      0   0   0   0  0  0  0  0  0  0
  0   0   0  10  0   0   0  0  0   0      0   0   0   0  0  0  0  0  0  0
  0   0   0   0  7   0   0  0  0   0      0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0  12   0  0  0   0  …   0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0   0  16  0  0   0      0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0   0   0  7  0   0      0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0   0   0  0  5   0      0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0   0   0  0  0  14      0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0   0   0  0  0   0  …   0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0   0   0  0  0   0      0   0   0   0  0  0  0  0  0  0
  0   0   0   0  0   0   0  0  0   0      0   0   0   0  0  0  0  0  0  0
  ⋮             


Fold 4
Classes:  [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Mean Accuracy: 0.9998407643312102
"10 fold accuracy: 0.9998407643312102"


In [85]:
# Save predictions: write the ID/Class table, header row included, next to the
# input data as julia_submission.csv.
test_labels_info |> CSV.write("$(dataset_dir_path)/julia_submission.csv"; writeheader=true)

"/home/max/Documents/ai/julia/julia_kaggle/data/julia_submission.csv"