In [130]:
using DataFrames
using CSV
using Statistics
using MLJ

In [114]:
# Create the toy housing dataset on disk. Each entry below is one CSV line
# (header first); empty fields are the entries with no recorded value.
open("house_tiny.csv", "w") do io
    rows = (
        "NumRooms,Alley,Price\n",
        ",Pave, 127500 \n",
        "2,,106000\n",
        "4,,178100\n",
        ",,140000\n",
    )
    for row in rows
        print(io, row)
    end
end

In [131]:
# Parse the CSV file written above and materialize it as a DataFrame.
data = CSV.File("house_tiny.csv") |> DataFrame

┌──────────┬─────────────────────────┬─────────────────────────┐
│ names    │ scitypes                │ types                   │
├──────────┼─────────────────────────┼─────────────────────────┤
│ NumRooms │ Union{Missing, Count}   │ Union{Missing, Int64}   │
│ Alley    │ Union{Missing, Textual} │ Union{Missing, String7} │
│ Price    │ Count                   │ Int64                   │
└──────────┴─────────────────────────┴─────────────────────────┘


### Handling missing data

In [170]:
# Split the table: the first two columns are the features, the last one is the target.
inputs, outputs = data[:, 1:2], data[:, end]
allowmissing!(inputs)

# Impute missing room counts with the mean of the observed values.
# `mean` of integers is a Float64, and storing a non-integer Float64 into the
# integer-typed column throws an InexactError. Rounding to the nearest integer first
# keeps the column integer-typed and makes the imputation work for any data, not only
# when the mean happens to be a whole number (as it is for this file: 3.0).
roomfill = round(Int, mean(skipmissing(inputs.NumRooms)))
replace!(inputs.NumRooms, missing => roomfill)  # updates the column in place
inputs

Unnamed: 0_level_0,NumRooms,Alley
Unnamed: 0_level_1,Int64?,String7?
1,3,Pave
2,2,missing
3,4,missing
4,3,missing


In [171]:
# Indicator column: 1 where the alley entry is missing, 0 otherwise. It is derived
# from the data rather than typed in by hand, so it stays correct if the rows change.
inputs[:, :Alley_nan] = Int.(ismissing.(inputs[:, :Alley]))

# The remaining (non-missing) category becomes the "pave" column; mark "Pave" as "1".
# The column is still string-typed at this point.
rename!(inputs, :Alley => :Alley_pave)
replace!(inputs.Alley_pave, "Pave" => "1")  # updates the column in place
inputs

Unnamed: 0_level_0,NumRooms,Alley_pave,Alley_nan
Unnamed: 0_level_1,Int64?,String7?,Int64
1,3,1,0
2,2,missing,1
3,4,missing,1
4,3,missing,1


### Workaround to create the tensor format without pd.get_dummies()

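The only workaround I found is to add a new column with a numerical type by hand. A `String7` value is not implicitly converted to an integer (it would need an explicit `parse(Int, s)`), so the numeric counterpart of the string column is created as a separate column, and the string column is dropped afterwards.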

In [172]:
# Numeric 0/1 version of the string column `Alley_pave`: 1 where the entry equals "1",
# 0 otherwise. Comparing `missing` yields `missing`, which `coalesce` maps to `false`.
# Derived from the column instead of being hard-coded, so it follows the data.
inputs[:, :Alley_pave_bool] = Int.(coalesce.(inputs[:, :Alley_pave] .== "1", false))

4-element Vector{Int64}:
 1
 0
 0
 0

In [173]:
# Delete a column by name: `select!` with `Not(...)` keeps every other column and
# modifies `inputs` in place, so the result does not need to be re-assigned.
select!(inputs, Not(:Alley_pave))
inputs

Unnamed: 0_level_0,NumRooms,Alley_nan,Alley_pave_bool
Unnamed: 0_level_1,Int64?,Int64,Int64
1,3,0,1
2,2,1,0
3,4,1,0
4,3,1,0


In [184]:
# Convert the feature table to a plain matrix. `NumRooms` is still typed `Int64?`
# even though its missing values were imputed, which would make the matrix element
# type `Union{Missing, Int64}`. `disallowmissing` returns a copy whose column types no
# longer allow `missing` (it errors if a missing value is still present), so the
# resulting matrix has a concrete numeric element type.
inputs2 = Matrix(disallowmissing(inputs))
outputs # already a Vector

4-element Vector{Int64}:
 127500
 106000
 178100
 140000