In [14]:
using DataFrames, CSV, Impute, Dates, Plots, Statistics, Interpolations

In [3]:
df = CSV.File("boston.csv") |> DataFrame;

In [4]:
dfdaily = filter(row -> strip(row[:REPORT_TYPE]) == "SOD", df);
dfh = filter(row -> !(strip(row[:REPORT_TYPE]) in ["SOD", "SOM"]), df)
sum(ismissing.(dfh[:, :HourlyDryBulbTemperature]))

0

In [11]:
function clean(x)
    if ismissing(x) || x isa Float64
        return x
    end
    m = match(r"[+-]?([0-9]*[.])?[0-9]+",x)
    return m ==nothing ? missing : parse(Float64, m.match)
end
function resample(dfh, factor)
    dfh[!,factor] = clean.(dfh[:,factor])
    timeseries = Impute.interp(dfh[!, [:DATE, factor]]) |> Impute.locf() |> Impute.nocb() 
    timeseries[!, factor] = convert.(Float64, timeseries[!, factor]) # remove the missing type
    # even sampling
    timeseries[!, :rounded] = map((d) -> round(d, Dates.Hour), timeseries[!, :DATE])
    even_sampling = by(timeseries, :rounded, factor => mean)
    rename!(even_sampling, Symbol(factor, "_mean") => :data)
    return even_sampling
end

resample (generic function with 1 method)

In [12]:
factors = [:HourlyDryBulbTemperature, :HourlyPrecipitation, :HourlySeaLevelPressure];

In [37]:
mdata = resample(dfh, :HourlyDryBulbTemperature)
prec = resample(dfh, :HourlyPrecipitation)
pres = resample(dfh, :HourlySeaLevelPressure)
mdata[!, :prec] = prec[:, :data]
mdata[!, :pres] = pres[:, :data];
rename!(mdata, Dict(:data => :temp, :rounded => :sampleT));

In [34]:
size(mdata).-1

(25053, 3)

In [71]:
monthdayhour = x -> [Dates.month(x) Dates.day(x) Dates.hour(x)];

In [117]:
mdata[1:5,2:3]

Unnamed: 0_level_0,temp,prec
Unnamed: 0_level_1,Float64,Float64
1,2.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
5,0.0,0.0


In [120]:
reduce(vcat, (mdata[1:5,2:3] |> Matrix)')'

1×10 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
 2.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0

In [122]:
_, ntypes = size(mdata) .- 1
year, month, startday = start
additional_factors = 3                    # for month, day, hour info
historical_factors = 24*dayslookback
ntype_factors = ntypes*historical_factors
total_factors = ntype_factors + additional_factors
n_datapoints = 24*ndays
X = zeros(n_datapoints, total_factors)
y = zeros(n_datapoints);
t0 = DateTime(year, month, startday, 0, 0)
dateidx = findall(x -> x== t0, mdata[:, :sampleT])[1];

In [123]:
hist = reduce(vcat, (mdata[dateidx:dateidx+historical_factors-1,2:end] |> Matrix)')'

1×72 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
 8.0  0.0  30.26  8.0  0.0  30.24  8.0  …  16.0  0.0  30.24  17.0  0.0  30.24

In [124]:
offset=1

1

In [125]:
[1 2 (mdata[1, 2:end] |> Vector)']

1×5 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
 1.0  2.0  2.0  0.0  30.2

In [126]:
for idx=1:n_datapoints
    next_point = dateidx+historical_factors+idx
    predict = next_point+offset-1
    predict_date = mdata[predict,:sampleT]
    X[idx, 1:total_factors] = [hist monthdayhour(predict_date)]
    y[idx] = mdata[predict,predicttype]
    hist = [hist[(1+ntypes):end]' (mdata[1, 2:end] |> Vector)']
end

In [127]:
X

240×75 Array{Float64,2}:
  8.0  0.0  30.26   8.0  0.0  30.24  …  17.0  0.0  30.24  1.0   3.0   1.0
  8.0  0.0  30.24   8.0  0.0  30.25      2.0  0.0  30.2   1.0   3.0   2.0
  8.0  0.0  30.25   7.0  0.0  30.24      2.0  0.0  30.2   1.0   3.0   3.0
  7.0  0.0  30.24   6.0  0.0  30.24      2.0  0.0  30.2   1.0   3.0   4.0
  6.0  0.0  30.24   5.0  0.0  30.25      2.0  0.0  30.2   1.0   3.0   5.0
  5.0  0.0  30.25   5.0  0.0  30.27  …   2.0  0.0  30.2   1.0   3.0   6.0
  5.0  0.0  30.27   5.0  0.0  30.29      2.0  0.0  30.2   1.0   3.0   7.0
  5.0  0.0  30.29   6.0  0.0  30.31      2.0  0.0  30.2   1.0   3.0   8.0
  6.0  0.0  30.31   7.0  0.0  30.32      2.0  0.0  30.2   1.0   3.0   9.0
  7.0  0.0  30.32   9.0  0.0  30.33      2.0  0.0  30.2   1.0   3.0  10.0
  9.0  0.0  30.33  11.0  0.0  30.31  …   2.0  0.0  30.2   1.0   3.0  11.0
 11.0  0.0  30.31  13.0  0.0  30.27      2.0  0.0  30.2   1.0   3.0  12.0
 13.0  0.0  30.27  14.0  0.0  30.25      2.0  0.0  30.2   1.0   3.0  13.0
  ⋮          

In [128]:
function build_multi_data(mdata, predicttype, start, dayslookback, ndays; offset=1)
    _, ntypes = size(mdata) .- 1
    year, month, startday = start
    additional_factors = 3                    # for month, day, hour info
    historical_factors = 24*dayslookback
    ntype_factors = ntypes*historical_factors
    total_factors = ntype_factors + additional_factors
    n_datapoints = 24*ndays
    X = zeros(n_datapoints, total_factors)
    y = zeros(n_datapoints);
    t0 = DateTime(year, month, startday, 0, 0)
    dateidx = findall(x -> x== t0, mdata[:, :sampleT])[1];
    hist = reduce(vcat, (mdata[dateidx:dateidx+historical_factors-1,2:end] |> Matrix)')'
    for idx=1:n_datapoints
        next_point = dateidx+historical_factors+idx
        predict = next_point+offset-1
        predict_date = mdata[predict,:sampleT]
        X[idx, 1:total_factors] = [hist monthdayhour(predict_date)]
        y[idx] = mdata[predict,predicttype]
        hist = [hist[(1+ntypes):end]' (mdata[1, 2:end] |> Vector)']
    end
   return X,y
end

build_multi_data (generic function with 1 method)

In [73]:
start = 2018, 1, 2
dayslookback = 1
ndays = 10
predicttype = :temp

:temp

In [131]:
Xm, ym = build_multi_data(mdata, predicttype, start, dayslookback, ndays);