In [14]:
using DataFrames, CSV, Impute, Dates, Plots, Statistics, Interpolations

In [3]:
# Read the raw CSV export into a DataFrame (trailing `;` suppresses the cell output).
df = DataFrame(CSV.File("boston.csv"));

In [4]:
# Split the raw table by report type: "SOD" rows go to `dfdaily`; every row that is
# neither "SOD" nor "SOM" goes to `dfh`.
dfdaily = filter(r -> strip(r[:REPORT_TYPE]) == "SOD", df);
dfh = filter(r -> strip(r[:REPORT_TYPE]) ∉ ("SOD", "SOM"), df)
# Number of missing temperature entries among the remaining rows.
count(ismissing, dfh[:, :HourlyDryBulbTemperature])

0

In [11]:
"""
    clean(x)

Convert one raw cell value to `Float64`, or `missing` when no number can be found.

- `missing` is returned unchanged.
- Any real number is converted to `Float64` (a `Float64` comes back as-is).
- A string is searched for its first decimal number (optional sign, optional
  integer part followed by a dot, then digits); surrounding characters are
  ignored, so `"30.21s"` gives `30.21`. A string without digits gives `missing`.
"""
clean(x::Missing) = x
clean(x::Real) = Float64(x)
function clean(x::AbstractString)
    # `match` only accepts `String`/`SubString{String}`; `String(x)` makes other
    # string types (e.g. inline strings produced by CSV readers) work as well.
    m = match(r"[+-]?([0-9]*[.])?[0-9]+", String(x))
    return m === nothing ? missing : parse(Float64, m.match)
end
"""
    resample(dfh, factor)

Build an hourly series for column `factor` of `dfh`.

The column is parsed with `clean`, gaps are filled (`Impute.interp`, then
`Impute.locf`, then `Impute.nocb`), every `:DATE` is rounded to the nearest hour,
and all readings that land on the same hour are averaged.

Returns a new DataFrame with the columns `:rounded` (the hour) and `:data` (the
hourly mean). `dfh` itself is not modified.

Note: only hours that occur in the input appear in the result; an hour without
any observation is not inserted.
"""
function resample(dfh, factor)
    # Work on a copy of the two relevant columns so the caller's table keeps its raw values.
    timeseries = dfh[:, [:DATE, factor]]
    timeseries[!, factor] = clean.(timeseries[!, factor])
    timeseries = Impute.interp(timeseries) |> Impute.locf() |> Impute.nocb()
    # Drop the `Missing` part of the element type; throws if a gap could not be filled.
    timeseries[!, factor] = convert.(Float64, timeseries[!, factor])
    # One group per rounded hour, averaged.
    timeseries[!, :rounded] = map(d -> round(d, Dates.Hour), timeseries[!, :DATE])
    even_sampling = combine(groupby(timeseries, :rounded), factor => mean)
    rename!(even_sampling, Symbol(factor, "_mean") => :data)
    return even_sampling
end

resample (generic function with 1 method)

In [226]:
# Scratch check: `first` on a Dict yields one `key => value` pair, which destructures
# into `a, b`. Dict iteration order is not guaranteed, so which pair comes first may vary.
a,b = first(Dict(:d=>:b, :a=>:d))

:a => :d

In [225]:
# Show the current value of `b`.
# NOTE(review): this cell's execution counter (225) precedes the destructuring cell's (226),
# so the recorded output may come from an earlier version of that cell.
b

:b

In [256]:
"""
    selectdata(dfh, factors)

Resample several columns of `dfh` and merge them into one table.

`factors` is any non-empty iterable of `(column, newname)` entries (tuples or
pairs). Each `column` is passed to `resample`, and its hourly means are stored
under `newname`. The timestamp column of the result is called `:sampleT`.

Columns are attached by row position; this relies on every `resample` call on
the same `dfh` producing the same sequence of hours. For an unordered container
such as a `Dict`, the column order follows its iteration order.
"""
function selectdata(dfh, factors)
    factor, frename = first(factors)
    mdata = resample(dfh, factor)
    rename!(mdata, Dict(:data => frename, :rounded => :sampleT))
    # `Iterators.drop` works for any iterable, not only for indexable vectors.
    for (extra, extraname) in Iterators.drop(factors, 1)
        # `!` stores the freshly built column directly, without another copy.
        mdata[!, extraname] = resample(dfh, extra)[!, :data]
    end
    return mdata
end

selectdata (generic function with 1 method)

In [258]:
# (source column in `dfh`, column name in the merged table) pairs consumed by `selectdata`.
factors =  [(:HourlyDryBulbTemperature,:temp), (:HourlyPrecipitation,:prec), (:HourlySeaLevelPressure,:pres)];

In [261]:
# Build the merged hourly table for the selected columns.
mdata = selectdata(dfh, factors);

In [12]:
# NOTE(review): older cell (execution counter 12) that rebinds `factors` to bare column
# symbols. `selectdata` destructures every entry into two names, so it needs the
# `(column, newname)` form instead; re-running this cell before `selectdata` breaks it.
factors = [:HourlyDryBulbTemperature, :HourlyPrecipitation, :HourlySeaLevelPressure];

In [251]:
# Manual merge of the three hourly series: temperature provides the base table
# (timestamp + value); precipitation and pressure are attached by row position.
prec = resample(dfh, :HourlyPrecipitation)
pres = resample(dfh, :HourlySeaLevelPressure)
mdata = rename!(
    resample(dfh, :HourlyDryBulbTemperature),
    Dict(:data => :temp, :rounded => :sampleT),
)
mdata[!, :prec] = prec[:, :data]
mdata[!, :pres] = pres[:, :data];

In [34]:
# (row count - 1, column count - 1); the second entry is the number of non-timestamp columns.
size(mdata).-1

(25053, 3)

In [71]:
# Map a timestamp to the 1×3 row `[month day hour]`.
monthdayhour = x -> hcat(Dates.month(x), Dates.day(x), Dates.hour(x));

In [117]:
# Peek at the first five rows of columns 2 and 3.
mdata[1:5,2:3]

Unnamed: 0_level_0,temp,prec
Unnamed: 0_level_1,Float64,Float64
1,2.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
5,0.0,0.0


In [120]:
# Flatten those five rows, row by row, into a single 1×10 row vector
# (row 1's values first, then row 2's, and so on).
reduce(vcat, (mdata[1:5,2:3] |> Matrix)')'

1×10 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
 2.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0

In [266]:
# Prototype of the setup done inside `build_multi_data`, run at global scope.
# Reads `start`, `dayslookback` and `ndays`, which are defined in another cell.
hr = 0
# Number of data columns, i.e. all columns except the timestamp.
_, ntypes = size(mdata) .- 1
# NOTE(review): this binds globals named `year` and `month`, the same names Dates exports.
year, month, startday = start
additional_factors = 3                    # for month, day, hour info
# Length of the history window in rows (24 rows per look-back day).
historical_factors = 24*dayslookback
ntype_factors = ntypes*historical_factors
total_factors = ntype_factors + additional_factors
n_datapoints = 24*ndays
# One feature row and one target value per data point.
X = zeros(n_datapoints, total_factors)
y = zeros(n_datapoints);
t0 = DateTime(year, month, startday, hr, 0)
# Row index of the first timestamp equal to `t0`; indexing with `[1]` errors if there is none.
dateidx = findall(x -> x== t0, mdata[:, :sampleT])[1];

In [267]:
# Inspect the merged table (timestamp column followed by the data columns).
mdata

Unnamed: 0_level_0,sampleT,temp,prec,pres
Unnamed: 0_level_1,DateTime,Float64,Float64,Float64
1,2018-01-01T01:00:00,2.0,0.0,30.2
2,2018-01-01T02:00:00,1.0,0.0,30.21
3,2018-01-01T03:00:00,1.0,0.0,30.21
4,2018-01-01T04:00:00,1.0,0.0,30.2
5,2018-01-01T05:00:00,0.0,0.0,30.19
6,2018-01-01T06:00:00,0.0,0.0,30.2
7,2018-01-01T07:00:00,0.0,0.0,30.22
8,2018-01-01T08:00:00,0.0,0.0,30.24
9,2018-01-01T09:00:00,1.0,0.0,30.24
10,2018-01-01T10:00:00,4.0,0.0,30.25


In [268]:
# Initial history window: the `historical_factors` rows starting at `dateidx`, all columns
# except the timestamp, flattened row by row into a 1×N row vector.
hist = reduce(vcat, (mdata[dateidx:dateidx+historical_factors-1,2:end] |> Matrix)')'

1×72 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
 8.0  0.0  30.26  8.0  0.0  30.24  8.0  …  16.0  0.0  30.24  17.0  0.0  30.24

In [124]:
# Prediction offset in rows: with 1, the target is the row right after the history window.
offset=1

1

In [125]:
# Scratch check: scalars can be concatenated horizontally with one table row
# (timestamp dropped) once that row is turned into a row vector.
[1 2 (mdata[1, 2:end] |> Vector)']

1×5 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
 1.0  2.0  2.0  0.0  30.2

In [126]:
# Prototype of the main loop of `build_multi_data`, run on the globals set up in the
# earlier cells (`X`, `y`, `hist`, `dateidx`, `offset`, `predicttype`, ...).
for idx=1:n_datapoints
    # First row after the current history window.
    next_point = dateidx+historical_factors+idx-1
    # Row to predict; equals `next_point` when `offset` is 1.
    predict = next_point+offset-1
    predict_date = mdata[predict,:sampleT]
    # Features: flattened history followed by month, day, hour of the target timestamp.
    X[idx, 1:total_factors] = [hist monthdayhour(predict_date)]
    y[idx] = mdata[predict,predicttype]
    # Slide the window: drop the oldest row's `ntypes` values, append row `next_point`.
    hist = [hist[(1+ntypes):end]' (mdata[next_point, 2:end] |> Vector)']
end

In [209]:
"""
    build_multi_data(mdata, predicttype, start, dayslookback, ndays; offset=1, hour=0)

Build a sliding-window regression data set from the hourly table `mdata`.

# Arguments
- `mdata`: table whose first column is the timestamp column `:sampleT`; all
  remaining columns are used as numeric inputs.
- `predicttype`: name of the column whose value is the prediction target.
- `start`: `(year, month, day)` of the first history row.
- `dayslookback`: history length in days; `24 * dayslookback` rows are used.
- `ndays`: `24 * ndays` data points are produced.

# Keywords
- `offset`: distance in rows between the end of the history window and the
  target row; `1` means the row right after the window.
- `hour`: hour of day of the first history row.

# Returns
`(X, y)`. Row `i` of `X` holds the history window flattened row by row (all
columns of the oldest row first), followed by the month, day and hour of the
target timestamp. `y[i]` is the target value.

# Throws
- `ArgumentError` if the start timestamp is not in `mdata`, or if `mdata` has
  too few rows after it for the requested window, number of days and offset.

The window is counted in rows, so `mdata` is assumed to have exactly one row per
hour with no gaps.
"""
function build_multi_data(mdata, predicttype, start, dayslookback, ndays; offset=1, hour=0)
    nrows, ncols = size(mdata)
    ntypes = ncols - 1                        # every column except the timestamp
    # Distinct names so the Dates accessors `year`/`month` are not shadowed.
    startyear, startmonth, startday = start
    additional_factors = 3                    # for month, day, hour info
    historical_factors = 24 * dayslookback
    ntype_factors = ntypes * historical_factors
    total_factors = ntype_factors + additional_factors
    n_datapoints = 24 * ndays
    X = zeros(n_datapoints, total_factors)
    y = zeros(n_datapoints)

    times = mdata[!, :sampleT]
    target = mdata[!, predicttype]

    t0 = DateTime(startyear, startmonth, startday, hour, 0)
    dateidx = findfirst(isequal(t0), times)
    dateidx === nothing &&
        throw(ArgumentError("start time $t0 not found in column :sampleT"))

    # Last row that is read: end of the final window plus the furthest target row.
    lastrow = dateidx + historical_factors - 1
    if n_datapoints > 0
        lastrow += n_datapoints + max(offset - 1, 0)
    end
    lastrow <= nrows || throw(ArgumentError(
        "rows up to $lastrow are required but mdata has only $nrows rows"))

    # Transposed copy: column k holds all readings of row `dateidx + k - 1`, so a block
    # of consecutive columns read in memory order is the row-by-row flattened window.
    readings = permutedims(Matrix(mdata[dateidx:lastrow, 2:end]))

    for idx in 1:n_datapoints
        window = idx:(idx + historical_factors - 1)
        predict = dateidx + historical_factors + idx + offset - 2
        stamp = times[predict]
        X[idx, 1:ntype_factors] = vec(view(readings, :, window))
        X[idx, ntype_factors + 1] = Dates.month(stamp)
        X[idx, ntype_factors + 2] = Dates.day(stamp)
        X[idx, ntype_factors + 3] = Dates.hour(stamp)
        y[idx] = target[predict]
    end
    return X, y
end

build_multi_data (generic function with 1 method)

In [263]:
# Parameters for building the data set.
start = 2018, 1, 2        # (year, month, day) of the first history row
dayslookback = 1          # history window length in days
ndays = 10                # number of days of data points to generate
predicttype = :temp       # column of `mdata` used as the prediction target

:temp

In [215]:
# Build the feature matrix `Xm` and target vector `ym` with the default offset and start hour.
Xm, ym = build_multi_data(mdata, predicttype, start, dayslookback, ndays);

In [216]:
# Inspect the targets: one value per data point (24 * ndays entries).
ym

240-element Array{Float64,1}:
 19.0              
 18.0              
 19.0              
 18.0              
 18.0              
 18.0              
 17.0              
 17.0              
 17.0              
 20.0              
 21.0              
 23.0              
 25.0              
  ⋮                
 60.0              
 57.333333333333336
 57.25             
 58.5              
 58.5              
 57.5              
 57.0              
 59.0              
 60.0              
 60.0              
 57.75             
 57.0              

In [218]:
# First feature row: the flattened history window followed by month, day and hour
# of the target timestamp (the last three entries).
Xm[1,:]

75-element Array{Float64,1}:
  8.0 
  0.0 
 30.26
  8.0 
  0.0 
 30.24
  8.0 
  0.0 
 30.25
  7.0 
  0.0 
 30.24
  6.0 
  ⋮   
 17.0 
  0.0 
 30.23
 16.0 
  0.0 
 30.24
 17.0 
  0.0 
 30.24
  1.0 
  3.0 
  0.0 

In [208]:
# Cross-check against the table: rows 24 to 49 of `mdata`, to compare with the first
# feature row shown above.
mdata[24:49,:]

Unnamed: 0_level_0,sampleT,temp,prec,pres
Unnamed: 0_level_1,DateTime,Float64,Float64,Float64
1,2018-01-02T00:00:00,8.0,0.0,30.26
2,2018-01-02T01:00:00,8.0,0.0,30.24
3,2018-01-02T02:00:00,8.0,0.0,30.25
4,2018-01-02T03:00:00,7.0,0.0,30.24
5,2018-01-02T04:00:00,6.0,0.0,30.24
6,2018-01-02T05:00:00,5.0,0.0,30.25
7,2018-01-02T06:00:00,5.0,0.0,30.27
8,2018-01-02T07:00:00,5.0,0.0,30.29
9,2018-01-02T08:00:00,6.0,0.0,30.31
10,2018-01-02T09:00:00,7.0,0.0,30.32
