In [1]:
using DataFrames, CSV, Impute, Dates, Plots, Statistics, Interpolations

In [2]:
include("utils.jl");

## Choose data file

In [3]:
data_available = [
    "boston_09_12",
    "boston_13_15",
    "boston_16_17",
    "boston_18_20",
    "hanscom_18_20",
    "pittsfield_18_20"
];
choice = "boston_18_20";

In [4]:
df = CSV.File(string("data/", choice, ".csv")) |> DataFrame;

In [5]:
dfdaily = filter(row -> strip(row[:REPORT_TYPE]) == "SOD", df);
dfh = filter(row -> !(strip(row[:REPORT_TYPE]) in ["SOD", "SOM"]), df)
sum(ismissing.(dfh[:, :HourlyDryBulbTemperature]))

0

In [6]:
factors =  [(:HourlyDryBulbTemperature,:temp), (:HourlyPrecipitation,:prec), (:HourlySeaLevelPressure,:pres)];

In [7]:
mdata = selectdata(dfh, factors);
first(mdata, 4)

Unnamed: 0_level_0,sampleT,temp,prec,pres
Unnamed: 0_level_1,DateTime,Float64,Float64,Float64
1,2018-01-01T01:00:00,2.0,0.0,30.2
2,2018-01-01T02:00:00,1.0,0.0,30.21
3,2018-01-01T03:00:00,1.0,0.0,30.21
4,2018-01-01T04:00:00,1.0,0.0,30.2


## Define characteristic of training set

In [43]:
start = 2018, 1, 2
dayslookback = 7
ndays = 365-dayslookback-1
predicttype = :temp
offset = 25;

In [44]:
X, y = build_multi_data(mdata, predicttype, start, dayslookback, ndays; offset=offset);

In [45]:
X

8568×507 Array{Float64,2}:
  8.0   0.0  30.26     8.0   0.0  …  0.0      29.95    1.0  10.0   0.0
  8.0   0.0  30.24     8.0   0.0     0.0      29.93    1.0  10.0   1.0
  8.0   0.0  30.25     7.0   0.0     0.0      29.91    1.0  10.0   2.0
  7.0   0.0  30.24     6.0   0.0     0.0      29.91    1.0  10.0   3.0
  6.0   0.0  30.24     5.0   0.0     0.0      29.92    1.0  10.0   4.0
  5.0   0.0  30.25     5.0   0.0  …  0.0      29.92    1.0  10.0   5.0
  5.0   0.0  30.27     5.0   0.0     0.0      29.93    1.0  10.0   6.0
  5.0   0.0  30.29     6.0   0.0     0.0      29.93    1.0  10.0   7.0
  6.0   0.0  30.31     7.0   0.0     0.0      29.94    1.0  10.0   8.0
  7.0   0.0  30.32     9.0   0.0     0.0      29.96    1.0  10.0   9.0
  9.0   0.0  30.33    11.0   0.0  …  0.0      29.98    1.0  10.0  10.0
 11.0   0.0  30.31    13.0   0.0     0.0      30.0     1.0  10.0  11.0
 13.0   0.0  30.27    14.0   0.0     0.0      30.0     1.0  10.0  12.0
  ⋮                               ⋱               

In [46]:
start = 2019, 1, 1
dayslookback = 7
ndays = 365-dayslookback-1
predicttype = :temp
test_X, test_y = build_multi_data(mdata, predicttype, start, dayslookback, ndays; offset=offset);

In [47]:
start = 2020, 1, 1
dayslookback = 7
ndays = 300
predicttype = :temp
test_X2, test_y2 = build_multi_data(mdata, predicttype, start, dayslookback, ndays; offset=offset);

In [53]:
(train_X, train_y), (test_X, test_y) = IAI.split_data(:regression, X, y);

In [49]:
@time begin
grid = IAI.GridSearch(
    IAI.OptimalTreeRegressor(
        random_seed=123,
        show_progress=false
    ),
    max_depth = 5:10,
    minbucket = 10, #10:10:100
    show_progress=false  
)
IAI.fit!(grid, train_X, train_y)
end

└ @ IAILicensing C:\Users\iai\builds\InterpretableAI\SysImgBuilder\.julia\packages\IAILicensing\x1CT6\src\precompile.jl:19
│ 1d23b3eee464c21ff412aa6a5b247b9f0543ded03e05d39d863fbfc9241e7a1f
└ @ IAILicensing C:\Users\iai\builds\InterpretableAI\SysImgBuilder\.julia\packages\IAILicensing\x1CT6\src\precompile.jl:29


9964.579932 seconds (31.60 M allocations: 2.685 GiB, 0.11% gc time)


All Grid Results:

│ Row │ show_progress │ minbucket │ max_depth │ cp          │ train_score │
│     │ [90mBool[39m          │ [90mInt64[39m     │ [90mInt64[39m     │ [90mFloat64[39m     │ [90mFloat64[39m     │
├─────┼───────────────┼───────────┼───────────┼─────────────┼─────────────┤
│ 1   │ false         │ 10        │ 5         │ 0.000276064 │ 0.853147    │
│ 2   │ false         │ 10        │ 6         │ 7.95869e-5  │ 0.88024     │
│ 3   │ false         │ 10        │ 7         │ 6.62916e-6  │ 0.912189    │
│ 4   │ false         │ 10        │ 8         │ 5.29251e-5  │ 0.930317    │
│ 5   │ false         │ 10        │ 9         │ 4.0264e-5   │ 0.942335    │
│ 6   │ false         │ 10        │ 10        │ 1.28649e-5  │ 0.953296    │

│ Row │ valid_score │ rank_valid_score │
│     │ [90mFloat64[39m     │ [90mInt64[39m            │
├─────┼─────────────┼──────────────────┤
│ 1   │ 0.810456    │ 6                │
│ 2   │ 0.829235    │ 5                │
│ 3   │ 0.849655    │

In [54]:
lnr = IAI.get_learner(grid)
IAI.score(lnr, test_X, test_y)  # Testing on the same year

0.9441814168562882

In [55]:
start = 2019, 1, 1
ndays = 365-dayslookback-1
test_X1, test_y1 = build_multi_data(mdata, predicttype, start, dayslookback,
 ndays; offset=offset);

start = 2020, 1, 1
ndays = 300 # since we don't have days until the end
test_X2, test_y2 = build_multi_data(mdata, predicttype, start, dayslookback, 
    ndays; offset=offset)

([40.0 0.0 … 9.0 0.0; 40.0 0.0 … 9.0 1.0; … ; 53.0 0.0 … 3.0 22.0; 53.0 0.0 … 3.0 23.0], [29.0, 28.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 23.0, 25.0  …  41.0, 41.0, 40.0, 38.0, 37.0, 35.0, 34.0, 33.0, 33.0, 33.0])

In [56]:
score1 = IAI.score(lnr, test_X1, test_y1)  
score2 = IAI.score(lnr, test_X2, test_y2)

0.5693326519703015

In [57]:
score1

0.6544199474139569