# Create stratified training sets

The goal of this notebook is to create a stratified training set. It is also used to filter scenarios, for example by their maximum deformation.

In [1]:
# Move to the parent directory first, so that "." below refers to that directory.
cd("..")

using Pkg
# Activate the project environment located in the current working directory.
Pkg.activate(".")

[32m[1m  Activating[22m[39m project at `/mnt/NGI_disks/ebr/P/2022/01/20220127/Calculations/AP3/models/tsunami-inundation-emulator`


In [2]:
# Packages used throughout the notebook.
using Plots, DelimitedFiles, Distributed, Dates, DataFrames, Distributions, StatsPlots, StatsBase, CSV, Random
# Select the ggplot2 plot theme for the figures that follow.
theme(:ggplot2)

# Calculate statistics.

The first part of this notebook calculates basic statistics for each scenario listed in a generated text file.

In [3]:
# Show the current working directory.
pwd()

"/mnt/NGI_disks/ebr/P/2022/01/20220127/Calculations/AP3/models/tsunami-inundation-emulator"

In [4]:
# Start extra worker processes for data loading (each launched with the
# "--project" flag), then load the data reader code on every process.
n_loader_procs = 3
addprocs(n_loader_procs; exeflags="--project")
@everywhere include("scripts/datareader.jl")

In [5]:
# Input locations and index ranges handed to the data reader.
data_dir = "/data_large/stg/UMA_download/"
grid_file = "/data_large/grids/Catania/C_CT.grd"
batch_size = 100
ct_slice = (1:912, 1:2224)
ts_slice = (30:45, 1:480)  # gauge number, time

# An optional "scenarios_file" entry (e.g. "data/train_test_1000_strat/train_test.txt")
# is left out of this configuration.
config = Dict{String,Any}(
    "data_dir" => data_dir,
    "grid_file" => grid_file,
    "batch_size" => batch_size,
    "ct_slice" => ct_slice,
    "ts_slice" => ts_slice,
)

Dict{String, Any} with 5 entries:
  "grid_file"  => "/data_large/grids/Catania/C_CT.grd"
  "batch_size" => 100
  "ct_slice"   => (1:912, 1:2224)
  "data_dir"   => "/data_large/stg/UMA_download/"
  "ts_slice"   => (30:45, 1:480)

In [7]:
# Build a reader from the shared configuration on every process.
@everywhere reader = DataReader.Reader($config)

scenarios = DataReader.scenarios("article_data/all_UMABS_shuf.txt")

@info "Load batches with scenarios."
# Remote channel, backed by a Channel of capacity 4, that is handed to the readers.
batches = RemoteChannel(() -> Channel(4))

# Launch the reader on each worker without waiting for a result.
foreach(workers()) do worker
    remote_do(reader, worker, scenarios, batches)
end

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mReads epoch: 1
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mLoad batches with scenarios.


      From worker 2:	[90m    @ [39m[35mDistributed[39m [90m./[39m[90m[4mtask.jl:429[24m[39m[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mload_batches
      From worker 3:	[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mload_batches
      From worker 4:	[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mload_batches


In [29]:
# Collect per-scenario summary statistics from the loaded batches.
#
# `mask` flags every grid cell with a positive flow depth in at least one scenario.
# `df` receives one row per scenario read from `batches`.
mask = falses(length(ct_slice[1]), length(ct_slice[2]))
df = DataFrame(
    scenario = String[],
    max_flow_depth = Float32[],
    inundated_pixels = Float32[],
    max_eta = Float32[],
    max_deformation = Float32[],
)

nr_of_scenarios_left = 15000
while nr_of_scenarios_left > 0
    println(nr_of_scenarios_left)
    batch = take!(batches)

    # Never take more scenarios from a batch than are still requested.
    nr_from_batch = min(reader.batch_size, nr_of_scenarios_left)

    # `@views` turns the slices below into non-copying views, so the full-grid
    # arrays are not duplicated several times for every scenario.
    @views for k in 1:nr_from_batch
        flow_depth = batch.flow_depths[:, :, :, k]

        # Update the mask in place instead of allocating a new one each time.
        mask .|= batch.flow_depths[:, :, 1, k] .> 0.

        push!(df, (
            scenario = batch.scenario_names[k],
            max_flow_depth = maximum(flow_depth),
            inundated_pixels = count(h -> (h > 0.001), flow_depth),
            max_eta = maximum(abs, batch.etas[:, :, :, k]),
            max_deformation = maximum(
                abs, batch.deformed_topographies[:, :, 1, k] - reader.topography
            ),
        ))
    end
    nr_of_scenarios_left -= nr_from_batch
end

15000
14900
14800
14700
14600
14500
14400
14300
14200
14100
14000
13900
13800
13700
13600
13500
13400
13300
13200
13100
13000
12900
12800
12700
12600
12500
12400
12300
12200
12100
12000
11900
11800
11700
11600
11500
11400
11300
11200
11100
11000
10900
10800
10700
10600
10500
10400
10300
10200
10100
10000
9900
9800
9700
9600
9500
9400
9300
9200
9100
9000
8900
8800
8700
8600
8500
8400
8300
8200
8100
8000
7900
7800
7700
7600
7500
7400
7300
7200
7100
7000
6900
6800
6700
6600
6500
6400
6300
6200
6100
6000
5900
5800
5700
5600
5500
5400
5300
5200
5100
5000
4900
4800
4700
4600
4500
4400
4300
4200
4100
4000
3900
3800
3700
3600
3500
3400
3300
3200
3100
3000
2900
2800
2700
2600
2500
2400
2300
2200
2100
2000
1900
1800
1700
1600
1500
1400
1300
1200
1100
1000
900
800
700
600
500


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mReads epoch: 2


400
300
200
100


In [33]:
# Drop duplicated rows. The loading loop requests a fixed number of scenarios, and its
# log shows the reader starting a second epoch before that number was reached.
df = unique(df)

Unnamed: 0_level_0,scenario
Unnamed: 0_level_1,String
1,9_BS_1474/0478_E01522N3585E03578N2685-BS-M774_E02353N3310_D010_S202D50R090_A003139_S050
2,4_BS_1474/0161_E01108N3882E01527N3690-BS-M774_E01506N3760_D010_S247D30R270_A005608_S028
3,3_BS_1474/1142_E01446N4259E02038N3783-BS-M774_E02009N3850_D107_S337D50R270_A003139_S050
4,4_BS_1474/0751_E01108N3882E01527N3690-BS-M754_E01509N3782_D010_S067D30R270_A003524_S022
5,10_BS_1469/0475_E01267N3753E01646N3535-BS-M809_E01491N3670_D010_S022D50R090_A006995_S075
6,5_BS_1474/0701_E01108N3882E01527N3690-BS-M774_E01477N3760_D010_S247D30R270_A005608_S028
7,3_BS_1474/0101_E01964N3926E02184N3685-BS-M707_E02022N3782_D010_S337D50R090_A001194_S013
8,7_BS_1474/0127_E01267N3753E01646N3535-BS-M732_E01523N3692_D076_S202D50R270_A001180_S030
9,BS_1474/0288_E01267N3753E01646N3535-BS-M732_E01495N3692_D076_S067D50R090_A001180_S030
10,BS_1474/0765_E01267N3753E01646N3535-BS-M754_E01495N3692_D010_S022D30R270_A001972_S039


In [37]:
# Split the scenarios by maximum deformation.
# The two subsets are complementary: a value exactly at the threshold goes to
# `df_large`, so no scenario with a valid value is left out of both subsets.
deformation_threshold = 0.01
df_small = df[df.max_deformation .< deformation_threshold, :]
df_large = df[df.max_deformation .>= deformation_threshold, :]

Unnamed: 0_level_0,scenario
Unnamed: 0_level_1,String
1,4_BS_1474/0161_E01108N3882E01527N3690-BS-M774_E01506N3760_D010_S247D30R270_A005608_S028
2,4_BS_1474/0751_E01108N3882E01527N3690-BS-M754_E01509N3782_D010_S067D30R270_A003524_S022
3,10_BS_1469/0475_E01267N3753E01646N3535-BS-M809_E01491N3670_D010_S022D50R090_A006995_S075
4,5_BS_1474/0701_E01108N3882E01527N3690-BS-M774_E01477N3760_D010_S247D30R270_A005608_S028
5,7_BS_1474/0127_E01267N3753E01646N3535-BS-M732_E01523N3692_D076_S202D50R270_A001180_S030
6,BS_1474/0288_E01267N3753E01646N3535-BS-M732_E01495N3692_D076_S067D50R090_A001180_S030
7,BS_1474/0765_E01267N3753E01646N3535-BS-M754_E01495N3692_D010_S022D30R270_A001972_S039
8,5_BS_1474/0295_E01267N3753E01646N3535-BS-M754_E01495N3692_D172_S247D10R270_A001972_S039
9,2_BS_1474/0410_E01267N3753E01646N3535-BS-M774_E01575N3670_D107_S022D50R270_A003139_S050
10,10_BS_1469/1298_E01470N3877E01631N3641-BS-M732_E01579N3692_D115_S157D10R270_A002108_S017


In [38]:
## Write files
out_dir = "/home/ebr/projects/tsunami-inundation-emulator/article_data/BS_events"

# Write the `scenario` column of `frame` to `path`, one name per line.
function write_scenario_list(path, frame)
    open(path, "w") do io
        foreach(name -> println(io, name), frame.scenario)
    end
end

# Large-deformation scenarios: plain name list plus the full table.
write_scenario_list(joinpath(out_dir, "large_deformation.txt"), df_large)
CSV.write(joinpath(out_dir, "df_large.csv"), df_large)

# Small-deformation scenarios: plain name list plus the full table.
write_scenario_list(joinpath(out_dir, "small_deformation.txt"), df_small)
CSV.write(joinpath(out_dir, "df_small.csv"), df_small)

"/home/ebr/projects/tsunami-inundation-emulator/article_data/BS_events/df_small.csv"

In [39]:
# Write the complete statistics table to the same output directory.
CSV.write(joinpath(out_dir, "df.csv"), df)

"/home/ebr/projects/tsunami-inundation-emulator/article_data/BS_events/df.csv"

Command-line call that uses the small-deformation scenario list written above:
```terminal
julia --project make-model-summary.jl --scenarios /home/ebr/projects/tsunami-inundation-emulator/article_data/BS_events/small_deformation.txt --eval_dir evaluation/BS_events
```

In [None]:
# Correlation plot of the `max_eta` and `inundated_pixels` columns.
@df df corrplot([:max_eta :inundated_pixels])

# Select training set.

This part of the notebook creates training sets stratified by the (squared) maximum amplitude of the incoming wave.

In [None]:
# Sort the scenarios by maximum amplitude. Code that addresses histogram bins through
# cumulative bin counts needs the rows in this order.
# (Assumes `max_eta` is non-negative, so this order also sorts the squared values.)
sort!(df, :max_eta)

# Stratification variable: the squared maximum amplitude.
df.square_max_eta = abs2.(df.max_eta)

nbins = 15
max_samples_per_bin = 70

h = fit(Histogram, df.square_max_eta, nbins=nbins)
# The histogram is over the squared amplitude, so the axis is labelled accordingly.
p = plot(h, label="Total", xlabel="Max offshore wave amplitude squared", ylabel="Nr. of scenarios")

In [None]:
# Number of scenarios in each histogram bin.
h.weights

In [None]:
# Cumulative bin counts with a leading 1: for rows sorted by the binned variable,
# bounds[i+1] is the index of the last row of bin i.
# NOTE(review): for i > 1, bounds[i] is the last row of the previous bin, not the first row of bin i.
bounds = prepend!(cumsum(h.weights),1)

In [None]:
# Mark the training scenarios: draw up to `max_samples_per_bin` rows, without
# replacement, from every histogram bin.
#
# Relies on `df` being sorted by the binned variable, so that bin i occupies the
# contiguous rows bounds[i]+1 : bounds[i+1]. Do not shuffle `df` before this step.
df[!, :is_train] = falses(nrow(df))

# Cumulative bin counts with a leading 0: bounds[i] is the last row before bin i and
# bounds[i+1] is the last row of bin i, so the row ranges of different bins never overlap.
bounds = prepend!(cumsum(h.weights), 0)
for (i, nr_in_bin) in enumerate(h.weights)
    # An empty bin has no rows to draw from.
    nr_in_bin == 0 && continue

    first_row, last_row = bounds[i] + 1, bounds[i+1]
    # Never request more samples than the bin holds.
    nr_of_samples = min(max_samples_per_bin, nr_in_bin)
    println("Nr of samples: $nr_of_samples. Bounds: $first_row, $last_row")
    rows = sample(first_row:last_row, nr_of_samples; replace = false, ordered = true)
    println("Selected rows: $rows")
    df.is_train[rows] .= true
end

In [None]:
# Scratch check: draw both elements of 9:10 without replacement, returned in order.
sample(9:10, 2; replace = false, ordered = true)

In [None]:
# Overlay the selected training scenarios on the histogram of all scenarios.
# `p` shows the squared amplitude binned by `h`, so the overlay uses the same
# variable and the same bin edges.
p = histogram!(p, df[df.is_train, :square_max_eta], bins=h.edges[1], label="Train")

# Create output directory, named after the number of training scenarios.
out_dir = "article_data_X/train_$(count(df.is_train))"
println(out_dir)
mkpath(out_dir)  # does nothing when the directory already exists

# Save and show the combined figure `p`.
savefig(p, joinpath(out_dir, "total-train.svg"))
display(p)

In [None]:


# Distribution of the training scenarios over three summary statistics.
train_rows = df[df.is_train .== true, :]

# One 10-bin, unnormalised histogram of a column of the training rows.
train_hist(column, axis_label) =
    histogram(train_rows[!, column]; nbins=10, xlabel=axis_label, normalize=false)

h1 = train_hist(:max_flow_depth, "Maximum inundation height")
h2 = train_hist(:inundated_pixels, "Number of inundated pixels")
h3 = train_hist(:max_eta, "Maximum amplitude")

# Stack the three panels vertically, then save and show the figure.
p = plot(h1, h2, h3, layout=(3, 1), legend=false)
savefig(p, joinpath(out_dir, "train_distribution.svg"))
display(p)

In [None]:
# Split the table into the selected training scenarios and the remaining ones.
selected = df.is_train .== true
df_train = df[selected, :]
df_remain = df[.!selected, :]

# Randomise the row order of each part (training part first).
df_train = df_train[shuffle(1:nrow(df_train)), :]
df_remain = df_remain[shuffle(1:nrow(df_remain)), :]

## Write files

# Training scenarios: one scenario name per line, plus the full table.
open(joinpath(out_dir, "train.txt"), "w") do io
    foreach(name -> println(io, name), df_train.scenario)
end
CSV.write(joinpath(out_dir, "df_train.csv"), df_train)

# Remaining scenarios: one scenario name per line, plus the full table.
open(joinpath(out_dir, "train_remain.txt"), "w") do io
    foreach(name -> println(io, name), df_remain.scenario)
end
CSV.write(joinpath(out_dir, "df_remain.csv"), df_remain)

To save the notebook for later:
```terminal
jupyter nbconvert --to html create-model.ipynb
```

## Load from file

In [None]:
# Read dataframes from file.
dataset_dir = "article_data/train_591"

# Read one CSV table from the dataset directory.
read_table(filename) = CSV.read(joinpath(dataset_dir, filename), DataFrame)

df_train = read_table("df_train.csv")
df_remain = read_table("df_remain.csv")

# Recombine both parts into a single table, training rows first.
df = vcat(df_train, df_remain)

#df = CSV.read(joinpath("article_data_X","df.csv"), DataFrame)

In [None]:
#CSV.write(joinpath("article_data_X","df.csv"), df)

## Make dataset distribution plot.

In [None]:
# List the column names of the table.
names(df)

In [None]:
theme(:default)

# Settings shared by both histograms.
n_hist_bins = 15
count_axis_limits = (0.5, 10000)

# All scenarios ("Basis"), with the counts on a logarithmic axis.
p = @df df histogram(
    :square_max_eta,
    xlabel="Max offshore wave amplitude squared", ylabel="Nr. of scenarios", label="Basis",
    nbins=n_hist_bins, alpha=0.8, dpi=300,
    ylim=count_axis_limits, yaxis=:log10,
)

# Overlay the training subset on the same axes.
train_subset = df[df.is_train .== true, :]
@df train_subset histogram!(
    p, :square_max_eta,
    label="Train", nbins=n_hist_bins, yaxis=:log10, ylim=count_axis_limits,
)

display(p)
savefig(p, joinpath(dataset_dir,"selection_by_bin.png"))

In [None]:
# Divisor applied to the inundated area before it is stacked with the other quantities.
scale = 3

# Pixel count -> area: multiply by 100 and divide by 1000*1000
# (presumably 100 m² per pixel, giving km² — TODO confirm the grid resolution).
df.inundated_area = df.inundated_pixels .* 100 ./ (1000 * 1000)
df.inundated_area_scaled = df.inundated_area ./ scale

# Long-format tables (variable/value pairs) for all rows and for the training rows only.
cols = [:max_flow_depth, :max_eta, :inundated_area_scaled]
train_only = df.is_train .== true
df_stacked = stack(df[!, cols])
df_train_stacked = stack(df[train_only, cols])

# Tag each long table with the group label.
df_stacked[!, :is_train] .= "Basis"
df_train_stacked[!, :is_train] .= "Train"

In [None]:
# Inspect the scaled inundated-area column.
df.inundated_area_scaled

In [None]:
# Append the training-only long table to the long table of all scenarios.
df_stacked_wtrain = vcat(df_stacked, df_train_stacked)

In [None]:
# Display names for the column names that appear in the `variable` column.
category_mapping = Dict{String,String}(
    "max_flow_depth" => "Max flow depth",
    "max_eta" => "Max wave amplitude",
    "square_max_eta" => "Max wave amplitude squared",
    "inundated_area_scaled" => "Inundated area",
)

# Look up the display name of every entry; a name missing from the mapping raises a KeyError.
df_stacked_wtrain.map_variable =
    map(Base.Fix1(getindex, category_mapping), df_stacked_wtrain.variable)

In [None]:
# Grouped box plot of the stacked values: boxes are placed by display name and
# split by the group label in `is_train`.
p = groupedboxplot(
    df_stacked_wtrain.map_variable, 
    df_stacked_wtrain.value, 
    group = df_stacked_wtrain.is_train,
    alpha=1.,
    markersize=2.5,
    legend=:top,
    bar_width = 0.8,
    ylabel="Meter",
    outliers=true,
    right_margin=15Plots.mm,
    dpi=300
)

# Add a secondary y-axis whose tick positions and labels are the primary axis ticks
# multiplied by `scale`.
# NOTE(review): the limits of this axis are hard-coded; presumably chosen so the rescaled
# ticks line up with the primary axis — confirm whenever the data or `scale` change.
plot!(twinx(p),
    xticks=:none,
    ylim=[-1,42.5],
yticks = (yticks(p)[1][1]*scale, map(x -> string(x), yticks(p)[1][1]*scale)),
    ylabel="Square kilometer"
)
# NOTE(review): the output file name is spelled "distibution"; confirm nothing depends on it before renaming.
savefig(p, joinpath(dataset_dir,"parameter_distibution_box.png"))
display(p)