# Sensor shifts

Make sure sensor locations are stable over time. Most are, but some are not. Write out a file listing the sensors that are stable, so that all other sensors can be dropped from the analysis.

TODO one file from d03 is empty - but shouldn't matter since it's the last one in 2015, and there's a new one in March 2016, and we're not using any data from pre-March 2016.

In [1]:
using CSV, DataFrames, DataFramesMeta, Logging, ProgressMeter, Geodesy, Dates, StatsBase

In [2]:
# Keep only the per-district metadata text files found in the metadata directory.
files = filter(
    name -> occursin(r"^d.*_text_meta_.*\.txt", name),
    readdir("../data/meta/"),
)
# end the cell with `nothing` so the file list is not echoed
nothing

## Figure out which dates to read

We want to read all metadata files from 2016 or later, plus the last file dated on or before 2016-01-01, so we have valid metadata for the entire analysis period.

In [3]:
# Map each district number (as a string, leading zero stripped by the regex)
# to the dates of all metadata files available for that district.
dates_by_district = Dict{String, Vector{Date}}()

for file in files
    m = match(r"^d0?([1-9][0-9]?)_text_meta_([0-9]{4})_([0-9]{2})_([0-9]{2}).txt", file)
    district = String(m[1])
    # capture groups 2-4 are year, month, day
    file_date = Date(parse.(Int64, (m[2], m[3], m[4]))...)
    # get! inserts an empty vector the first time a district is seen
    push!(get!(dates_by_district, district, Date[]), file_date)
end

In [4]:
# For every district, keep the latest file dated on or before 2016-01-01
# together with every later file.
dates_to_retain_by_district = Dict{String, Set{Date}}()

for (district, dates) in dates_by_district
    # Latest date not after the cutoff; falls back to 1970-01-01 when the
    # district has no such file, in which case every date is retained.
    first_kept = reduce(
        max,
        (d for d in dates if d <= Date(2016, 1, 1));
        init=Date(1970, 1, 1),
    )

    dates_to_retain_by_district[district] = Set{Date}(d for d in dates if d >= first_kept)
end

In [5]:
# Read every retained metadata file, keep the columns of interest, tag each
# row with the date of the file it came from, and stack everything together.
all_meta = let
    frames = DataFrame[]

    for file in files
        m = match(r"^d0?([1-9][0-9]?)_text_meta_([0-9]{4})_([0-9]{2})_([0-9]{2}).txt", file)
        file_date = Date(parse(Int64, m[2]), parse(Int64, m[3]), parse(Int64, m[4]))

        # skip files whose date was not selected for this district
        file_date in dates_to_retain_by_district[m[1]] || continue

        meta = CSV.read(joinpath("../data/meta", file), DataFrame)
        select!(meta, [:ID, :Fwy, :Dir, :Latitude, :Longitude, :District, :Lanes])
        meta[!, :date] .= file_date
        push!(frames, meta)
    end

    vcat(frames...)
end
# end the cell with `nothing` so the combined table is not echoed
nothing

└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613
└ @ CSV /Users/mwbc/.julia/packages/CSV/Zl2ww/src/file.jl:613


## Compute station-level statistics

Make sure that freeway, direction, and number of lanes are stable, and that location did not shift by more than 100 meters.

In [6]:
"""
    max_shift(lats, lons)

Return the largest pairwise distance between the positions described by the
parallel collections `lats` and `lons`, as computed by `euclidean_distance`
on `LLA` points at altitude 0 (the caller stores the result as meters).

Entries where *either* coordinate is `missing` are ignored. The result is
`0.0` when fewer than two complete positions remain.

`lats` and `lons` must have the same length.
"""
function max_shift(lats, lons)
    @assert length(lats) == length(lons)

    # Build each position once. A position is only usable when both of its
    # coordinates are present, so drop it if either one is missing.
    positions = [
        LLA(lat, lon, 0) for (lat, lon) in zip(lats, lons) if
        !ismissing(lat) && !ismissing(lon)
    ]

    # Float accumulator keeps the return type the same whether or not any
    # shift is found.
    largest = 0.0
    for i in eachindex(positions)
        # the distance is symmetric and zero for i == j, so only look at j > i
        for j in (i + 1):lastindex(positions)
            dist = euclidean_distance(positions[i], positions[j])
            if dist > largest
                largest = dist
            end
        end
    end
    return largest
end

# One row per station ID: stability flags for freeway, direction and lane
# count, the largest location shift, and the last recorded value of each
# descriptive column.
station_stats = let
    # true when the column holds a single distinct value for the station
    is_constant(values) = length(unique(values)) == 1

    # Save representative values so we have them for all sensors. This table
    # is used to identify the lat/lons of sensors in the final dataset; some
    # sensors may not appear in one particular metadata file, so take them
    # from the combination of all files.
    representative = [:Latitude, :Longitude, :Fwy, :Dir, :District, :Lanes]

    combine(
        groupby(all_meta, :ID),
        :Fwy => is_constant => :fwy_stable,
        :Dir => is_constant => :dir_stable,
        :Lanes => is_constant => :lanes_stable,
        [:Latitude, :Longitude] => max_shift => :max_shift_meters,
        (col => last => col for col in representative)...
    )
end

Unnamed: 0_level_0,ID,fwy_stable,dir_stable,lanes_stable,max_shift_meters,Latitude,Longitude
Unnamed: 0_level_1,Int64,Bool,Bool,Bool,Float64,Float64?,Float64?
1,311831,1,1,1,59.5162,38.4098,-121.484
2,311832,1,1,1,301.266,38.4098,-121.484
3,311844,1,1,1,80.6189,38.4128,-121.484
4,311845,1,1,1,0.0,38.4062,-121.483
5,311847,1,1,1,91.9547,38.4283,-121.488
6,311864,1,1,1,394.968,38.4246,-121.487
7,311903,1,1,1,1812.91,38.5669,-121.506
8,311930,1,1,1,1799.79,38.5669,-121.506
9,311973,1,1,1,406.343,38.5642,-121.496
10,311974,1,1,1,207.906,38.5642,-121.496


In [7]:
# share of stations with a single freeway value across the retained files
mean(station_stats[!, :fwy_stable])

0.9954039671020803

In [8]:
# share of stations with a single direction value across the retained files
mean(station_stats[!, :dir_stable])

0.9888727624576681

In [9]:
# share of stations with a single lane count across the retained files
mean(station_stats[!, :lanes_stable])

0.9625544267053701

In [10]:
# share of stations whose largest location shift is under 100
mean(station_stats[!, :max_shift_meters] .< 100)

0.9528785679729076

In [11]:
# share of stations that pass all four stability checks at once
let s = station_stats
    attributes_stable = s.fwy_stable .& s.dir_stable .& s.lanes_stable
    location_stable = s.max_shift_meters .< 100
    mean(attributes_stable .& location_stable)
end

0.9118045476536043

## Extract metadata for good sensors

This will be used to filter the sensor data to exclude the sensors that are unstable.

In [12]:
# Keep only the stations that pass every stability check.
good_sensor_meta = let s = station_stats
    attributes_stable = s.fwy_stable .& s.dir_stable .& s.lanes_stable
    location_stable = s.max_shift_meters .< 100
    s[attributes_stable .& location_stable, :]
end

Unnamed: 0_level_0,ID,fwy_stable,dir_stable,lanes_stable,max_shift_meters,Latitude,Longitude
Unnamed: 0_level_1,Int64,Bool,Bool,Bool,Float64,Float64?,Float64?
1,311831,1,1,1,59.5162,38.4098,-121.484
2,311844,1,1,1,80.6189,38.4128,-121.484
3,311845,1,1,1,0.0,38.4062,-121.483
4,311847,1,1,1,91.9547,38.4283,-121.488
5,312010,1,1,1,9.62967,38.5638,-121.493
6,312098,1,1,1,85.7125,38.559,-121.476
7,312103,1,1,1,12.8604,38.561,-121.463
8,312132,1,1,1,9.56544,38.4098,-121.484
9,312133,1,1,1,53.2263,38.4282,-121.488
10,312134,1,1,1,16.0884,38.4124,-121.484


In [13]:
# Write the table of stable sensors (with their stability flags and
# representative metadata) to disk as CSV.
CSV.write("../data/good_sensors.csv", good_sensor_meta)

"../data/good_sensors.csv"