diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp index b41afdba6..fa96eeb36 100644 --- a/tracker-neuralnet/deadzone_filter.cpp +++ b/tracker-neuralnet/deadzone_filter.cpp @@ -3,6 +3,7 @@ #include "opencv_contrib.h" #include "unscented_trafo.h" +#include #include #include #include @@ -12,19 +13,25 @@ namespace neuralnet_tracker_ns using namespace cvcontrib; -using StateVec = cv::Vec; -using StateCov = cv::Matx; +// Number of degrees of freedom of position and rotation +static constexpr int dofs = 6; -static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<6>::num_sigmas; +using StateVec = cv::Vec; +using StateCov = cv::Matx; + +static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints::num_sigmas; +// Rescaling factor for position/size living in the space of the face crop. +// Applied prior to application of UKF to prevent numerical problems. static constexpr float img_scale = 200.f; +// Similar rescaling factor for position/size that live in world space. static constexpr float world_scale = 1000.f; // mm +// Fills the 6 DoF covariance factor, as in L L^T factorization. +// Covariance is given wrt the tangent space of current predictions StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face) { StateCov tril = StateCov::eye(); - tril(0,0) = face.center_stddev.x/img_scale; - tril(1,1) = face.center_stddev.y/img_scale; - tril(2,2) = face.size_stddev/img_scale; + set_minor<3,3>(tril, 0, 0, face.center_size_cov_tril / img_scale); set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril); return tril; } @@ -41,7 +48,7 @@ QuatPose apply_offset(const QuatPose& pose, const StateVec& offset) } -PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec& offset) +std::tuple apply_offset(const PoseEstimator::Face& face, const StateVec& offset) { const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] }); const auto r = face.rotation * dr; @@ -57,14 +64,10 @@ PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec // is designed to handle non-linearities like this. const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale); - return PoseEstimator::Face{ + return { r, - {}, - {}, p, - {}, sz, - {} }; } @@ -77,9 +80,9 @@ StateVec relative_to(const QuatPose& reference, const QuatPose& pose) } -ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array& sigmas) +ukf_cv::SigmaPoints relative_to(const QuatPose& pose, const std::array& sigmas) { - ukf_cv::SigmaPoints<6> out; + ukf_cv::SigmaPoints out; // Beware, the number of points is != the number of DoFs. std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) { return relative_to(pose, s); }); @@ -87,14 +90,14 @@ ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<6>& sigmas, Face2WorldFunction face2world) +std::array compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints& sigmas, Face2WorldFunction face2world) { std::array out; std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) { // First unpack the state vector and generate quaternion rotation w.r.t image space. - const auto sigma_face = apply_offset(face, sigma_point); + const auto [rotation, center, size] = apply_offset(face, sigma_point); // Then transform ... - QuatPose pose = face2world(sigma_face.rotation, sigma_face.center, sigma_face.size); + QuatPose pose = face2world(rotation, center, size); pose.pos /= world_scale; return pose; }); @@ -131,24 +134,32 @@ StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_c QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params) { - ukf_cv::MerweScaledSigmaPoints<6> unscentedtrafo; + ukf_cv::MerweScaledSigmaPoints unscentedtrafo; auto previous_pose = previous_pose_; previous_pose.pos /= world_scale; - // Here we have the covariance matrix for the offset from the observed values in `face`. + // Get 6 DoF covariance factor for the predictions in the face crop space. const auto cov_tril = make_tangent_space_uncertainty_tril(face); - // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose. - const ukf_cv::SigmaPoints<6> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true); + // Compute so called sigma points. These represent the distribution from the covariance matrix in terms of + // sampling points. + const ukf_cv::SigmaPoints sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true); + // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose. + // The trick is to transform the sampling points and compute a covariance from them in the output space. // We have many of these sigma points. This is why that callback comes into play here. + // The transform to 3d world space is more than Face2WorldFunction because we also need to apply the sigma point (as + // a relative offset) to the pose in face crop space. const std::array pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world)); - const ukf_cv::SigmaPoints<6> deltas_sigmas = relative_to(previous_pose, pose_sigmas); + // Compute sigma points relative to the previous pose + const ukf_cv::SigmaPoints deltas_sigmas = relative_to(previous_pose, pose_sigmas); + // Compute the mean offset from the last pose and the spread due to the networks uncertainty output. const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas); - // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive at the final output. + // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive + // at the final output. const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params); QuatPose new_pose = apply_offset(previous_pose, scaled_offset); diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp index a8580a892..f53478af1 100644 --- a/tracker-neuralnet/model_adapters.cpp +++ b/tracker-neuralnet/model_adapters.cpp @@ -192,11 +192,15 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses if (session_.GetOutputCount() < 2) throw std::runtime_error("Invalid Model: must have at least two outputs"); - // WARNING UB .. but still ... - // If the model was saved without meta data, it seems the version field is uninitialized. - // In that case reading from it is UB. However, in practice we will just get some arbitrary number - // which is hopefully different from the numbers used by models where the version is set. - if (model_version_ != 2 && model_version_ != 3) + // WARNING: Messy model compatibility issues! + // When reading the initial model release, it did not have the version field set. + // Reading it here will result in some unspecified value. It's probably UB due to + // reading uninitialized memory. But there is little choice. + // Now, detection of this old version is messy ... we have to guess based on the + // number we get. Getting an uninitialized value matching a valid version is unlikely. + // But the real problem is that this line must be updated whenever we want to bump the + // version number!! + if (model_version_ <= 0 || model_version_ > 4) model_version_ = 1; const cv::Size input_image_shape = get_input_image_shape(session_); @@ -224,10 +228,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } }, { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more - { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }}, - { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, - { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, - //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_ .rows}} + { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}}, + { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}}, + { "pos_size_scales_tril", TensorSpec{ {1, 3, 3}, output_coord_scales_tril_.val, 9}} }; qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")"; @@ -236,17 +239,17 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses output_c_names_.resize(session_.GetOutputCount()); for (size_t i=0; isecond; + TensorSpec& t = my_tensor_spec_it->second; if (onnx_tensor_spec.GetShape() != t.shape || onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType::type) throw std::runtime_error("Invalid output tensor spec for "s + name); @@ -266,29 +269,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available || understood_outputs.at("rotaxis_std").available; has_uncertainty_ &= understood_outputs.at("pos_size_std").available || - understood_outputs.at("pos_size_scales").available; - //has_uncertainty_ &= understood_outputs.at("box_std").available; - has_eye_closed_detection_ = understood_outputs.at("eyes").available; - - // FIXME: Recurrent states - - // size_t num_regular_outputs = 2; - - // num_recurrent_states_ = session_.GetInputCount()-1; - // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) - // throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); - - // // Create tensors for recurrent state - // for (size_t i = 0; i < num_recurrent_states_; ++i) - // { - // const auto& input_info = session_.GetInputTypeInfo(1+i); - // const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); - // if (input_info.GetTensorTypeAndShapeInfo().GetShape() != - // output_info.GetTensorTypeAndShapeInfo().GetShape()) - // throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); - // input_val_.push_back(create_tensor(input_info, allocator_)); - // output_val_.push_back(create_tensor(output_info, allocator_)); - // } + understood_outputs.at("pos_size_scales").available || + understood_outputs.at("pos_size_scales_tril").available; + pos_scale_uncertainty_is_matrix_ = understood_outputs.at("pos_size_scales_tril").available; input_names_.resize(session_.GetInputCount()); input_c_names_.resize(session_.GetInputCount()); @@ -348,38 +331,32 @@ std::optional PoseEstimator::run( return {}; } - for (size_t i = 0; i PoseEstimator::run( assert(output_rotaxis_scales_tril_(0, 1) == 0); assert(output_rotaxis_scales_tril_(0, 2) == 0); assert(output_rotaxis_scales_tril_(1, 2) == 0); + assert(center_size_cov_tril(0, 1) == 0); + assert(center_size_cov_tril(0, 2) == 0); + assert(center_size_cov_tril(1, 2) == 0); cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_; @@ -407,29 +387,9 @@ std::optional PoseEstimator::run( 0.5f*patch_size*(output_box_[2]-output_box_[0]), 0.5f*patch_size*(output_box_[3]-output_box_[1]) }; - // const RoiCorners outbox = { - // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]}, - // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]} - // }; - // RoiCorners outbox_stddev = { - // 0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]}, - // 0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]} - // }; - - // Because the model is sensitive to closing eyes we increase the uncertainty - // a lot to make the subsequent filtering smooth the output more. This should suppress - // "twitching" when the user blinks. - if (has_eye_closed_detection_) - { - const float eye_open = std::min(output_eyes_[0], output_eyes_[1]); - const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open,4.f); - rotaxis_scales_tril *= increase_factor; - size_stddev *= increase_factor; - center_stddev *= increase_factor; - } return std::optional({ - rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev + rotation, rotaxis_scales_tril, outbox, center, size, center_size_cov_tril }); } diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h index 48f2fa2c7..c1aaa6de3 100644 --- a/tracker-neuralnet/model_adapters.h +++ b/tracker-neuralnet/model_adapters.h @@ -50,9 +50,8 @@ class PoseEstimator cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition cv::Rect2f box; cv::Point2f center; - cv::Point2f center_stddev; float size; - float size_stddev; + cv::Matx33f center_size_cov_tril; // Lower triangular factor of Cholesky decomposition }; PoseEstimator(Ort::MemoryInfo &allocator_info, @@ -84,16 +83,15 @@ class PoseEstimator cv::Vec output_quat_{}; // Quaternion output cv::Vec output_box_{}; // Bounding box output cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of LLT factorization of covariance of rotation vector as offset from output quaternion - cv::Vec output_eyes_{}; - cv::Vec output_coord_scales_{}; + cv::Matx33f output_coord_scales_tril_{}; // Lower triangular factor + cv::Vec3f output_coord_scales_std_{}; // Depending on the model, alternatively a 3d vector with standard deviations. std::vector output_val_; // Tensors to put the model outputs in. std::vector output_names_; // Refers to the names in the onnx model. std::vector output_c_names_; // Refers to the C names in the onnx model. // More bookkeeping - size_t num_recurrent_states_ = 0; double last_inference_time_ = 0; bool has_uncertainty_ = false; - bool has_eye_closed_detection_ = false; + bool pos_scale_uncertainty_is_matrix_ = false; };