Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions firmware/main/apps/app_avatar/app_avatar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <smooth_lvgl.hpp>
#include <stackchan/stackchan.h>
#include <apps/common/common.h>
#include <hal/board/hal_bridge.h>
#include <string_view>
#include <cstdint>
#include <memory>
Expand Down Expand Up @@ -89,6 +90,13 @@ void AppAvatar::onOpen()
avatar->getPanel()->onClick().connect([&]() { _screen_clicked_flag = true; });
GetStackChan().attachAvatar(std::move(avatar));

auto gaze_modifier = std::make_unique<ConversationGazeModifier>(ConversationGazeModifier::Mode::Listening);
gaze_modifier->setTargetProvider([](float& target_x, float& target_y) {
auto* camera = hal_bridge::board_get_camera();
return camera && camera->DetectVisualTarget(target_x, target_y);
});
_conversation_gaze_modifier_id = GetStackChan().addModifier(std::move(gaze_modifier));

/* ------------------------------- BLE events ------------------------------- */
GetHAL().onBleAvatarData.connect([&](const char* data) {
std::lock_guard<std::mutex> lock(_mutex);
Expand Down Expand Up @@ -271,6 +279,11 @@ void AppAvatar::onClose()
GetStackChan().resetAvatar();
_video_window.reset();

if (_conversation_gaze_modifier_id >= 0) {
GetStackChan().removeModifier(_conversation_gaze_modifier_id);
_conversation_gaze_modifier_id = -1;
}

GetHAL().onBleAvatarData.clear();
GetHAL().onBleMotionData.clear();

Expand Down
3 changes: 2 additions & 1 deletion firmware/main/apps/app_avatar/app_avatar.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class AppAvatar : public mooncake::AppAbility {
std::unique_ptr<view::VideoWindow> _video_window;

bool _screen_clicked_flag = false;
int _dance_modifier_id = -1;
int _dance_modifier_id = -1;
int _conversation_gaze_modifier_id = -1;

void check_auto_angle_sync_mode();
};
251 changes: 251 additions & 0 deletions firmware/main/hal/board/stackchan_camera.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
#include <unistd.h>
#include <errno.h>
#include <esp_heap_caps.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "esp_imgfx_color_convert.h"
Expand Down Expand Up @@ -58,6 +60,147 @@

#define TAG "StackChanCamera"

namespace {

// Coarse sampling grid for visual-target tracking: one probe pixel per cell.
constexpr int kTrackingGridWidth = 16;
constexpr int kTrackingGridHeight = 12;

/// One probed pixel. Depending on the source pixel format either the YUV or
/// the RGB channels (or both) are populated; the has_* flags record which
/// color spaces hold valid data. Defaults are black with neutral chroma.
struct VisualSample {
    uint8_t y = 0;     // luma
    uint8_t cb = 128;  // blue-difference chroma (128 = neutral)
    uint8_t cr = 128;  // red-difference chroma (128 = neutral)
    uint8_t r = 0;
    uint8_t g = 0;
    uint8_t b = 0;
    bool has_yuv = false;
    bool has_rgb = false;
};

/// Clamps an int into the [0, 255] range of an 8-bit color channel.
uint8_t clamp_u8(int value)
{
    return static_cast<uint8_t>(std::clamp(value, 0, 255));
}

/// Converts sample.y/cb/cr to RGB in place (integer approximation of the
/// BT.601 limited-range matrix) and marks the RGB channels valid.
void yuv_to_rgb(VisualSample& sample)
{
    const int c = static_cast<int>(sample.y) - 16;
    const int d = static_cast<int>(sample.cb) - 128;
    const int e = static_cast<int>(sample.cr) - 128;
    sample.r = clamp_u8((298 * c + 409 * e + 128) >> 8);
    sample.g = clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8);
    sample.b = clamp_u8((298 * c + 516 * d + 128) >> 8);
    sample.has_rgb = true;
}

/// Derives luma from sample.r/g/b using BT.601 weights (77/150/29 out of 256).
void rgb_to_luma(VisualSample& sample)
{
    sample.y = static_cast<uint8_t>((77 * sample.r + 150 * sample.g + 29 * sample.b) >> 8);
}

/// Reads the pixel at (x, y) from a raw capture buffer into `sample`.
///
/// @param data   start of the capture buffer (null is rejected)
/// @param len    number of valid bytes in the buffer (bounds-checked per format)
/// @param format V4L2 pixel format of the buffer
/// @param width  image width in pixels, must be > 0
/// @param height image height in pixels, must be > 0
/// @param x      pixel column, must lie inside the image
/// @param y      pixel row, must lie inside the image
/// @param sample output sample; populated only on success
/// @return true if the pixel was decoded, false on bad arguments,
///         unsupported format, or an out-of-bounds buffer access.
bool read_visual_sample(const uint8_t* data, size_t len, v4l2_pix_fmt_t format, int width, int height, int x, int y,
                        VisualSample& sample)
{
    if (!data || width <= 0 || height <= 0 || x < 0 || y < 0 || x >= width || y >= height) {
        return false;
    }

    switch (format) {
        // NOTE(review): V4L2_PIX_FMT_YUV422P is nominally a *planar* layout,
        // but it is decoded here with the packed YUYV path — confirm the
        // esp_video driver actually delivers packed Y0/Cb/Y1/Cr for it.
        case V4L2_PIX_FMT_YUV422P:
        case V4L2_PIX_FMT_YUYV: {
            // Two horizontally adjacent pixels share one Cb/Cr pair.
            const size_t pair_offset = (static_cast<size_t>(y) * width + (x & ~1)) * 2;
            if (pair_offset + 3 >= len) {
                return false;
            }
            sample.y = data[pair_offset + ((x & 1) ? 2 : 0)];
            sample.cb = data[pair_offset + 1];
            sample.cr = data[pair_offset + 3];
            sample.has_yuv = true;
            yuv_to_rgb(sample);
            return true;
        }
        case V4L2_PIX_FMT_RGB565:
        case V4L2_PIX_FMT_RGB565X: {
            const size_t offset = (static_cast<size_t>(y) * width + x) * 2;
            if (offset + 1 >= len) {
                return false;
            }
            uint16_t pixel = static_cast<uint16_t>(data[offset]) | (static_cast<uint16_t>(data[offset + 1]) << 8);
            if (format == V4L2_PIX_FMT_RGB565X) {
                // RGB565X is the byte-swapped variant of RGB565.
                pixel = __builtin_bswap16(pixel);
            }
            // Expand 5/6/5-bit channels to 8 bits (low bits left at zero).
            sample.r = static_cast<uint8_t>(((pixel >> 11) & 0x1F) << 3);
            sample.g = static_cast<uint8_t>(((pixel >> 5) & 0x3F) << 2);
            sample.b = static_cast<uint8_t>((pixel & 0x1F) << 3);
            sample.has_rgb = true;
            rgb_to_luma(sample);
            return true;
        }
        case V4L2_PIX_FMT_RGB24: {
            const size_t offset = (static_cast<size_t>(y) * width + x) * 3;
            if (offset + 2 >= len) {
                return false;
            }
            sample.r = data[offset];
            sample.g = data[offset + 1];
            sample.b = data[offset + 2];
            sample.has_rgb = true;
            rgb_to_luma(sample);
            return true;
        }
        case V4L2_PIX_FMT_GREY:
        case V4L2_PIX_FMT_YUV420: {
            // Only the (leading) luma plane is sampled; chroma is ignored.
            const size_t offset = static_cast<size_t>(y) * width + x;
            if (offset >= len) {
                return false;
            }
            sample.y = data[offset];
            return true;
        }
        default:
            return false;
    }
}

/// Heuristic skin-tone score for one sample: 16 for a YCbCr-space match,
/// 12 for an RGB-space match, 0 otherwise. The YCbCr test is preferred
/// (tighter); thresholds are conventional skin-detection ranges.
int skin_score(const VisualSample& sample)
{
    if (sample.has_yuv && sample.y > 45 && sample.cb >= 70 && sample.cb <= 140 && sample.cr >= 128 &&
        sample.cr <= 190 && sample.cr > sample.cb + 5) {
        return 16;
    }

    if (sample.has_rgb) {
        const int max_rgb =
            std::max({static_cast<int>(sample.r), static_cast<int>(sample.g), static_cast<int>(sample.b)});
        const int min_rgb =
            std::min({static_cast<int>(sample.r), static_cast<int>(sample.g), static_cast<int>(sample.b)});
        if (sample.r > 70 && sample.g > 35 && sample.b > 20 && sample.r > sample.g && sample.r > sample.b &&
            max_rgb - min_rgb > 15 && std::abs(static_cast<int>(sample.r) - static_cast<int>(sample.g)) > 8) {
            return 12;
        }
    }

    return 0;
}

/// Clamps a float into the normalized coordinate range [-1, 1].
float clamp_unit(float value)
{
    return std::clamp(value, -1.0f, 1.0f);
}

}  // namespace

#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
#warning \
"CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER or CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP is enabled, which may cause image corruption in YUV422 format!"
Expand Down Expand Up @@ -398,6 +541,8 @@ bool StackChanCamera::Capture()
encoder_thread_.join();
}

std::lock_guard<std::mutex> lock(capture_mutex_);

if (!streaming_on_ || video_fd_ < 0) {
return false;
}
Expand Down Expand Up @@ -857,6 +1002,8 @@ bool StackChanCamera::StreamCaptures()
encoder_thread_.join();
}

std::lock_guard<std::mutex> lock(capture_mutex_);

if (!streaming_on_ || video_fd_ < 0) {
return false;
}
Expand Down Expand Up @@ -967,6 +1114,110 @@ bool StackChanCamera::StreamCaptures()
return true;
}

/// Samples the most recent camera frame on a coarse grid and estimates where
/// a visually interesting target (skin-toned and/or moving region) lies.
///
/// On success, target_x/target_y receive the normalized target position in
/// [-1, 1]; the raw image y axis is negated before being reported. The
/// optional `confidence` receives a 0-100 score derived from the total
/// detection weight; it is reset to 0 on entry.
///
/// Serialized against Capture()/StreamCaptures() via capture_mutex_.
/// NOTE(review): VIDIOC_DQBUF is issued while the mutex is held and may block
/// until a frame is ready — confirm the driver's dequeue/timeout behavior.
///
/// @return true if a target was detected in this frame, false otherwise.
bool StackChanCamera::DetectVisualTarget(float& target_x, float& target_y, int* confidence)
{
    if (confidence) {
        *confidence = 0;
    }

    std::lock_guard<std::mutex> lock(capture_mutex_);

    if (!streaming_on_ || video_fd_ < 0) {
        return false;
    }

    // Dequeue the next filled capture buffer. It must be re-queued below no
    // matter how the analysis turns out — hence the do { } while (false)
    // construct so every early exit still reaches the VIDIOC_QBUF call.
    struct v4l2_buffer buf = {};
    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    buf.memory = V4L2_MEMORY_MMAP;
    if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
        ESP_LOGD(TAG, "VIDIOC_DQBUF failed during visual tracking: errno=%d(%s)", errno, strerror(errno));
        return false;
    }

    bool detected = false;
    do {
        // Sanity-check the dequeued buffer before touching its memory.
        if (buf.index >= mmap_buffers_.size() || !mmap_buffers_[buf.index].start || buf.bytesused == 0) {
            break;
        }

        // The raw buffer holds the sensor-oriented image; when rotation is
        // enabled, frame_ carries post-rotation dimensions, so address the
        // raw pixels with the sensor dimensions instead.
        int width = frame_.width;
        int height = frame_.height;
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
        width = sensor_width_;
        height = sensor_height_;
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
        if (width <= 0 || height <= 0) {
            break;
        }

        const auto* data = static_cast<const uint8_t*>(mmap_buffers_[buf.index].start);
        // Weighted-centroid accumulators over all grid cells whose combined
        // skin/motion weight passes the per-cell threshold below.
        uint32_t weighted_x = 0;
        uint32_t weighted_y = 0;
        uint32_t total_weight = 0;
        uint16_t hit_count = 0;

        for (int gy = 0; gy < kTrackingGridHeight; ++gy) {
            // Probe the center pixel of each grid cell.
            const int y = (height * (gy * 2 + 1)) / (kTrackingGridHeight * 2);
            for (int gx = 0; gx < kTrackingGridWidth; ++gx) {
                const int x = (width * (gx * 2 + 1)) / (kTrackingGridWidth * 2);

                VisualSample sample;
                if (!read_visual_sample(data, buf.bytesused, sensor_format_, width, height, x, y, sample)) {
                    continue;
                }

                const int grid_index = gy * kTrackingGridWidth + gx;
                int motion_score = 0;
                if (tracking_luma_initialized_) {
                    // Luma delta against the running per-cell average counts
                    // as motion once it exceeds a small noise threshold.
                    const int diff = std::abs(static_cast<int>(sample.y) - static_cast<int>(tracking_luma_[grid_index]));
                    if (diff > 16) {
                        motion_score = std::min(diff / 3, 20);
                    }
                    // Exponential moving average (3/4 old, 1/4 new) so slow
                    // lighting drift does not keep registering as motion.
                    tracking_luma_[grid_index] =
                        static_cast<uint8_t>((static_cast<int>(tracking_luma_[grid_index]) * 3 + sample.y) / 4);
                } else {
                    // First pass only seeds the baseline; no motion score yet.
                    tracking_luma_[grid_index] = sample.y;
                }

                // Skin evidence dominates (x3); strong motion alone can pass.
                const int weight = skin_score(sample) * 3 + motion_score;
                if (weight < 12) {
                    continue;
                }

                weighted_x += static_cast<uint32_t>(x * weight);
                weighted_y += static_cast<uint32_t>(y * weight);
                total_weight += weight;
                hit_count++;
            }
        }
        tracking_luma_initialized_ = true;

        // Require a minimum number of supporting cells and total weight so a
        // single noisy cell cannot trigger a detection.
        if (hit_count < 3 || total_weight < 60) {
            break;
        }

        // Weighted centroid in pixel coordinates, then normalized to [-1, 1]
        // around the image center.
        const float center_x = static_cast<float>(weighted_x) / static_cast<float>(total_weight);
        const float center_y = static_cast<float>(weighted_y) / static_cast<float>(total_weight);
        const float norm_x = (center_x - (static_cast<float>(width) - 1.0f) * 0.5f) /
                             ((static_cast<float>(width) - 1.0f) * 0.5f);
        const float norm_y = (center_y - (static_cast<float>(height) - 1.0f) * 0.5f) /
                             ((static_cast<float>(height) - 1.0f) * 0.5f);

        // Over-drive the normalized offsets (then clamp) so moderately
        // off-center targets already yield full deflection; y is negated.
        target_x = clamp_unit(norm_x * 2.40f);
        target_y = clamp_unit(-norm_y * 1.80f);
        if (confidence) {
            *confidence = std::min<int>(100, total_weight / 8);
        }
        detected = true;
    } while (false);

    // Always hand the buffer back to the driver so the stream keeps flowing.
    if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
        ESP_LOGE(TAG, "VIDIOC_QBUF failed after visual tracking");
    }

    return detected;
}

bool StackChanCamera::SetHMirror(bool enabled)
{
if (video_fd_ < 0) return false;
Expand Down
5 changes: 5 additions & 0 deletions firmware/main/hal/board/stackchan_camera.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <lvgl.h>
#include <thread>
#include <memory>
#include <mutex>
#include <vector>

#include <freertos/FreeRTOS.h>
Expand All @@ -29,6 +30,9 @@ class StackChanCamera : public Camera {
v4l2_pix_fmt_t format = 0;
} frame_;
v4l2_pix_fmt_t sensor_format_ = 0;
std::mutex capture_mutex_;
uint8_t tracking_luma_[16 * 12] = {};
bool tracking_luma_initialized_ = false;
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
uint16_t sensor_width_ = 0;
uint16_t sensor_height_ = 0;
Expand All @@ -51,6 +55,7 @@ class StackChanCamera : public Camera {
virtual void SetExplainUrl(const std::string& url, const std::string& token);
virtual bool Capture() override;
bool StreamCaptures();
bool DetectVisualTarget(float& target_x, float& target_y, int* confidence = nullptr);

// 翻转控制函数
virtual bool SetHMirror(bool enabled) override;
Expand Down
Loading