In [2]:
import colorsys
import math
import random
import sys
import warnings
from itertools import combinations
from pathlib import Path

import cv2  # OpenCV: video I/O and image processing
import matplotlib.pyplot as plt
import numpy as np
import torch  # PyTorch: runs the YOLOv5 model
from sentence_transformers import SentenceTransformer
from sklearn.manifold import MDS
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# ========= Load the YOLOv5 model =========
#   yolov5s.pt is downloaded automatically
#   Passing device='cpu' explicitly makes this a CPU-only run
yolo_model = torch.hub.load(
    './yolov5',          # path to the cloned YOLOv5 repository
    'custom',            # custom-model entry point (also works for yolov5s etc.)
    path='yolov5s.pt',   # pretrained weights (the default is fine)
    source='local',      # load from the local checkout
    device='cpu'         # force CPU
)

YOLOv5 🚀 v7.0-425-g85acef3a Python-3.12.3 torch-2.8.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [4]:
def get_object_sim_dict(obj_dict, anchor_degree=210.0):
    """Assign every class label an angle so that similar labels sit close together.

    Labels are embedded with the 'all-MiniLM-L6-v2' SentenceTransformer (on CPU),
    pairwise cosine distances are projected to 2-D with MDS, and the polar angle
    of each projected point (around the centroid) becomes the label's angle.
    The whole layout is rotated so the first label lands on ``anchor_degree``.

    Args:
        obj_dict: Mapping of class index -> label string. Keys are passed
            through ``int()``.
        anchor_degree: Angle in degrees given to the first label in
            ``obj_dict``. Defaults to 210, the value previously hard-coded.

    Returns:
        dict: ``{int(class index): {'label': str, 'degree': float}}`` with
        ``degree`` in ``[0, 360)``. Empty when ``obj_dict`` is empty.
    """
    keys = [int(k) for k in obj_dict]
    labels_list = list(obj_dict.values())
    n = len(labels_list)

    # Nothing detected: there is nothing to embed (and angles_rad[0] would fail).
    if n == 0:
        return {}
    # A single label always ends up on the anchor angle, so skip the
    # embedding model and MDS entirely.
    if n == 1:
        return {keys[0]: {'label': labels_list[0],
                          'degree': float(anchor_degree) % 360.0}}

    # ================= Embedding =================
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    embeddings = np.asarray(
        model.encode(labels_list, convert_to_numpy=True), dtype=np.float64
    )

    # ================= Similarity -> distance =================
    # Cosine similarity for all pairs at once: normalise rows, then E @ E.T.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid dividing by zero for a degenerate embedding
    unit = embeddings / norms
    sim_matrix = np.clip(unit @ unit.T, -1.0, 1.0)

    # distance = 1 - similarity; remove floating-point noise so the matrix is
    # a clean dissimilarity matrix (symmetric, zero diagonal, non-negative).
    dist_matrix = 1.0 - sim_matrix
    dist_matrix = (dist_matrix + dist_matrix.T) / 2.0
    np.fill_diagonal(dist_matrix, 0.0)

    # ================= Embed into 2-D with MDS =================
    mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
    coords = mds.fit_transform(dist_matrix)  # shape = (n_labels, 2)

    # ================= Coordinates -> angles =================
    # Measure angles around the centroid of the projected points.
    coords_centered = coords - coords.mean(axis=0)
    angles_rad = np.arctan2(coords_centered[:, 1], coords_centered[:, 0])

    # Rotate the layout so the first label sits on the anchor angle.
    rotation = np.radians(anchor_degree) - angles_rad[0]
    angles_deg_rotated = (np.degrees(angles_rad + rotation) + 360) % 360

    return {
        key: {'label': label, 'degree': float(angle)}
        for key, label, angle in zip(keys, labels_list, angles_deg_rotated)
    }


In [5]:
def draw_class_sim(class_names):
    """Plot each class as a coloured dot on a unit circle at its assigned angle.

    Args:
        class_names: Mapping whose values are dicts with a 'label' string and
            a 'degree' angle in degrees. The angle sets both the dot's
            position on the circle and its hue.
    """
    radius = 1.0
    entries = list(class_names.values())

    # Position on the circle and fully saturated colour for every entry.
    placements = []
    for entry in entries:
        angle = np.deg2rad(entry['degree'])
        position = (radius * np.cos(angle), radius * np.sin(angle))
        colour = colorsys.hsv_to_rgb(entry['degree'] / 360.0, 1.0, 1.0)
        placements.append((position, colour))

    # Square canvas without axes.
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_aspect('equal')
    ax.axis('off')

    # Dashed reference circle.
    ax.add_artist(
        plt.Circle((0, 0), radius, color='lightgray', fill=False, linestyle='--')
    )

    # Dot on the circle, label slightly outside it.
    for entry, ((px, py), colour) in zip(entries, placements):
        ax.plot(px, py, 'o', color=colour, markersize=12)
        ax.text(px * 1.15, py * 1.15, entry['label'], ha='center', va='center')

    plt.show()

In [6]:
# Helpers for grouping points into nearby triplets
def distance(p1, p2):
    """Return the Euclidean distance between two points (only x and y are used)."""
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return math.sqrt(dx**2 + dy**2)

def total_distance(triplet):
    """Return the sum of the three pairwise distances between the triplet's points."""
    first, second, third = triplet[0], triplet[1], triplet[2]
    side_ab = distance(first, second)
    side_ac = distance(first, third)
    side_bc = distance(second, third)
    return side_ab + side_ac + side_bc

def find_nearest_triplets(points):
    """Greedily group points into triplets with the smallest pairwise-distance sum.

    Repeatedly picks, among all 3-point combinations of the remaining points,
    the one with the lowest ``total_distance`` and removes those points.
    Stops when fewer than three points remain, so one or two points may be
    left ungrouped. The input list is not modified.

    Returns:
        list: Tuples of three points each, in the order they were selected.
    """
    remaining = points.copy()
    groups = []

    while len(remaining) >= 3:
        best_total = float('inf')
        best_indices = None

        # Exhaustively score every 3-point combination of what is left.
        for i, j, k in combinations(range(len(remaining)), 3):
            total = total_distance([remaining[i], remaining[j], remaining[k]])
            if total < best_total:
                best_total, best_indices = total, (i, j, k)

        groups.append(tuple(remaining[idx] for idx in best_indices))

        # Indices are ascending; delete from the back so earlier ones stay valid.
        for idx in reversed(best_indices):
            del remaining[idx]

    return groups

- 可視化表現の設計

- 使えそうな入力変数
    - xc, yc -> 物体の中心
    - w -> 幅
    - h -> 高さ
    - score -> 信頼度
    - label -> 識別名
    - cls -> labelのインデックス


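- 視覚変数
    - 位置: xc, yc
    - 色相: cls（ラベル同士の意味的な類似度から求めた角度を色相に割り当てる）
    - 明度: score（信頼度が高いほど明るくする）
    - 彩度: フレームの新しさ（過去フレームの検出ほど彩度を下げる）
    - 透明度: 固定値（alpha = 0.5）
    - 大きさ: w と h の小さい方に比例した円の半径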

OpenCVで使える描画メソッド

| 図形       | メソッド                                                                            | 主な引数                                                                          | 簡単な使い方                                                                                                     |
| -------- | ------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
| 線        | `cv2.line(img, pt1, pt2, color, thickness)`                                     | `pt1=(x1,y1)`, `pt2=(x2,y2)`, `color=(B,G,R)`, `thickness=線の太さ`               | `cv2.line(img, (50,50), (200,50), (0,255,0), 3)`                                                           |
| 矩形（四角形）  | `cv2.rectangle(img, pt1, pt2, color, thickness)`                                | `pt1=(左上x,左上y)`, `pt2=(右下x,右下y)`, `thickness=-1で塗りつぶし`                        | `cv2.rectangle(img, (50,50), (200,150), (255,0,0), 2)`                                                     |
| 円        | `cv2.circle(img, center, radius, color, thickness)`                             | `center=(x,y)`, `radius=半径`, `thickness=-1で塗りつぶし`                             | `cv2.circle(img, (150,150), 40, (0,0,255), -1)`                                                            |
| 楕円       | `cv2.ellipse(img, center, axes, angle, startAngle, endAngle, color, thickness)` | `center=(x,y)`, `axes=(長径/2, 短径/2)`, `angle=回転角度`, `startAngle/endAngle=描く範囲` | `cv2.ellipse(img, (200,200), (80,40), 30, 0, 360, (255,255,0), 2)`                                         |
| 多角形      | `cv2.polylines(img, [pts], isClosed, color, thickness)`                         | `pts = numpy.array([[x1,y1],[x2,y2],...], np.int32)`                          | `pts = np.array([[100,50],[200,150],[50,150]], np.int32); cv2.polylines(img, [pts], True, (0,255,255), 2)` |
| 塗りつぶし多角形 | `cv2.fillPoly(img, [pts], color)`                                               | `pts` は上と同じ                                                                   | `cv2.fillPoly(img, [pts], (0,128,255))`                                                                    |
| 文字       | `cv2.putText(img, text, org, font, fontScale, color, thickness)`                | `org=(x,y)`, `font=cv2.FONT_HERSHEY_SIMPLEX` など                               | `cv2.putText(img, "Hello", (50,250), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2)`                       |


In [None]:
def get_object_name_dict(cap, max_frames=None):
    """Scan a video from its current position and collect every detected class.

    Frames are read until ``cap.read()`` fails, the same termination rule the
    rendering loop uses. The frame count reported by the container is used
    only as the progress-bar total: it can be unavailable (0 or negative) or
    differ from the number of frames actually decoded, and any class missed
    here would later be absent from the class-to-angle table.

    Args:
        cap: An opened ``cv2.VideoCapture``.
        max_frames: Optional upper bound on the number of frames to scan.
            ``None`` (default) scans to the end of the stream; pass a number
            for sources that never end, such as a live camera.

    Returns:
        dict: ``{class index (int): class name}`` in first-seen order, looked
        up in ``yolo_model.names``.
    """
    detected = {}

    reported_total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    progress_total = reported_total if reported_total > 0 else None
    if max_frames is not None:
        progress_total = (
            max_frames if progress_total is None else min(progress_total, max_frames)
        )

    frames_read = 0
    with tqdm(total=progress_total) as progress:
        while max_frames is None or frames_read < max_frames:
            ret, frame = cap.read()
            if not ret:
                break  # end of the video
            frames_read += 1

            # OpenCV delivers BGR; reverse the channel axis to hand YOLOv5 RGB.
            results = yolo_model(frame[..., ::-1])
            # Each row ends with the class index; the other columns are not needed here.
            for *_, cls in results.xywh[0].tolist():
                class_id = int(cls)
                detected.setdefault(class_id, yolo_model.names[class_id])

            progress.update(1)

    return detected

def hsv2bgr(h, s, v):
    """Convert one HSV colour (OpenCV 8-bit convention) to a BGR tuple of ints.

    Args:
        h: Hue on OpenCV's 8-bit scale, i.e. degrees / 2 (0-179). Values
            outside that range are wrapped, since hue is circular
            (180 is the same colour as 0).
        s: Saturation, 0-255. Clamped to that range.
        v: Value/brightness, 0-255. Clamped to that range.

    Fractional inputs are truncated toward zero, as the previous direct
    uint8 cast did for in-range values.

    Returns:
        tuple: ``(b, g, r)`` as Python ints, usable as an OpenCV colour.
    """
    hue = int(h) % 180
    sat = min(max(int(s), 0), 255)
    val = min(max(int(v), 0), 255)

    # cvtColor works on images, so wrap the colour in a 1x1 pixel image.
    hsv_pixel = np.array([[[hue, sat, val]]], dtype=np.uint8)
    bgr_pixel = cv2.cvtColor(hsv_pixel, cv2.COLOR_HSV2BGR)[0][0]
    return tuple(int(c) for c in bgr_pixel)

def draw_info_on_mov(result_hist, frame, crnt_frame_num, class_names):
    """Draw the detection trail, labels and linking triangles onto a frame.

    For every detection in ``result_hist`` a filled circle is drawn on a
    separate layer (blended in at 50 %) and a circle outline directly on
    ``frame``. Detections of the current frame additionally get a vertical
    label with their score, and are grouped into nearest triplets that are
    connected by white triangles.

    Args:
        result_hist: Iterable of ``{'results': <YOLOv5 results>, 'frame_num': int}``
            entries, oldest first. May be empty.
        frame: BGR image to draw on. It is modified in place.
        crnt_frame_num: Frame number of ``frame``; entries whose ``frame_num``
            equals it are treated as the current detections.
        class_names: ``{class index: {'label': str, 'degree': float}}``.

    Returns:
        The blended image (``frame`` plus the semi-transparent circle layer).
    """
    alpha = 0.5
    white = (255, 255, 255)
    height, width = frame.shape[:2]
    transparent_layer = np.zeros((height, width, 3), dtype=np.uint8)

    for result_dict in result_hist:
        results = result_dict['results']
        frame_num = result_dict['frame_num']
        is_current = frame_num == crnt_frame_num
        pos_list = []

        # Older entries shrink and desaturate. The factor is constant per
        # history entry, so it is computed once here.
        # NOTE(review): the factor uses the ratio of absolute frame numbers,
        # so the fade is strong early in the video and barely visible late in
        # it; confirm whether a fade relative to the history window was meant.
        damping_coef = 1.0 if is_current else (frame_num / crnt_frame_num) ** 3

        for *box, conf, cls in results.xywh[0].tolist():
            # Centre, size, label and confidence of this detection
            cx, cy, w, h = map(int, box)
            class_info = class_names[int(cls)]
            label = class_info['label']
            score = float(conf)

            r = int(min(w, h) * 0.3 * damping_coef)
            hue = class_info['degree'] / 2   # OpenCV 8-bit hue is degrees / 2
            saturation = 255 * 0.9 * damping_coef
            brightness = (255 - 100) * score + 100

            # Filled disc on the blend layer, outline on the frame itself
            color = hsv2bgr(hue, saturation, brightness)
            cv2.circle(transparent_layer, (cx, cy), r, color, -1)
            cv2.circle(frame, (cx, cy), r, color, 1)

            if is_current:
                # Vertical text: one character per line, score at the bottom.
                text_lines = list(label.capitalize()) + [f'{score:.2f}']
                x0 = int(cx - r)
                y0 = int(cy - h / 2)
                dy = 30
                # Draw bottom-up, starting at the top edge of the bounding box.
                for i, line in enumerate(reversed(text_lines)):
                    y = y0 - i * dy
                    # Single characters are nudged right, presumably to centre
                    # them over the wider score line.
                    x = x0 + 20 if len(line) == 1 else x0
                    cv2.putText(frame, line, (x, y), cv2.FONT_HERSHEY_TRIPLEX, 1, white, 1)

                # Remember the circle for the triangle pass below.
                pos_list.append((cx, cy, r))

        if is_current:
            # Connect the current detections in groups of three nearest circles.
            for triplet in find_nearest_triplets(pos_list):
                vertices = []
                for (cx, cy, r) in triplet:
                    # Random point on the circle's rim, joined to its centre.
                    theta = random.uniform(0, 2 * math.pi)
                    x = int(cx + r * math.cos(theta))
                    y = int(cy + r * math.sin(theta))
                    cv2.line(frame, (cx, cy), (x, y), white, 1)
                    vertices.append((x, y))
                cv2.line(frame, vertices[0], vertices[1], white, 1)
                cv2.line(frame, vertices[1], vertices[2], white, 1)
                cv2.line(frame, vertices[2], vertices[0], white, 1)

    # Blend once, after all history entries are drawn (also defined when
    # result_hist is empty).
    return cv2.addWeighted(frame, 1.0, transparent_layer, alpha, 0)

In [28]:
# ========= Open the input video =========
video_path = 'data/shinjuku.mp4'  # e.g. 'input.mp4'; use 0 for a camera
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Raise instead of sys.exit(): inside a notebook an exception stops the
    # run with a normal traceback and does not try to exit the kernel.
    raise RuntimeError("動画が開けません。ファイルパスを確認してください。")
print(f'動画読み込み完了:{video_path}')

# ========= Configure the output video =========
# Output settings: MP4 container, source frame rate, source resolution
out_path = "out/output.mp4"
# VideoWriter does not create missing directories; without this the writer
# fails to open and every out.write() call is silently dropped.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    fps = 30.0  # the source did not report a usable frame rate
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
if not out.isOpened():
    cap.release()
    raise RuntimeError(f"出力動画を開けません。出力先とコーデックを確認してください: {out_path}")
print(f'動画書き出し準備完了:{out_path}')

動画読み込み完了:data/shinjuku.mp4
動画書き出し準備完了:out/output.mp4


In [9]:
# ========= Pre-scan: collect the object classes in the video =========
# Detects every class that appears, maps the classes to angles by label
# similarity, and plots the resulting layout.
print('識別物体の事前検出と類似度計算')
cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # rewind to the first frame
obj_dict = get_object_name_dict(cap)
class_names = get_object_sim_dict(obj_dict)
draw_class_sim(class_names)

識別物体の事前検出と類似度計算


  0%|          | 0/425 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [29]:

# ========= Render the annotated video =========
# NOTE: this cell releases `cap` and `out` when it finishes, so the cell that
# opens them must be re-run before running this one again.
HISTORY_LEN = 30  # number of most recent frames whose detections are drawn
cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # rewind to the first frame
result_hist = []
try:
    while True:
        # ========= Grab a frame =========
        ret, frame = cap.read()
        if not ret:
            break  # end of the video
        frame_num = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
        crnt_frame_num = frame_num

        # ========= Inference (object detection) =========
        # OpenCV delivers BGR; YOLOv5 expects RGB.
        results = yolo_model(frame[..., ::-1])
        result_hist.append({'results': results, 'frame_num': frame_num})
        # Only the last HISTORY_LEN entries are ever drawn; drop older ones so
        # memory does not grow with the length of the video.
        del result_hist[:-HISTORY_LEN]

        # ========= Prepare the background =========
        # Grey out the video (kept 3-channel) so the coloured marks stand out.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        gray_frame = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

        # ========= Add the visual encoding =========
        display_frame = draw_info_on_mov(result_hist, gray_frame, crnt_frame_num, class_names)

        # ========= Show in a window =========
        cv2.imshow("YOLOv5 Detection", display_frame)

        # ========= Write the frame =========
        out.write(display_frame)

        # ========= Press q to stop early =========
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # ========= Clean-up =========
    # Runs on errors and interrupts too, so the output file is always
    # finalised and the window closed.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

In [11]:
# class_names = get_object_sim_dict(obj_dict)
# draw_class_sim(class_names)

In [12]:
# m = math.exp(1)
# n = math.log(0.1)
# for a in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
#     print(a, math.exp(a) / m, 1 - math.log(a) / n )