# Face Recognition dengan YOLOv8 + ViT

Notebook ini menggunakan:
- **YOLOv8-face** untuk deteksi wajah (lebih cepat dari MTCNN)
- **ViT (Vision Transformer)** untuk klasifikasi wajah

## Perbandingan Speed
| Detector | Speed | Accuracy |
|----------|-------|----------|
| MTCNN | ~2-3 FPS | Tinggi |
| YOLOv8-face | ~15-30 FPS | Tinggi |

## 1. Install Dependencies

In [None]:
!pip install ultralytics torch torchvision transformers opencv-python pillow huggingface_hub

## 2. Download YOLOv8-face Model

In [None]:
from huggingface_hub import hf_hub_download
import os

# Fetch the YOLOv8-face weights from Hugging Face only when the renamed file
# is not already present, so re-running this cell does not call the Hub again
# and the reported path is always the one the rest of the notebook loads.
if os.path.exists("yolov8-face.pt"):
    model_path = "yolov8-face.pt"
    print("Model ready:", model_path)
else:
    model_path = hf_hub_download(
        repo_id="arnabdhar/YOLOv8-Face-Detection",
        filename="model.pt",
        local_dir=".",
    )

    # Rename for consistency; use the path returned by the download call
    # rather than assuming where the file landed.
    os.rename(model_path, "yolov8-face.pt")
    model_path = "yolov8-face.pt"
    print("Model saved as: yolov8-face.pt")

## 3. Import Libraries

In [None]:
import torch
import numpy as np
import cv2
from PIL import Image
from ultralytics import YOLO
from torchvision import transforms
from transformers import ViTForImageClassification
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import time

# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

## 4. Configuration

In [None]:
# Class names - must match the label order of the training dataset.
CLASS_NAMES = [
    "akbar",
    "aprilianza",
    "bian",
    "fadhilah",
    "falah",
    "iksan",
    "imelda",
    "rifqy",
    "yolanda",
]

# Role assigned to each known person; used for the authorization label.
ROLE_MAPPING = dict(
    iksan="Aslab",
    akbar="Aslab",
    aprilianza="Aslab",
    bian="Dosen",
    fadhilah="Aslab",
    falah="Aslab",
    imelda="Aslab",
    rifqy="Aslab",
    yolanda="Aslab",
)

# Thresholds
# Minimum classifier confidence for a prediction to be accepted.
CONFIDENCE_THRESHOLD = 0.5
# Minimum detector confidence for a face box to be kept.
FACE_DETECTION_THRESHOLD = 0.35

# Model paths
# YOLOv8-face detector weights.
YOLO_MODEL_PATH = "yolov8-face.pt"
# ViT classifier weights.
VIT_MODEL_PATH = "../best_vit_mtcnn.pth"

print(f"Classes: {CLASS_NAMES}")
print(f"Number of classes: {len(CLASS_NAMES)}")

## 5. Load Models

In [None]:
# Instantiate the YOLOv8-face detector from the configured weights file.
print("Loading YOLOv8-face...")
yolo = YOLO(YOLO_MODEL_PATH)
print("YOLOv8-face loaded!")

# Run one throwaway prediction on a blank 320x320 frame to warm the model up.
_ = yolo.predict(np.zeros(shape=(320, 320, 3), dtype=np.uint8), verbose=False)
print("YOLOv8 warmed up!")

In [None]:
# Build the ViT classifier with one output label per entry in CLASS_NAMES.
print("Loading ViT classifier...")
vit_model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    ignore_mismatched_sizes=True,
    num_labels=len(CLASS_NAMES),
)

# Overlay the trained weights when the checkpoint file exists; otherwise only
# warn and continue with the model as constructed above.
if not os.path.exists(VIT_MODEL_PATH):
    print(f"WARNING: ViT weights not found at {VIT_MODEL_PATH}")
else:
    vit_model.load_state_dict(torch.load(VIT_MODEL_PATH, map_location=device))
    print(f"ViT weights loaded from {VIT_MODEL_PATH}")

# Move to the selected device and switch to inference mode.
vit_model.to(device)
vit_model.eval()
print("ViT classifier ready!")

In [None]:
# Preprocessing for ViT input: resize to 224x224, convert to a tensor, then
# normalize every channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

## 6. Helper Functions

In [None]:
def get_full_label(name):
    """Build the display label, role and authorization flag for a name.

    The lookup in ROLE_MAPPING is done on the lowercased name.

    Args:
        name: Class name to look up.

    Returns:
        A tuple ``(label, role, authorized)``. A name found in ROLE_MAPPING
        yields ``("<Capitalized name> (<role>)", role, True)``; any other
        name yields ``("<name> (Guest)", "Guest", False)``.
    """
    key = name.lower()

    # Guard clause: names without a role entry are treated as guests.
    if key not in ROLE_MAPPING:
        return f"{name} (Guest)", "Guest", False

    role = ROLE_MAPPING[key]
    return f"{name.capitalize()} ({role})", role, True


def detect_faces_yolo(image):
    """Detect faces in an image using the YOLOv8-face model.

    The input is handed to ``yolo.predict`` unchanged. Ultralytics documents
    PIL images as RGB and NumPy arrays as BGR, so a PIL image must not be
    turned into a NumPy array first: the resulting RGB array would be read
    as BGR and the detector would see swapped colour channels.

    Args:
        image: A ``PIL.Image.Image`` (RGB) or a NumPy array (interpreted by
            Ultralytics as BGR).

    Returns:
        A list of ``(x1, y1, x2, y2, conf)`` tuples, one per detected face,
        with integer pixel coordinates and a float detection confidence.
    """
    results = yolo.predict(
        image,
        verbose=False,
        conf=FACE_DETECTION_THRESHOLD,
        imgsz=480,
    )

    faces = []
    for r in results:
        for box in r.boxes:
            # Truncate the box corners to integer pixel coordinates.
            x1, y1, x2, y2 = (int(v) for v in box.xyxy[0].tolist())
            faces.append((x1, y1, x2, y2, float(box.conf[0])))

    return faces


def classify_face(face_image):
    """Classify a cropped face with the ViT model.

    Args:
        face_image: A PIL image or a NumPy array accepted by
            ``Image.fromarray``.

    Returns:
        A tuple ``(predicted_index, confidence)`` where ``predicted_index``
        is the argmax class index and ``confidence`` is its softmax
        probability, both as plain Python numbers.
    """
    if isinstance(face_image, np.ndarray):
        face_image = Image.fromarray(face_image)

    # The transform normalizes exactly three channels, so RGBA, grayscale or
    # palette images have to be converted to RGB before they reach it.
    if face_image.mode != "RGB":
        face_image = face_image.convert("RGB")

    # Add the batch dimension and move the tensor to the model's device.
    face_tensor = transform(face_image).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = vit_model(face_tensor).logits
        probs = torch.softmax(logits, dim=1)
        confidence, predicted = torch.max(probs, 1)

    return predicted.item(), confidence.item()


def process_image(image):
    """Detect every face in an image and classify each one.

    Args:
        image: Either a NumPy array or a PIL image. A 3-channel NumPy array
            is converted with ``cv2.COLOR_BGR2RGB`` (i.e. it is treated as
            OpenCV BGR data) before being wrapped in a PIL image; any other
            array is wrapped as-is. A non-array input is used unchanged.

    Returns:
        A list with one dict per detected face, holding the keys ``name``,
        ``full_label``, ``role``, ``authorized``, ``confidence``, ``bbox``
        (the padded, clamped crop box) and ``detection_score``.
    """
    pil_image = image
    if isinstance(image, np.ndarray):
        three_channel = len(image.shape) == 3 and image.shape[2] == 3
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) if three_channel else image
        pil_image = Image.fromarray(rgb)

    detections = detect_faces_yolo(pil_image)
    img_w, img_h = pil_image.size

    results = []
    for left, top, right, bottom, det_conf in detections:
        # Grow the box by 15% of its longer side, clamped to the image bounds.
        pad = int(max(right - left, bottom - top) * 0.15)
        bbox = (
            max(0, left - pad),
            max(0, top - pad),
            min(img_w, right + pad),
            min(img_h, bottom + pad),
        )

        predicted_idx, confidence = classify_face(pil_image.crop(bbox))

        # Predictions below the confidence threshold are reported as an
        # unknown, unauthorized guest.
        if confidence >= CONFIDENCE_THRESHOLD:
            name = CLASS_NAMES[predicted_idx]
            full_label, role, authorized = get_full_label(name)
        else:
            name, full_label = "Unknown", "Unknown (Guest)"
            role, authorized = "Guest", False

        results.append({
            "name": name,
            "full_label": full_label,
            "role": role,
            "authorized": authorized,
            "confidence": confidence,
            "bbox": bbox,
            "detection_score": det_conf,
        })

    return results

## 7. Test on Image

In [None]:
def visualize_detection(image_path):
    """Run detection + classification on an image file and plot the result.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.

    Returns:
        The list of result dicts returned by ``process_image``, or ``None``
        when the image could not be loaded.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"Could not load image: {image_path}")
        return

    # RGB copy used only for drawing and for display with matplotlib.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # process_image converts 3-channel arrays from BGR to RGB itself, so it
    # must be given the original BGR image. Passing img_rgb would swap the
    # channels a second time and feed wrong colours to the models.
    start_time = time.time()
    results = process_image(img)
    elapsed = time.time() - start_time

    # Draw results (colours are RGB here: green = authorized, red = not).
    for r in results:
        x1, y1, x2, y2 = r['bbox']
        color = (0, 255, 0) if r['authorized'] else (255, 0, 0)

        cv2.rectangle(img_rgb, (x1, y1), (x2, y2), color, 2)

        label = f"{r['full_label']} ({r['confidence']*100:.1f}%)"
        cv2.putText(img_rgb, label, (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    # Avoid ZeroDivisionError when the timer reports no elapsed time.
    fps = 1 / elapsed if elapsed > 0 else float("inf")

    # Display
    plt.figure(figsize=(12, 8))
    plt.imshow(img_rgb)
    plt.title(f"Detected {len(results)} face(s) in {elapsed*1000:.1f}ms ({fps:.1f} FPS)")
    plt.axis('off')
    plt.show()

    return results

# Test dengan gambar (ganti path sesuai kebutuhan)
# results = visualize_detection("test_image.jpg")

## 8. Real-time Webcam Detection

In [None]:
def run_webcam_detection(duration=30):
    """
    Run real-time face detection and recognition on the webcam.

    Frames are read from camera index 0, annotated with the recognition
    results and shown in an OpenCV window until 'q' is pressed, the camera
    stops delivering frames, or ``duration`` seconds have passed. Timing
    statistics are printed when the loop ends.

    Args:
        duration: How long to run (seconds). Set to 0 for infinite.
    """
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam")
        return

    # Request a 1280x720 capture resolution.
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

    print("Webcam started. Press 'q' to quit.")
    print("=" * 50)

    frame_count = 0
    start_time = time.time()
    fps_list = []

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_start = time.time()

            # process_image converts 3-channel arrays from BGR to RGB
            # itself, so it must receive the raw BGR frame. Converting here
            # as well would swap the channels twice.
            results = process_image(frame)

            # Draw results on the BGR frame (green = authorized, red = not).
            for r in results:
                x1, y1, x2, y2 = r['bbox']
                color = (0, 255, 0) if r['authorized'] else (0, 0, 255)

                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

                label = f"{r['full_label']} ({r['confidence']*100:.1f}%)"

                # Filled background strip so the white text stays readable.
                (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
                cv2.rectangle(frame, (x1, y1-25), (x1+tw+10, y1), color, -1)
                cv2.putText(frame, label, (x1+5, y1-8), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 2)

            # Per-frame processing FPS, smoothed over the last 30 frames.
            frame_time = time.time() - frame_start
            fps = 1 / frame_time if frame_time > 0 else 0
            fps_list.append(fps)
            avg_fps = np.mean(fps_list[-30:])

            # Draw FPS and face count overlays.
            cv2.putText(frame, f"FPS: {avg_fps:.1f}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(frame, f"Faces: {len(results)}", (10, 60),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Display
            cv2.imshow('YOLOv8 + ViT Face Recognition', frame)

            frame_count += 1

            # Check for quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            # Check duration
            if duration > 0 and (time.time() - start_time) > duration:
                break

    finally:
        cap.release()
        cv2.destroyAllWindows()

        # Print stats
        total_time = time.time() - start_time
        print("\n" + "=" * 50)
        print(f"Total frames: {frame_count}")
        print(f"Total time: {total_time:.1f}s")

        # min()/max() raise ValueError on an empty list, and raising inside
        # finally would also mask any exception from the loop, so only
        # report FPS figures when at least one frame was processed.
        if fps_list and total_time > 0:
            print(f"Average FPS: {frame_count/total_time:.1f}")
            print(f"Min FPS: {min(fps_list):.1f}")
            print(f"Max FPS: {max(fps_list):.1f}")
        else:
            print("No frames were processed.")

In [None]:
# Run webcam detection for 30 seconds (duration=0 would run until stopped).
# Press 'q' to quit early.
run_webcam_detection(duration=30)

## 9. Benchmark: YOLOv8-face Detection + ViT Classification

In [None]:
def benchmark_detection(num_frames=100):
    """Benchmark YOLOv8-face detection and ViT classification on the webcam.

    For each captured frame the detection call is timed; when at least one
    face is found, classification of the first face is timed as well.
    Average/min/max timings are printed at the end.

    Args:
        num_frames: Maximum number of webcam frames to measure.
    """
    cap = cv2.VideoCapture(0)

    if not cap.isOpened():
        print("Error: Could not open webcam")
        return

    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

    print(f"Benchmarking {num_frames} frames...")

    detection_times = []
    classification_times = []
    total_times = []

    # try/finally so the camera is released even if a model call raises.
    try:
        for i in range(num_frames):
            ret, frame = cap.read()
            if not ret:
                break

            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # Time detection
            t1 = time.time()
            faces = detect_faces_yolo(pil_image)
            t2 = time.time()
            detection_times.append(t2 - t1)

            # Time classification of the first face (if any were found).
            if faces:
                x1, y1, x2, y2, _ = faces[0]
                face = pil_image.crop((x1, y1, x2, y2))
                t3 = time.time()
                classify_face(face)
                t4 = time.time()
                classification_times.append(t4 - t3)

            total_times.append(time.time() - t1)

            if (i + 1) % 20 == 0:
                print(f"  Processed {i+1}/{num_frames} frames...")
    finally:
        cap.release()

    # np.min/np.max raise ValueError on empty input, so stop here when the
    # camera delivered no frames (or num_frames was not positive).
    if not detection_times:
        print("No frames captured; nothing to report.")
        return

    # Print results
    print("\n" + "=" * 50)
    print("BENCHMARK RESULTS")
    print("=" * 50)
    print(f"\nYOLOv8-face Detection:")
    print(f"  Average: {np.mean(detection_times)*1000:.1f}ms")
    print(f"  Min: {np.min(detection_times)*1000:.1f}ms")
    print(f"  Max: {np.max(detection_times)*1000:.1f}ms")

    if classification_times:
        print(f"\nViT Classification:")
        print(f"  Average: {np.mean(classification_times)*1000:.1f}ms")
        print(f"  Min: {np.min(classification_times)*1000:.1f}ms")
        print(f"  Max: {np.max(classification_times)*1000:.1f}ms")

    mean_total = np.mean(total_times)
    print(f"\nTotal Pipeline:")
    print(f"  Average: {mean_total*1000:.1f}ms")
    # Guard the division in case the timer resolution reports zero elapsed.
    if mean_total > 0:
        print(f"  Estimated FPS: {1/mean_total:.1f}")

# Run benchmark
# benchmark_detection(100)

## 10. Save Configuration

In [None]:
import json

# Collect the pipeline settings into one JSON-serializable mapping.
config = dict(
    detector="YOLOv8-face",
    classifier="ViT-base-patch16-224",
    class_names=CLASS_NAMES,
    role_mapping=ROLE_MAPPING,
    confidence_threshold=CONFIDENCE_THRESHOLD,
    face_detection_threshold=FACE_DETECTION_THRESHOLD,
    yolo_model_path=YOLO_MODEL_PATH,
    vit_model_path=VIT_MODEL_PATH,
)

# Serialize with a 2-space indent and write the text to disk.
with open("yolov8_config.json", "w") as f:
    f.write(json.dumps(config, indent=2))

print("Configuration saved to yolov8_config.json")