In [None]:
# Import required libraries
import os
import pandas as pd
import shutil
from pathlib import Path
from tqdm.notebook import tqdm
import cv2
import torch
from facenet_pytorch import MTCNN
from kaggle.api.kaggle_api_extended import KaggleApi
import json

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One shared MTCNN face detector, configured once and reused by the
# face-detection step later in this script.
mtcnn = MTCNN(
    device=device,
    image_size=224,
    margin=0,
    keep_all=False,
    min_face_size=20,
    factor=0.709,
    thresholds=[0.6, 0.7, 0.7],
)

# Fetch the source dataset from Kaggle and unpack it into the working directory
print("Downloading dataset...")
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    'hlly34/liveness-detection-zalo-2022',
    path='/kaggle/working',
    unzip=True,
)

# Lay out the new dataset: one sub-folder per class
dataset_root = Path('/kaggle/working/Zalo_AIC_dataset')
for folder in ('live', 'spoof'):
    class_dir = dataset_root / folder
    class_dir.mkdir(parents=True, exist_ok=True)

# Sort every labelled video into its class folder
print("\nMoving videos to live/spoof folders...")
label_file = Path('/kaggle/working/train/train/label.csv')
labels_df = pd.read_csv(label_file)

# The videos sit in a "videos" directory next to the label file.
videos_dir = label_file.parent / 'videos'
for _, row in tqdm(labels_df.iterrows(), total=len(labels_df)):
    fname = row['fname']
    # A liveness_score of exactly 1 goes to "live"; anything else goes to "spoof".
    target_folder = 'live' if row['liveness_score'] == 1 else 'spoof'
    shutil.move(str(videos_dir / fname), str(dataset_root / target_folder / fname))

# The unpacked source tree is no longer needed once the videos are moved
print("\nRemoving original dataset...")
shutil.rmtree('/kaggle/working/train')

# Function to extract frames from video
def extract_frames(video_path, save_dir, start_idx):
    """Save one frame per whole second of a video as numbered JPEG files.

    The capture is positioned at frame ``sec * fps`` for every whole second of
    the video's duration, and each frame that is read successfully is written
    to ``save_dir`` as ``<index>.jpg`` with a zero-padded six-digit index.
    A video shorter than one second therefore produces no frames.

    Args:
        video_path: Path of the video file to read.
        save_dir: Directory (a ``Path``) that receives the extracted frames.
        start_idx: Index used for the first frame that gets saved.

    Returns:
        The next unused index, i.e. ``start_idx`` plus the number of frames
        written. ``start_idx`` is returned unchanged when the video cannot be
        opened or does not report a positive frame rate.
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            print(f"Error opening video: {video_path}")
            return start_idx

        fps = cap.get(cv2.CAP_PROP_FPS)
        # Written as "not fps > 0" so that 0, negative values and NaN are all
        # rejected before the division below.
        if not fps > 0:
            print(f"Invalid FPS for video: {video_path}")
            return start_idx

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps

        idx = start_idx
        for sec in range(int(duration)):
            # Seek to the first frame of this second.
            cap.set(cv2.CAP_PROP_POS_FRAMES, sec * fps)
            ret, frame = cap.read()
            if not ret:
                continue
            save_path = save_dir / f"{idx:06d}.jpg"
            # Only consume the index when the image was actually written.
            if cv2.imwrite(str(save_path), frame):
                idx += 1
        return idx
    finally:
        # Release the capture on every exit path, including errors.
        cap.release()

# Turn every video into still frames, using one running index across both classes
print("\nExtracting frames from videos...")
next_idx = 1
for folder in ('live', 'spoof'):
    folder_path = dataset_root / folder
    # Snapshot the listing first: the loop adds and removes files in this folder.
    videos = list(folder_path.glob('*.mp4'))
    progress = tqdm(videos, desc=f"Processing {folder} videos")

    for video in progress:
        next_idx = extract_frames(video, folder_path, next_idx)
        # The video is dropped once its frames have been extracted.
        video.unlink()

# Function to detect face and save bounding box
def detect_face(img_path, target_size=224):
    """Detect a face in an image and write its bounding box next to the image.

    The image is read with OpenCV, converted to RGB and passed to the
    module-level ``mtcnn`` detector. The first returned box is clamped to the
    image, rescaled to a ``target_size`` x ``target_size`` coordinate grid and
    written to ``<stem>_BB.txt`` in the image's directory as
    ``"x y w h prob"``.

    Args:
        img_path: ``Path`` of the image to process.
        target_size: Side length of the coordinate grid the box is rescaled
            to. Defaults to 224, the value previously hard-coded here.

    Returns:
        ``True`` when a bounding-box file was written. ``False`` when the
        image cannot be read, no face is detected, the clamped box is empty,
        or any exception occurs (the exception is printed, not raised).
    """
    try:
        img = cv2.imread(str(img_path))
        if img is None:
            return False

        real_h, real_w = img.shape[:2]
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        boxes, probs = mtcnn.detect(img_rgb)
        if boxes is None or len(boxes) == 0:
            return False

        x1, y1, x2, y2 = boxes[0]
        prob = probs[0]

        # Detector boxes may extend past the image border; clamp them so the
        # stored box never has negative or out-of-range coordinates.
        x1 = min(max(x1, 0), real_w)
        y1 = min(max(y1, 0), real_h)
        x2 = min(max(x2, 0), real_w)
        y2 = min(max(y2, 0), real_h)
        if x2 <= x1 or y2 <= y1:
            # Nothing of the box lies inside the image.
            return False

        # Rescale from image pixels to the target_size grid.
        x = int(x1 * target_size / real_w)
        y = int(y1 * target_size / real_h)
        w = int((x2 - x1) * target_size / real_w)
        h = int((y2 - y1) * target_size / real_h)

        bb_path = img_path.parent / f"{img_path.stem}_BB.txt"
        with open(bb_path, 'w') as f:
            f.write(f"{x} {y} {w} {h} {prob:.7f}")

        return True

    except Exception as e:
        # Best-effort step: a failing image is reported and treated as
        # "no face" so the caller can discard it and keep going.
        print(f"Error processing {img_path}: {str(e)}")
        return False

# Run face detection on every extracted frame; frames without a usable face are deleted
print("\nDetecting faces...")
for folder in ('live', 'spoof'):
    folder_path = dataset_root / folder
    images = list(folder_path.glob('*.jpg'))
    progress = tqdm(images, desc=f"Processing {folder} images")

    for img_path in progress:
        if detect_face(img_path):
            continue
        # detect_face returned False, so no bounding-box file was written: drop the frame.
        img_path.unlink()

# Function to rename files with new indices
def _renumber_pass(folder_path, first_idx, width, message):
    """Rename each non-BB file, and its BB sidecar if present, to a padded index."""
    entries = sorted(
        p for p in folder_path.iterdir() if not p.name.endswith('_BB.txt')
    )
    print(message)
    for offset, entry in enumerate(tqdm(entries)):
        number = first_idx + offset
        old_stem = entry.stem
        parent = entry.parent

        # Rename the main file, keeping its extension.
        entry.rename(parent / f"{number:0{width}d}{entry.suffix}")

        # The sidecar is looked up under the file's previous stem.
        sidecar = parent / f"{old_stem}_BB.txt"
        if sidecar.exists():
            sidecar.rename(parent / f"{number:0{width}d}_BB.txt")


def rename_files(folder_path, start_idx=1):
    """Renumber the files of a folder consecutively, starting at ``start_idx``.

    Every entry whose name does not end in ``_BB.txt`` is renamed, in sorted
    order, to a six-digit zero-padded index with its original extension; a
    matching ``<stem>_BB.txt`` file is renamed alongside it. The renaming runs
    in two passes: first to seven-digit names counted from 1, then to the
    final six-digit names, so the final names are assigned only after every
    file has left its old six-digit name.

    Args:
        folder_path: Directory (a ``Path``) whose files are renumbered.
        start_idx: Index given to the first file in sorted order.
    """
    _renumber_pass(
        folder_path,
        first_idx=1,
        width=7,
        message=f"Converting {folder_path.name} to 7 digits...",
    )
    _renumber_pass(
        folder_path,
        first_idx=start_idx,
        width=6,
        message=f"Renaming {folder_path.name} to 6 digits...",
    )

# Renumber the files so live and spoof share one consecutive index range
print("\nRenaming files...")
live_path = dataset_root / 'live'
spoof_path = dataset_root / 'spoof'

rename_files(live_path)  # Live files start from 000001
# Count the live files (bounding-box sidecars excluded) so spoof numbering continues after them.
live_count = sum(1 for f in live_path.iterdir() if not f.name.endswith('_BB.txt'))
rename_files(spoof_path, start_idx=live_count + 1)

# Create and upload the new dataset
print("\nCreating Kaggle dataset...")
# Kaggle expects the dataset id in "<owner>/<slug>" form; a bare slug is rejected
# when the dataset is created. The owner is the account authenticated above.
kaggle_user = api.get_config_value('username')
if not kaggle_user:
    raise RuntimeError(
        "Kaggle username is not configured; cannot build the dataset id"
    )

metadata = {
    'title': 'Zalo-AIC - Face Anti-Spoofing Dataset',
    'id': f"{kaggle_user}/zalo-aic-face-anti-spoofing-dataset",
    'licenses': [{'name': 'CC0-1.0'}]
}

# dataset_create_new reads its settings from dataset-metadata.json inside the folder.
metadata_path = dataset_root / 'dataset-metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=4)

print("Uploading dataset to Kaggle...")
api.dataset_create_new(
    folder=str(dataset_root),
    dir_mode='zip',  # sub-directories (live/, spoof/) are uploaded as zip archives
    quiet=False
)

print("\nAll done!")