In [1]:
%%capture
!pip install huggingface_hub[cli] transformers[torch] datasets

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
GH_TOKEN = user_secrets.get_secret("GH_TOKEN")
GH_USERN = user_secrets.get_secret("GH_USERNAME")
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [3]:
from huggingface_hub import login
login(HF_TOKEN)

In [4]:
!rm -rf facial-landmark-localization
!git clone https://{GH_USERN}:{GH_TOKEN}@github.com/nguyenvmthien/facial-landmark-localization.git
!cd facial-landmark-localization && git checkout vmphat --force

Cloning into 'facial-landmark-localization'...
remote: Enumerating objects: 292, done.[K
remote: Counting objects: 100% (104/104), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 292 (delta 60), reused 75 (delta 33), pack-reused 188 (from 1)[K
Receiving objects: 100% (292/292), 80.75 MiB | 46.45 MiB/s, done.
Resolving deltas: 100% (118/118), done.
Branch 'vmphat' set up to track remote branch 'vmphat' from 'origin'.
Switched to a new branch 'vmphat'


In [5]:
!rm -rf /kaggle/working/facial-landmark-localization/Source/saves

In [6]:
# !wget https://huggingface.co/kartiknarayan/facexformer/resolve/main/ckpts/model.pt 
# !mv model.pt /kaggle/working/facial-landmark-localization/Source

In [7]:
%%writefile facial-landmark-localization/Source/train.py
import os
import torch
import torchvision
import datasets
import PIL.Image
from datetime import datetime
from dataclasses import dataclass
from network import FaceXFormer
from transformers import Trainer, TrainingArguments
from torchvision.transforms import InterpolationMode

device = "cuda:0"
dtype = torch.float32
batch_size = 24
gradient_accumulation = 2


def load_model(weight_path):
    model = FaceXFormer().to(device)
    if weight_path:
        checkpoint = torch.load(weight_path, map_location=device)
        model.load_state_dict(checkpoint["state_dict_backbone"])
        print("Loaded pretrained model")
    model = model.to(dtype=dtype)
    return model


def load_dataset(dataset_path):
    dataset = datasets.load_dataset("json", data_files=dataset_path, split="train")
    # dataset = dataset.cast_column("img", datasets.Image())
    dataset = dataset.train_test_split(0.2, shuffle=True, seed=42)
    return dataset


@dataclass
class DataCollator:
    image_base_path: str
    return_tensors: str = "pt"

    transforms_image = torchvision.transforms.Compose(
        [
            torchvision.transforms.Resize(
                size=(224, 224), interpolation=InterpolationMode.BICUBIC
            ),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(
                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
            ),
        ]
    )

    def __call__(self, features: list[dict], return_tensors=None):
        batch = {
            "labels": [],
            "x": [],
            # "tasks": torch.ones(len(features)),
        }

        for feature in features:
            image = PIL.Image.open(
                os.path.join(self.image_base_path, feature["img"])
            ).convert("RGB")
            label = feature["pts"]
            image, x_min, y_min, width = self.cut_face(image, label)
            batch["labels"].append([[x - x_min, y - y_min] for x, y in label])
            batch["x"].append(image)

        # print("This was called")
        batch["labels"] = torch.Tensor(batch["labels"]).to(device) / width * 2 - 1
        batch["x"] = torch.stack(batch["x"]).to(device)
        return batch

    def adjust_bbox(
        self,
        x_min: int,
        y_min: int,
        x_max: int,
        y_max: int,
        image_width: int,
        image_height: int,
        margin_percentage=50,
    ):
        width = x_max - x_min
        height = y_max - y_min

        increase_width = width * (margin_percentage / 100.0) / 2
        increase_height = height * (margin_percentage / 100.0) / 2

        x_min_adjusted = max(0, x_min - increase_width)
        y_min_adjusted = max(0, y_min - increase_height)
        x_max_adjusted = min(image_width, x_max + increase_width)
        y_max_adjusted = min(image_height, y_max + increase_height)

        return x_min_adjusted, y_min_adjusted, x_max_adjusted, y_max_adjusted

    def cut_face(self, img: PIL.Image.Image, pts: list[list[float]]):
        x_vals = [point[0] for point in pts]
        y_vals = [point[1] for point in pts]
        x_min, x_max = int(min(x_vals)), int(max(x_vals))
        y_min, y_max = int(min(y_vals)), int(max(y_vals))
        # Make the image square
        face_img_w, face_img_h = x_max - x_min, y_max - y_min
        side_len = max(face_img_w, face_img_h)
        x_min = max(0, x_min - (side_len - face_img_w) / 2)
        x_max = min(img.width, x_max + (side_len - face_img_w) / 2)
        y_min = max(0, y_min - (side_len - face_img_h) / 2)
        y_max = min(img.height, y_max + (side_len - face_img_h) / 2)

        # Padding
        x_min, y_min, x_max, y_max = self.adjust_bbox(
            x_min, y_min, x_max, y_max, img.width, img.height
        )
        image = img.crop((int(x_min), int(y_min), int(x_max), int(y_max)))
        image: torch.Tensor = self.transforms_image(image)
        return image, x_min, y_min, x_max - x_min


def main():
    dataset = load_dataset(dataset_path="/kaggle/input/300w-with-json/train.json")
    # print(dataset["train"][78])
    # exit(1)
    # model = load_model(weight_path="model.pt")
    model = load_model(weight_path=None)
    trainer = Trainer(
        model,
        args=TrainingArguments(
            output_dir=os.path.join(
                "saves", datetime.strftime(datetime.now(), "%Y%m%d-%H%M")
            ),
            dataloader_pin_memory=False,
            eval_strategy="steps",
            eval_steps=200,
            optim="adamw_torch",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation,
            learning_rate=1e-4,
            save_strategy="steps",
            save_steps=200,
            save_total_limit=3,
            save_safetensors=False,
            remove_unused_columns=False,
            report_to=["tensorboard"],
            # max_steps=200,
            num_train_epochs=12,
            logging_steps=1,
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=DataCollator(image_base_path="/kaggle/input/ibug-300w-large-face-landmark-dataset/ibug_300W_large_face_landmark_dataset"),
    )

    trainer.train()


if __name__ == "__main__":
    main()


Overwriting facial-landmark-localization/Source/train.py


In [8]:
!cd facial-landmark-localization/Source && CUDA_VISIBLE_DEVICES=0 python train.py

2025-05-01 12:56:46.573782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746104207.046563      66 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746104207.167217      66 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Generating train split: 6666 examples [00:01, 5742.25 examples/s]
Downloading: "https://download.pytorch.org/models/swin_b-68c6b09e.pth" to /root/.cache/torch/hub/checkpoints/swin_b-68c6b09e.pth
100%|█████████████████████████████████████████| 335M/335M [00:01<00:00, 205MB/s]
{'loss': 3.2881, 'grad_norm': 1.4522455930709839, 'learning_rate': 0.0001, 'epoch': 0.01}
{'loss': 1.5019, 'grad_norm': 0.48907673358917236, 'learning_rat

In [9]:
!huggingface-cli whoami

thng292
[1morgs: [0m venera-ai,pre-view


In [10]:
!huggingface-cli upload FaceXFormer-train-test facial-landmark-localization/Source/saves . --private

Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
Start hashing 19 files.
Finished hashing 19 files.

optimizer.pt:   0%|                                 | 0.00/33.8M [00:00<?, ?B/s]

training_args.bin:   0%|                            | 0.00/5.30k [00:00<?, ?B/s][A[A


Upload 10 LFS files:   0%|                               | 0/10 [00:00<?, ?it/s][A[A[A



pytorch_model.bin:   0%|                             | 0.00/369M [00:00<?, ?B/s][A[A[A[A




optimizer.pt:   0%|                                 | 0.00/33.8M [00:00<?, ?B/s][A[A[A[A[A
pytorch_model.bin:   0%|                     | 377k/369M [00:00<01:44, 3.53MB/s][A




optimizer.pt:   1%|▎                        | 377k/33.8M [00:00<00:09, 3.44MB/s]



training_args.bin: 100%|███████████████████| 5.30k/5.30k [00:00<00:00, 33.2kB/s]

pytorch_model.bin:   3%|▌            