### Chuẩn bị data

```text
datasets/
├── BEAT/
│   ├── 1/
│   ├── 2/
│   ├── ...
│   └── ...
├── BEAT_numpy/
│   ├── npy/
│   └── txt/
```

In [2]:
# Root folder of the raw BEAT data (numbered sub-folders 1/, 2/, ... per the tree above).
base_dir = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/BEAT"
# Output root for the converted data (npy/ and txt/ sub-folders per the tree above).
out_dir = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/BEAT_numpy"

### Tính mean/std cho toàn bộ dataset

In [None]:
# Fit the dataset scaler (mean/std, per this section's heading) on the raw data under base_dir.
# --start 1 --end 4 presumably restricts the run to the numbered sub-folders 1..4 — TODO confirm in step1_fit_scaler.py.
!python "/home/serverai/ltdoanh/Motion_Diffusion/datasets/step1_fit_scaler.py" --parent-dir "{base_dir}" --start 1 --end 4

### Chuyển dataset bvh sang npy theo từng segment

In [None]:
# Convert the BVH files under base_dir into per-segment .npy files under out_dir (per this section's heading).
# --start/--end should presumably match the range used when fitting the scaler — TODO confirm.
!python "/home/serverai/ltdoanh/Motion_Diffusion/datasets/preprocess_data.py" --parent-dir "{base_dir}" --out-root "{out_dir}" --start 1 --end 4

### Train MotionDiffuse như bình thường

In [None]:
# Plain MotionDiffuse training run (per the section heading); intentionally left disabled here.
# !python "/home/serverai/ltdoanh/Motion_Diffusion/tools/train.py" --dataset_name beat

### Train VQ-VAE

In [None]:
# Train the VQ-VAE on the `beat` dataset with a codebook size of 512.
# The next step refers to the result as VQVAE_BEAT; that is presumably this script's default run name — TODO confirm.
!python "/home/serverai/ltdoanh/Motion_Diffusion/tools/train_vq.py" --dataset_name beat --codebook_size 512

### Train MotionDiffuse trên Latent Space do VQ-VAE ở trên tạo ra

In [1]:
# Train the diffusion model on the latent space of the VQ-VAE named VQVAE_BEAT.
# The recorded output shows the VQ-VAE being loaded from a relative ./checkpoints/... path,
# so the kernel's working directory presumably matters — TODO confirm.
!python "/home/serverai/ltdoanh/Motion_Diffusion/tools/train_vq_diffusion.py" --dataset_name beat --vqvae_name VQVAE_BEAT

2025-11-19 03:15:46.401146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using device: cuda:0
[INFO] Mean and Std extracted successfully from pipeline.
[INFO] Mean and Std loaded successfully from .pkl file.
Loading motion‑text pairs: 100%|█████████████| 223/223 [00:00<00:00, 702.99it/s]
Train dataset: 223 samples
Loading VQ-VAE from ./checkpoints/beat/VQVAE_BEAT/model/finest.tar
VQ-VAE loaded successfully
VQ-VAE frozen
VQLatentDiffusion initialized:
  - Input features (latent): 512
  - Latent sequence length: 45
  - Transformer latent dim: 512
  - Use continuous latent: True
Model parameters:
  Total: 266.18M
  Trainable: 87.33M
Diffusion steps: 1000
Starting VQ Latent Diffusion Training
Epochs: 10
Batches per epoch: 3
Total steps: 30
[INFO] 3/1_wayn

### Evaluation - Đang fix lỗi ...

In [None]:
# Run the evaluation script. The section heading marks this step as still being debugged.
!python "/home/serverai/ltdoanh/Motion_Diffusion/run_evaluation.py"

### Visualize data bằng mean/std chuẩn tính từ bộ dữ liệu

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import sys
import os
import numpy as np
import joblib

# Setup path
PYOM_DIR = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/pymo"
if PYOM_DIR not in sys.path:
    sys.path.insert(0, PYOM_DIR)

from pymo.parsers import BVHParser
from pymo.preprocessing import *
from pymo.viz_tools import draw_stickfigure, draw_stickfigure3d
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

print("✅ Imports successful!")

# ===== CELL 2: Load Pipeline =====
# Location of the serialized preprocessing pipeline.
pipeline_path = "/home/serverai/ltdoanh/Motion_Diffusion/global_pipeline.pkl"
print(f"📦 Loading pipeline from: {pipeline_path}")

# joblib.load unpickles arbitrary Python objects: only point it at a file you trust.
pipeline = joblib.load(pipeline_path)
print("✅ Pipeline loaded!")

# Report the statistics stored on the 'stdscale' step (looked up once, printed twice).
_stdscale = pipeline.named_steps['stdscale']
_mean_stats = _stdscale.data_mean_
_std_stats = _stdscale.data_std_
print(f"   Mean shape: {_mean_stats.shape}, First 5 values: {_mean_stats[:5]}")
print(f"   Std shape: {_std_stats.shape},  First 5 values: {_std_stats[:5]}")

# ===== CELL 3: Load and Visualize Motion =====
# Purpose: load one preprocessed segment, map it back through the loaded pipeline,
# and draw a single frame as 2D and 3D stick figures.
npy_path = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/BEAT_numpy/npy/1/1_wayne_0_1_1_sentence_000.npy"
# Index of the frame to draw; must be smaller than the number of reconstructed frames.
frame = 50

print(f"\n🎬 Visualizing: {os.path.basename(npy_path)}")
print(f"   Frame: {frame}")

# Load motion data
motion_data = np.load(npy_path)
print(f"   Motion shape: {motion_data.shape}")

# Inverse transform
# The pipeline receives a one-element list and the result is read back with [0].
# `.values` is read below, so each result is presumably a pymo track object — TODO confirm.
print("   Performing inverse transform...")
reconstructed = pipeline.inverse_transform([motion_data])
print(f"   ✅ Reconstructed shape: {reconstructed[0].values.shape}")

# Visualize 2D
# NOTE(review): no `ax` is passed to draw_stickfigure; if it opens its own figure,
# fig1 stays empty and the title lands on the helper's figure — confirm in pymo.viz_tools.
print("\n   Creating 2D visualization...")
fig1 = plt.figure(figsize=(10, 8))
draw_stickfigure(reconstructed[0], frame=frame)
plt.title(f"2D Stick Figure - Frame {frame}")
plt.tight_layout()
plt.show()

print("   ✅ 2D plot displayed!")

# Visualize 3D
# Same pattern as the 2D plot, using the 3D helper.
print("\n   Creating 3D visualization...")
fig2 = plt.figure(figsize=(10, 8))
draw_stickfigure3d(reconstructed[0], frame=frame)
plt.title(f"3D Stick Figure - Frame {frame}")
plt.tight_layout()
plt.show()

print("   ✅ 3D plot displayed!")

### Visualize data bằng mean/std lấy từ model đã được huấn luyện

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import sys
import os
import numpy as np
import joblib
import torch

# Setup path
PYOM_DIR = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/pymo"
if PYOM_DIR not in sys.path:
    sys.path.insert(0, PYOM_DIR)

from pymo.parsers import BVHParser
from pymo.preprocessing import *
from pymo.viz_tools import draw_stickfigure, draw_stickfigure3d
from sklearn.pipeline import Pipeline

print("✅ Imports successful!")

# ===== CREATE PIPELINE WITH META =====
# Purpose: rebuild the preprocessing pipeline in code and fit it on one BVH file,
# so that it can later be used for inverse_transform.
print("\n🔨 Creating pipeline from scratch...")

# 1. Create empty pipeline
# Step names matter: the 'stdscale' step is looked up by name further down in this cell.
# NOTE(review): 'abdolute_translation_deltas' looks misspelled, but it is presumably the
# exact method string pymo's RootTransformer expects — verify before "correcting" it.
pipeline = Pipeline([
    ('param', MocapParameterizer('position')),
    ('rcpn', RootCentricPositionNormalizer()),
    ('delta', RootTransformer('abdolute_translation_deltas')),
    ('const', ConstantsRemover()),
    ('np', Numpyfier()),
    ('down', DownSampler(2)),
    ('stdscale', ListStandardScaler())
])

# 2. Fit pipeline on sample BVH (to learn structure)
# Only a single file is used here. The scaler statistics this fit produces are
# overwritten later in this cell with the values read from the checkpoint.
bvh_sample_path = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/BEAT/1/1_wayne_0_1_1.bvh"
print(f"   Fitting on BVH sample: {os.path.basename(bvh_sample_path)}")

parser = BVHParser()
parsed_data = parser.parse(bvh_sample_path)
pipeline.fit([parsed_data])
print("   ✅ Pipeline fitted (structure learned)")

# 3. Load mean/std from the trained checkpoint
# (An earlier variant of this cell read them from mean.npy / std.npy under
# checkpoints/beat/test/meta instead.)
ckpt_path = "/home/serverai/ltdoanh/Motion_Diffusion/checkpoints/beat/vq_diffusion/model/best.pt"  # replace with the path to your own checkpoint

# map_location='cpu' lets the checkpoint load on a machine without the training GPU.
# weights_only=False unpickles arbitrary Python objects, so only load checkpoints you trust.
# Loading errors are deliberately NOT swallowed: continuing without fresh statistics would
# either raise an unrelated NameError below or silently reuse `mean`/`std` left in the
# kernel by a previously executed cell.
checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)

# Both statistics are required by the override below; stop here if either is absent.
if 'mean' not in checkpoint or 'std' not in checkpoint:
    raise KeyError(
        "Không tìm thấy key 'mean' hoặc 'std' trong file .pt này. "
        f"Các keys hiện có: {list(checkpoint.keys())}"
    )

mean = checkpoint['mean']
std = checkpoint['std']

print("--- Đã tìm thấy Mean và Std ---")
print(f"Shape của Mean: {mean.shape}")
print(f"Shape của Std: {std.shape}")

# Show the first few values as a quick sanity check.
print(f"Mean (5 giá trị đầu): {mean[:5]}")
print(f"Std (5 giá trị đầu): {std[:5]}")

# 4. Override pipeline's mean/std with the checkpoint values
# so that inverse_transform uses the statistics the model was trained with.
print("\n🔧 Overriding pipeline statistics with meta values...")
pipeline.named_steps['stdscale'].data_mean_ = mean
pipeline.named_steps['stdscale'].data_std_ = std
print("   ✅ Pipeline updated with meta statistics!")

# ===== VISUALIZE MOTION =====
# Purpose: load a generated motion file, map it back through the pipeline (whose
# 'stdscale' statistics were overridden above), and draw one frame in 2D and 3D.
npy_path = "/home/serverai/ltdoanh/Motion_Diffusion/results/generated_motions/motion_0_0.npy"
# Index of the frame to draw; must be smaller than the number of reconstructed frames.
frame = 50

print(f"\n🎬 Visualizing: {os.path.basename(npy_path)}")
print(f"   Frame: {frame}")

# Load motion data
# NOTE(review): the array is passed to the pipeline as a single track, so it presumably
# has to be 2-D (frames, features) and still in normalised units — TODO confirm.
motion_data = np.load(npy_path)
print(f"   Motion shape: {motion_data.shape}")

# Inverse transform
print("   Performing inverse transform...")
reconstructed = pipeline.inverse_transform([motion_data])
print(f"   ✅ Reconstructed shape: {reconstructed[0].values.shape}")

# Visualize 2D
# NOTE(review): no `ax` is passed to draw_stickfigure; if it opens its own figure,
# fig1 stays empty — confirm in pymo.viz_tools.
print("\n   Creating 2D visualization...")
fig1 = plt.figure(figsize=(10, 8))
draw_stickfigure(reconstructed[0], frame=frame)
plt.title(f"2D Stick Figure (Meta) - Frame {frame}")
plt.tight_layout()
plt.show()

print("   ✅ 2D plot displayed!")

# Visualize 3D
# Same pattern as the 2D plot, using the 3D helper.
print("\n   Creating 3D visualization...")
fig2 = plt.figure(figsize=(10, 8))
draw_stickfigure3d(reconstructed[0], frame=frame)
plt.title(f"3D Stick Figure (Meta) - Frame {frame}")
plt.tight_layout()
plt.show()

print("   ✅ 3D plot displayed!")

### Inference 

In [None]:
import torch
import torch.nn as nn
import numpy as np
import os
import sys

# Đảm bảo đường dẫn đúng để import các module của bạn
# sys.path.insert(0, "/path/to/your/project_root") 

from models import MotionTransformer
from trainers import DDPMTrainer
# Import đúng class từ file model.py bạn đã upload
from models.vq.model import RVQVAE 

# ==========================================
# 1. Inference configuration
# ==========================================
class InferenceConfig:
    """Option bundle for the inference cell.

    The model-shape fields (input_feats, num_frames, num_layers, latent_dim,
    ff_size, num_heads, no_clip, no_eff) are passed to MotionTransformer when the
    model is built, ``device`` selects where the models run, and ``result_dir`` is
    where the generated motion is written. The whole object is also handed to
    DDPMTrainer, which presumably reads the remaining fields — TODO confirm.
    """

    def __init__(self):
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.is_train = False
        self.schedule_sampler = 'uniform'

        # --- Diffusion configuration ---
        self.input_feats = 512     # latent dimension
        # Latent length: 360 / 8 = 45. This was previously 24, which contradicted that
        # arithmetic, the "Latent sequence length: 45" reported by the training run,
        # and the length of 45 requested at generation time.
        self.num_frames = 45
        self.num_layers = 8
        self.latent_dim = 512
        self.ff_size = 1024
        self.num_heads = 8
        self.dropout = 0.1
        self.activation = "gelu"
        self.dataset_name = 'beat'
        self.do_denoise = True
        self.noise_schedule = 'cosine'
        self.diffusion_steps = 1000
        self.no_clip = False
        self.no_eff = False
        # Output folder for generated motions.
        self.result_dir = "/home/serverai/ltdoanh/Motion_Diffusion/results"

# Stand-in for the `args` object that RVQVAE receives.
class VQArgs:
    """Minimal attribute holder passed as ``args`` when RVQVAE is constructed."""

    def __init__(self):
        # Defaults only; num_quantizers is overwritten later from the checkpoint keys.
        defaults = (
            ('num_quantizers', 1),
            ('shared_codebook', False),
            ('quantize_dropout_prob', 0.0),
            ('mu', 0.99),  # for QuantizerEMA
        )
        for attr_name, attr_value in defaults:
            setattr(self, attr_name, attr_value)

opt = InferenceConfig()
vq_args = VQArgs()

# ==========================================
# 2. Load checkpoint & split weights
# ==========================================
ckpt_path = "/home/serverai/ltdoanh/Motion_Diffusion/checkpoints/beat/vq_diffusion/model/best.pt"
print(f"📂 Loading checkpoint: {ckpt_path}")

# Load the whole checkpoint on CPU. weights_only=False unpickles arbitrary Python
# objects, so only load checkpoints you trust.
checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)
state_dict = checkpoint['model_state_dict']

# --- Detect the number of quantizers from the checkpoint keys ---
# The largest N found in keys of the form '...vqvae.quantizer.layers.N...' decides the
# count, so the RVQVAE is built with matching keys. Falls back to 1 when none is found.
max_layer_idx = 0
for k in state_dict.keys():
    if "vqvae.quantizer.layers." not in k:
        continue
    parts = k.split('.')
    try:
        layer_idx = int(parts[parts.index('layers') + 1])
    except (ValueError, IndexError):
        # The component after 'layers' is missing or not an integer: ignore this key.
        # Only these two parse failures are skipped; anything else should surface.
        continue
    max_layer_idx = max(max_layer_idx, layer_idx)

vq_args.num_quantizers = max_layer_idx + 1
print(f"🔍 Detected num_quantizers: {vq_args.num_quantizers}")

# --- Split the combined state dict per sub-model ---
# Keys are stored as 'transformer.<name>' / 'vqvae.<name>'; the prefix is stripped so
# each sub-model can load its own weights. Keys with any other prefix are dropped.
TRANSFORMER_PREFIX = 'transformer.'
VQVAE_PREFIX = 'vqvae.'

trans_dict = {}
vqvae_dict = {}

for k, v in state_dict.items():
    if k.startswith(TRANSFORMER_PREFIX):
        trans_dict[k[len(TRANSFORMER_PREFIX):]] = v
    elif k.startswith(VQVAE_PREFIX):
        vqvae_dict[k[len(VQVAE_PREFIX):]] = v

# ==========================================
# 3. Build the models
# ==========================================

# A. Motion Transformer
# Constructor arguments come straight from the inference config.
print("🔧 Initializing MotionTransformer...")
transformer_kwargs = dict(
    input_feats=opt.input_feats,
    num_frames=opt.num_frames,
    num_layers=opt.num_layers,
    latent_dim=opt.latent_dim,
    num_heads=opt.num_heads,
    ff_size=opt.ff_size,
    no_clip=opt.no_clip,
    no_eff=opt.no_eff,
)
encoder = MotionTransformer(**transformer_kwargs)
# strict=True: any key mismatch with the checkpoint is an error.
encoder.load_state_dict(trans_dict, strict=True)
encoder.to(opt.device).eval()

# B. RVQVAE
print("🔧 Initializing RVQVAE...")
# NOTE: these hyper-parameters must match the configuration used when the VQ-VAE was
# trained. The values below are typical defaults, not read from that configuration.
rvqvae_kwargs = dict(
    args=vq_args,
    input_width=264,       # BEAT feature width is typically 264 — verify against training config
    nb_code=512,           # re-check against the training config if loading fails
    code_dim=512,
    output_emb_width=512,
    down_t=3,
    stride_t=2,
    width=512,
    depth=3,
    dilation_growth_rate=3,
    activation='relu',
    norm=None,
)
vqvae_model = RVQVAE(**rvqvae_kwargs)
vqvae_model.load_state_dict(vqvae_dict, strict=True)
vqvae_model.to(opt.device).eval()

# ==========================================
# 4. Inference
# ==========================================
trainer = DDPMTrainer(opt, encoder)

# Inject the normalisation statistics stored in the checkpoint into the trainer.
trainer.mean = checkpoint['mean']
trainer.std = checkpoint['std']

print("🚀 Starting Inference...")
os.makedirs(opt.result_dir, exist_ok=True)

with torch.no_grad():
    caption = ["the first thing i like to do on weekends is relaxing"]

    # Requested latent length (45).
    m_lens = torch.LongTensor([45]).to(opt.device)

    # 1. Generate the latent with the diffusion model.
    #    Expected output: (batch, length, dim) = (1, 45, 512) — TODO confirm.
    pred_latent_list = trainer.generate(caption, m_lens, dim_pose=512)
    pred_latent = pred_latent_list[0]

    if pred_latent.dim() == 2:
        pred_latent = pred_latent.unsqueeze(0)  # add the batch dimension when missing

    print(f"   Latent generated shape: {pred_latent.shape}")

    # 2. Decode with the RVQVAE.
    #    The decoder is fed (batch, channel, length), hence the permute -> (1, 512, 45).
    latent_input = pred_latent.permute(0, 2, 1)

    print("   Decoding with RVQVAE...")
    # The decoder is called directly; the quantizer is skipped because the diffusion
    # model already produced the latent.
    decoded_motion = vqvae_model.decoder(latent_input)

    # 3. Post-process: postprocess is expected to map (B, C, T) -> (B, T, C) — TODO confirm.
    motion_normalized = vqvae_model.postprocess(decoded_motion).cpu().numpy()

    # Safety net in case the array is still channel-first (1, 264, 360).
    if motion_normalized.shape[1] == 264 and motion_normalized.shape[2] == 360:
        motion_normalized = motion_normalized.transpose(0, 2, 1)

    # 4. De-normalise for in-notebook use.
    #    np.asarray makes the broadcast well-defined whether the checkpoint stores
    #    NumPy arrays or CPU tensors.
    mean = np.asarray(checkpoint['mean'])
    std = np.asarray(checkpoint['std'])

    # Expected shapes: motion (1, 360, 264), mean (264,), std (264,) — TODO confirm.
    motion = motion_normalized * std + mean

    print(f"🎉 Final Motion Shape: {motion.shape}")

# Save the result.
# The NORMALISED array is written: the visualisation cell that reads this file runs it
# through pipeline.inverse_transform with the same mean/std, which presumably already
# de-standardises. Saving `motion` would apply `* std + mean` twice.
# NOTE(review): confirm against pymo's ListStandardScaler.inverse_transform.
save_path = os.path.join(opt.result_dir, 'motion_inference.npy')
np.save(save_path, motion_normalized)
print(f"💾 Saved to: {save_path}")

KeyboardInterrupt: 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import sys
import os
import numpy as np
import joblib
import torch

# Setup path
PYOM_DIR = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/pymo"
if PYOM_DIR not in sys.path:
    sys.path.insert(0, PYOM_DIR)

from pymo.parsers import BVHParser
from pymo.preprocessing import *
from pymo.viz_tools import draw_stickfigure, draw_stickfigure3d
from sklearn.pipeline import Pipeline

print("✅ Imports successful!")

# ===== CREATE PIPELINE WITH META =====
# Purpose: rebuild the preprocessing pipeline in code and fit it on one BVH file,
# so that it can later be used for inverse_transform.
# NOTE(review): this section repeats the pipeline construction of the earlier
# visualisation cell verbatim; a shared helper would avoid the copy.
print("\n🔨 Creating pipeline from scratch...")

# 1. Create empty pipeline
# Step names matter: the 'stdscale' step is looked up by name further down in this cell.
# NOTE(review): 'abdolute_translation_deltas' looks misspelled, but it is presumably the
# exact method string pymo's RootTransformer expects — verify before "correcting" it.
pipeline = Pipeline([
    ('param', MocapParameterizer('position')),
    ('rcpn', RootCentricPositionNormalizer()),
    ('delta', RootTransformer('abdolute_translation_deltas')),
    ('const', ConstantsRemover()),
    ('np', Numpyfier()),
    ('down', DownSampler(2)),
    ('stdscale', ListStandardScaler())
])

# 2. Fit pipeline on sample BVH (to learn structure)
# Only a single file is used here. The scaler statistics this fit produces are
# overwritten later in this cell with the values read from the checkpoint.
bvh_sample_path = "/home/serverai/ltdoanh/Motion_Diffusion/datasets/BEAT/1/1_wayne_0_1_1.bvh"
print(f"   Fitting on BVH sample: {os.path.basename(bvh_sample_path)}")

parser = BVHParser()
parsed_data = parser.parse(bvh_sample_path)
pipeline.fit([parsed_data])
print("   ✅ Pipeline fitted (structure learned)")

# 3. Load mean/std from the trained checkpoint
# (An earlier variant of this cell read them from mean.npy / std.npy under
# checkpoints/beat/test/meta instead.)
ckpt_path = "/home/serverai/ltdoanh/Motion_Diffusion/checkpoints/beat/vq_diffusion/model/best.pt"  # replace with the path to your own checkpoint

# map_location='cpu' lets the checkpoint load on a machine without the training GPU.
# weights_only=False unpickles arbitrary Python objects, so only load checkpoints you trust.
# Loading errors are deliberately NOT swallowed: continuing without fresh statistics would
# either raise an unrelated NameError below or silently reuse `mean`/`std` left in the
# kernel by a previously executed cell.
checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)

# Both statistics are required by the override below; stop here if either is absent.
if 'mean' not in checkpoint or 'std' not in checkpoint:
    raise KeyError(
        "Không tìm thấy key 'mean' hoặc 'std' trong file .pt này. "
        f"Các keys hiện có: {list(checkpoint.keys())}"
    )

mean = checkpoint['mean']
std = checkpoint['std']

print("--- Đã tìm thấy Mean và Std ---")
print(f"Shape của Mean: {mean.shape}")
print(f"Shape của Std: {std.shape}")

# Show the first few values as a quick sanity check.
print(f"Mean (5 giá trị đầu): {mean[:5]}")
print(f"Std (5 giá trị đầu): {std[:5]}")

# 4. Override pipeline's mean/std with the checkpoint values
# so that inverse_transform uses the statistics the model was trained with.
print("\n🔧 Overriding pipeline statistics with meta values...")
pipeline.named_steps['stdscale'].data_mean_ = mean
pipeline.named_steps['stdscale'].data_std_ = std
print("   ✅ Pipeline updated with meta statistics!")

# ===== VISUALIZE MOTION =====
# Purpose: load the motion written by the inference cell, map it back through the
# pipeline (whose 'stdscale' statistics were overridden above), and draw one frame.
npy_path = "/home/serverai/ltdoanh/Motion_Diffusion/results/motion_inference.npy"
# Index of the frame to draw; must be smaller than the number of reconstructed frames.
frame = 50

print(f"\n🎬 Visualizing: {os.path.basename(npy_path)}")
print(f"   Frame: {frame}")

# Load motion data
# NOTE(review): inverse_transform below is expected to undo the standardisation with the
# overridden mean/std, so this file must hold NORMALISED values — verify against the
# cell that wrote it.
motion_data = np.load(npy_path)
print(f"   Motion shape: {motion_data.shape}")
# A batched (3-D) array is reduced to its first sample so a single track is passed on.
if motion_data.ndim == 3:
    motion_data = motion_data[0] # take the first sample -> expected (360, 264)
    print(f"   Squeezed Motion shape: {motion_data.shape}")

# Inverse transform
print("   Performing inverse transform...")
reconstructed = pipeline.inverse_transform([motion_data])
print(f"   ✅ Reconstructed shape: {reconstructed[0].values.shape}")

# Visualize 2D
# NOTE(review): no `ax` is passed to draw_stickfigure; if it opens its own figure,
# fig1 stays empty — confirm in pymo.viz_tools.
print("\n   Creating 2D visualization...")
fig1 = plt.figure(figsize=(10, 8))
draw_stickfigure(reconstructed[0], frame=frame)
plt.title(f"2D Stick Figure (Meta) - Frame {frame}")
plt.tight_layout()
plt.show()

print("   ✅ 2D plot displayed!")

# Visualize 3D
# Same pattern as the 2D plot, using the 3D helper.
print("\n   Creating 3D visualization...")
fig2 = plt.figure(figsize=(10, 8))
draw_stickfigure3d(reconstructed[0], frame=frame)
plt.title(f"3D Stick Figure (Meta) - Frame {frame}")
plt.tight_layout()
plt.show()

print("   ✅ 3D plot displayed!")