In [25]:
from ctypes import util
from cv2 import IMREAD_GRAYSCALE
import torch
import utils as utils
import torch.utils.data.dataset as Dataset
from torch.nn.utils.rnn import pad_sequence
import math
from torchvision import transforms
from PIL import Image
import cv2
import os
import random
import numpy as np
import lmdb
import io
import time
from vidaug import augmentors as va
from augmentation import *
import yaml
from loguru import logger
import argparse

In [26]:
# Load the experiment configuration from its hard-coded location.
# The file is only read, so it is opened read-only ('r'); the previous 'r+' mode also
# requested write access and therefore failed on read-only files for no benefit.
# NOTE(review): the `--config` command-line option defined in the argument parser is not
# consulted here, so changing it does not change which file is loaded.
# NOTE(review): yaml.FullLoader is kept as-is; prefer yaml.safe_load if this file could
# ever come from an untrusted source.
with open('./configs/config_gloss_free.yaml', 'r', encoding='utf-8') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
    
def _str2bool(value):
    """Convert a command-line token such as 'true' / 'False' / '1' into a real bool.

    argparse hands option values over as strings, and any non-empty string is truthy,
    so a boolean option needs an explicit converter.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if token in ('0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


# Command-line style hyper-parameters. Help texts state the defaults actually set here.
parser = argparse.ArgumentParser('Gloss-free Sign Language Translation script', add_help=False)
parser.add_argument('--batch-size', default=1, type=int)
parser.add_argument('--epochs', default=1, type=int)
parser.add_argument('--lr', type=float, default=1.0e-3, metavar='LR',
                    help='learning rate (default: 1e-3)')
parser.add_argument('--seed', default=0, type=int)
parser.add_argument('--output_dir', default='./output/slt',
                    help='path where to save, empty for no saving')
parser.add_argument('--device', default='cuda',
                    help='device to use for training / testing')
parser.add_argument('--num_workers', default=0, type=int)
parser.add_argument('--config', type=str, default='./configs/config_gloss_free.yaml')

# * distributed training parameters
parser.add_argument('--world_size', default=1, type=int,
                    help='number of distributed processes')
parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
parser.add_argument('--local_rank', default=0, type=int)

# * Finetuning params
parser.add_argument('--finetune', default='', help='finetune from checkpoint')

# * Optimizer parameters
parser.add_argument('--opt', default='adamw', type=str, metavar='OPTIMIZER',
                    help='Optimizer (default: "adamw")')
parser.add_argument('--opt-eps', default=1.0e-09, type=float, metavar='EPSILON',
                    help='Optimizer Epsilon (default: 1.0e-09)')
parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA',
                    help='Optimizer Betas (default: None, use opt default)')
parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM',
                    help='Clip gradient norm (default: None, no clipping)')
parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                    help='SGD momentum (default: 0.9)')
parser.add_argument('--weight-decay', type=float, default=0.001,
                    help='weight decay (default: 0.001)')

# * Learning rate schedule parameters
parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER',
                    help='LR scheduler (default: "cosine")')

parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct',
                    help='learning rate noise on/off epoch percentages')
parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT',
                    help='learning rate noise limit percent (default: 0.67)')
parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV',
                    help='learning rate noise std-dev (default: 1.0)')
parser.add_argument('--warmup-lr', type=float, default=1e-6, metavar='LR',
                    help='warmup learning rate (default: 1e-6)')
parser.add_argument('--min-lr', type=float, default=1.0e-08, metavar='LR',
                    help='lower lr bound for cyclic schedulers that hit 0 (default: 1e-8)')

parser.add_argument('--decay-epochs', type=float, default=30, metavar='N',
                    help='epoch interval to decay LR')
parser.add_argument('--warmup-epochs', type=int, default=0, metavar='N',
                    help='epochs to warmup LR, if scheduler supports')
parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N',
                    help='epochs to cooldown LR at min_lr, after cyclic schedule ends')
parser.add_argument('--patience-epochs', type=int, default=10, metavar='N',
                    help='patience epochs for Plateau LR scheduler (default: 10)')
parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE',
                    help='LR decay rate (default: 0.1)')
parser.add_argument('--decoder_type', default='LD')
parser.add_argument('--noise_rate', type=float, default=0.15)
parser.add_argument('--noise_type', default='omit_last')
# Parsed through _str2bool so that "--random_shuffle False" really yields False
# instead of the truthy string 'False'.
parser.add_argument('--random_shuffle', type=_str2bool, default=False)

# * Base params
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
                    help='start epoch')
parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
parser.add_argument('--dist-eval', action='store_true', default=False, help='Enabling distributed evaluation')
# --pin-mem / --no-pin-mem share the destination `pin_mem`, which defaults to True below.
parser.add_argument('--pin-mem', action='store_true',
                    help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
parser.add_argument('--no-pin-mem', action='store_false', dest='pin_mem',
                    help='')
parser.set_defaults(pin_mem=True)

# * Drop out params
parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
                    help='Dropout rate (default: 0.)')
parser.add_argument('--drop-path', type=float, default=0.1, metavar='PCT',
                    help='Drop path rate (default: 0.1)')

# * Mixup params
parser.add_argument('--mixup', type=float, default=0.0,
                    help='mixup alpha, mixup enabled if > 0. (default: 0.0)')
parser.add_argument('--cutmix', type=float, default=0.0,
                    help='cutmix alpha, cutmix enabled if > 0. (default: 0.0)')
parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None,
                    help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
parser.add_argument('--mixup-prob', type=float, default=1.0,
                    help='Probability of performing mixup or cutmix when either/both is enabled')
parser.add_argument('--mixup-switch-prob', type=float, default=0.5,
                    help='Probability of switching to cutmix when both mixup and cutmix enabled')
parser.add_argument('--mixup-mode', type=str, default='batch',
                    help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')

# * data process params
parser.add_argument('--input-size', default=224, type=int)
parser.add_argument('--resize', default=256, type=int)
# * visualization
parser.add_argument('--visualize', action='store_true')

# parse_known_args tolerates arguments this parser does not define (they are returned in
# `unknown`), so extra entries in sys.argv do not abort the cell.
args, unknown = parser.parse_known_args()

In [36]:
class S2T_Dataset(Dataset.Dataset):
    """Dataset yielding ``(name, frame_tensor, keypoint_tensor, text)`` for each sample.

    Frames and keypoint images are read from disk, optionally augmented (train phase only),
    resized, cropped and stacked into a float tensor.

    Side effect: every processed frame (after augmentation and resize, before cropping) is
    also written to disk next to its source, with the ``/frame/`` path segment replaced by
    ``/aug_frame/``. Paths without a ``/frame/`` segment are never written, so that a source
    image cannot be overwritten by its augmented version.

    Args:
        path: label file passed to ``utils.load_dataset_file``.
        phase: ``'train'`` enables random crop and video augmentation; any other value uses
            a centre crop and no augmentation.
        args: namespace providing ``input_size`` (crop size) and ``resize`` (resize edge).
        config: mapping with ``config['data']`` keys ``max_length``, ``img_path`` and
            ``keypoint_path``.
        seed: base seed for frame sub-sampling and augmentation.
        training_refurbish: stored on the instance; not used by the methods in this class.
        aug_rate: probability with which each individual augmentation is applied.
    """

    def __init__(self, path, phase, args, config, seed=None, training_refurbish=False, aug_rate=0.5):
        # Seed the global NumPy and `random` generators.
        self.seed = seed
        np.random.seed(self.seed)
        random.seed(self.seed)
        self.max_length = config['data']['max_length']

        # Prefixes prepended to the per-sample relative image / keypoint paths.
        self.img_path = config['data']['img_path']
        self.kps_path = config['data']['keypoint_path']

        self.args = args
        self.aug_rate = aug_rate
        self.phase = phase
        self.config = config

        self.raw_data = utils.load_dataset_file(path)
        self.training_refurbish = training_refurbish
        # Fixed key order so that integer indices map to samples.
        self.list = list(self.raw_data.keys())

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, index):
        """Return ``(name, frames, keypoints, text)`` for the sample at ``index``."""
        key = self.list[index]
        sample = self.raw_data[key]
        tgt_sample = sample['text']
        name_sample = sample['name']

        # Blur / intensity augmentations are applied to RGB frames only, not to keypoints.
        img_sample = self.load_imgs([self.img_path + x for x in sample['imgs_path']], index, include_blur=True)
        kp_sample = self.load_imgs([self.kps_path + x for x in sample["kps_path"]], index, include_blur=False)

        return name_sample, img_sample, kp_sample, tgt_sample

    @staticmethod
    def _augmented_save_path(src_path):
        """Return the '/aug_frame/' counterpart of ``src_path``, or None if it has no '/frame/' segment."""
        save_path = src_path.replace('/frame/', '/aug_frame/')
        return None if save_path == src_path else save_path

    def load_imgs(self, paths, index, include_blur=True):
        """Load, augment, resize and crop the images at ``paths``.

        Args:
            paths: image file paths; sub-sampled to at most ``self.max_length`` entries.
            index: sample index, mixed into the augmentation seeds.
            include_blur: forwarded to ``video_augmentation``.

        Returns:
            Tensor of shape ``(len(selected_paths), 3, input_size, input_size)``. Images that
            cannot be read are skipped with a warning; the readable ones are packed at the
            front and the remaining trailing entries stay zero.
        """
        data_transform = transforms.Compose([
            transforms.ToTensor(),
        ])
        to_pil = transforms.ToPILImage()

        paths = self.length_constraint(paths)
        imgs = torch.zeros(len(paths), 3, self.args.input_size, self.args.input_size)

        crop_rect, resize = self.data_augmentation(resize=(self.args.resize, self.args.resize),
                                                   crop_size=self.args.input_size,
                                                   is_train=(self.phase == 'train'), index=index)
        left, top, right, bottom = crop_rect

        # Keep the source path of every successfully loaded image alongside the image itself,
        # so that a skipped file cannot shift later frames onto the wrong output path.
        batch_image = []
        loaded_paths = []
        for img_path in paths:
            img = cv2.imread(img_path)
            if img is None:
                print(f"警告：无法加载位于 {img_path} 的图像。")
                continue  # skip this image
            batch_image.append(Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)))
            loaded_paths.append(img_path)

        if not batch_image:
            # Nothing could be read: return the all-zero tensor instead of handing an empty
            # clip to the augmentation pipeline.
            return imgs

        if self.phase == 'train':
            seq = self.video_augmentation(index, include_blur=include_blur)
            batch_image = seq(batch_image)

        unsaved = 0
        for i, (img, src_path) in enumerate(zip(batch_image, loaded_paths)):
            img = img.resize(resize)
            frame = data_transform(img)
            imgs[i] = frame[:, top:bottom, left:right]

            # Persist the resized (un-cropped) frame under the parallel '/aug_frame/' tree.
            save_path = self._augmented_save_path(src_path)
            if save_path is None:
                unsaved += 1
                continue
            os.makedirs(os.path.dirname(save_path), exist_ok=True)  # make sure the directory exists
            to_pil(frame).save(save_path)

        if unsaved:
            print(f"Warning: {unsaved} frame(s) not saved because their path has no '/frame/' "
                  f"segment (refusing to overwrite the source images).")
        return imgs

    def length_constraint(self, paths):
        """Sub-sample ``paths`` to at most ``self.max_length`` entries, preserving order.

        The global ``random`` generator is re-seeded with ``self.seed`` before sampling, so
        the chosen indices depend only on the seed and on ``len(paths)``.
        NOTE(review): with ``seed=None`` the selection is not reproducible, so frame and
        keypoint lists of one sample may be sub-sampled differently - confirm callers always
        pass a seed.
        """
        if len(paths) > self.max_length:
            random.seed(self.seed)
            keep = sorted(random.sample(range(len(paths)), k=self.max_length))
            paths = [paths[i] for i in keep]

        return paths

    def data_augmentation(self, resize=(320, 240), crop_size=224, is_train=True, index=0):
        """Choose the crop rectangle for one sample.

        Args:
            resize: ``(width, height)`` the image is resized to before cropping.
            crop_size: edge length of the square crop.
            is_train: random crop offset when True, centre crop otherwise.
            index: sample index, combined with ``self.seed`` to seed NumPy's global RNG.

        Returns:
            ``((left, top, right, bottom), resize)``.
        """
        new_seed = hash((self.seed, index)) % (2 ** 32)
        np.random.seed(new_seed)  # seed NumPy per (seed, index) pair
        slack_x = resize[0] - crop_size
        slack_y = resize[1] - crop_size
        if is_train:
            # np.random.randint(0, 0) raises ValueError; when the crop already spans the
            # whole edge the only valid offset is 0.
            left = 0 if slack_x == 0 else np.random.randint(0, slack_x)
            top = 0 if slack_y == 0 else np.random.randint(0, slack_y)
        else:
            left = slack_x // 2
            top = slack_y // 2

        return (left, top, left + crop_size, top + crop_size), resize

    def video_augmentation(self, index=0, include_blur=True):
        """Build the vidaug pipeline for one sample.

        The global ``random`` generator is seeded from ``(self.seed, index)`` before the
        pipeline is built. Each augmentation is wrapped in ``va.Sometimes`` with probability
        ``self.aug_rate``.

        Args:
            index: sample index used for seeding.
            include_blur: when True, also add the blur / intensity augmentations (used for
                RGB frames only).
        """
        new_seed = hash((self.seed, index)) % (2 ** 32)
        random.seed(new_seed)

        def sometimes(aug):
            return va.Sometimes(self.aug_rate, aug)

        # Geometric augmentations, applied to frames and keypoint images alike.
        augmentations = [
            sometimes(va.RandomRotate(30)),
            sometimes(va.RandomResize(0.2)),
            sometimes(va.RandomTranslate(x=50, y=50))
        ]

        if include_blur:
            augmentations.append(sometimes(va.GaussianBlur(sigma=2)))  # blur for video frames only
            # NOTE(review): Multiply scales pixel intensities by `value`; with value=1.0 this
            # looks like a no-op - confirm the intended factor.
            augmentations.append(sometimes(va.Multiply(value=1.0)))
            # NOTE(review): Brightness is not defined in this file; presumably it comes from
            # the star-import of the local `augmentation` module.
            augmentations.append(sometimes(Brightness(min=0.1, max=1.5)))
        seq = va.Sequential(augmentations)
        return seq

    def __str__(self):
        return f'#total {self.phase} set: {len(self.list)}.'


In [37]:
# Development aid: print vidaug's documentation for the Multiply augmenter that
# S2T_Dataset.video_augmentation uses.
help(va.Multiply)

Help on class Multiply in module vidaug.augmentors.intensity:

class Multiply(builtins.object)
 |  Multiply(value=1.0)
 |  
 |  Multiply all pixel intensities with given value.
 |  This augmenter can be used to make images lighter or darker.
 |  
 |  Args:
 |      value (float): The value with which to multiply the pixel intensities
 |      of video.
 |  
 |  Methods defined here:
 |  
 |  __call__(self, clip)
 |      Call self as a function.
 |  
 |  __init__(self, value=1.0)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



# 数据增强

In [38]:

# Augmentation pass with seed 0.
# Every split is built with phase='train' because S2T_Dataset only augments in that phase;
# indexing each item runs the loader, which writes the processed frames to disk.
_aug_kwargs = dict(config=config, args=args, phase='train', aug_rate=1, seed=0)
train_data = S2T_Dataset(path=config['data']['train_label_path'], **_aug_kwargs)
dev_data = S2T_Dataset(path=config['data']['dev_label_path'], **_aug_kwargs)
test_data = S2T_Dataset(path=config['data']['test_label_path'], **_aug_kwargs)

for _split in (train_data, dev_data, test_data):
    for i in range(len(_split)):
        data = _split[i]

# 92条数据增强10次 存放在不同位置， 然后用数据集拼接的方法拼接， 然后在随机选取大部分作为训练集，小部分作为验证集

In [39]:

# Augmentation pass with seed 1.
# Every split is built with phase='train' because S2T_Dataset only augments in that phase;
# indexing each item runs the loader, which writes the processed frames to disk.
_aug_kwargs = dict(config=config, args=args, phase='train', aug_rate=1, seed=1)
train_data = S2T_Dataset(path=config['data']['train_label_path'], **_aug_kwargs)
dev_data = S2T_Dataset(path=config['data']['dev_label_path'], **_aug_kwargs)
test_data = S2T_Dataset(path=config['data']['test_label_path'], **_aug_kwargs)

for _split in (train_data, dev_data, test_data):
    for i in range(len(_split)):
        data = _split[i]

In [40]:

# Augmentation pass with seed 2.
# Every split is built with phase='train' because S2T_Dataset only augments in that phase;
# indexing each item runs the loader, which writes the processed frames to disk.
_aug_kwargs = dict(config=config, args=args, phase='train', aug_rate=1, seed=2)
train_data = S2T_Dataset(path=config['data']['train_label_path'], **_aug_kwargs)
dev_data = S2T_Dataset(path=config['data']['dev_label_path'], **_aug_kwargs)
test_data = S2T_Dataset(path=config['data']['test_label_path'], **_aug_kwargs)

for _split in (train_data, dev_data, test_data):
    for i in range(len(_split)):
        data = _split[i]

In [41]:

# Augmentation pass with seed 3.
# Every split is built with phase='train' because S2T_Dataset only augments in that phase;
# indexing each item runs the loader, which writes the processed frames to disk.
_aug_kwargs = dict(config=config, args=args, phase='train', aug_rate=1, seed=3)
train_data = S2T_Dataset(path=config['data']['train_label_path'], **_aug_kwargs)
dev_data = S2T_Dataset(path=config['data']['dev_label_path'], **_aug_kwargs)
test_data = S2T_Dataset(path=config['data']['test_label_path'], **_aug_kwargs)

for _split in (train_data, dev_data, test_data):
    for i in range(len(_split)):
        data = _split[i]

In [42]:

# Augmentation pass with seed 4.
# Every split is built with phase='train' because S2T_Dataset only augments in that phase;
# indexing each item runs the loader, which writes the processed frames to disk.
_aug_kwargs = dict(config=config, args=args, phase='train', aug_rate=1, seed=4)
train_data = S2T_Dataset(path=config['data']['train_label_path'], **_aug_kwargs)
dev_data = S2T_Dataset(path=config['data']['dev_label_path'], **_aug_kwargs)
test_data = S2T_Dataset(path=config['data']['test_label_path'], **_aug_kwargs)

for _split in (train_data, dev_data, test_data):
    for i in range(len(_split)):
        data = _split[i]

In [43]:

# Augmentation pass with seed 5.
# Every split is built with phase='train' because S2T_Dataset only augments in that phase;
# indexing each item runs the loader, which writes the processed frames to disk.
_aug_kwargs = dict(config=config, args=args, phase='train', aug_rate=1, seed=5)
train_data = S2T_Dataset(path=config['data']['train_label_path'], **_aug_kwargs)
dev_data = S2T_Dataset(path=config['data']['dev_label_path'], **_aug_kwargs)
test_data = S2T_Dataset(path=config['data']['test_label_path'], **_aug_kwargs)

for _split in (train_data, dev_data, test_data):
    for i in range(len(_split)):
        data = _split[i]

In [44]:

# Augmentation pass with seed 6.
# Every split is built with phase='train' because S2T_Dataset only augments in that phase;
# indexing each item runs the loader, which writes the processed frames to disk.
_aug_kwargs = dict(config=config, args=args, phase='train', aug_rate=1, seed=6)
train_data = S2T_Dataset(path=config['data']['train_label_path'], **_aug_kwargs)
dev_data = S2T_Dataset(path=config['data']['dev_label_path'], **_aug_kwargs)
test_data = S2T_Dataset(path=config['data']['test_label_path'], **_aug_kwargs)

for _split in (train_data, dev_data, test_data):
    for i in range(len(_split)):
        data = _split[i]