diff --git a/configs/recognition/tin/tin_tsm_finetune_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tin/tin_tsm_finetune_r50_1x1x8_50e_kinetics400_rgb.py index faee78b7f5..2b41c55018 100644 --- a/configs/recognition/tin/tin_tsm_finetune_r50_1x1x8_50e_kinetics400_rgb.py +++ b/configs/recognition/tin/tin_tsm_finetune_r50_1x1x8_50e_kinetics400_rgb.py @@ -3,6 +3,9 @@ '../../_base_/default_runtime.py' ] +# model settings +model = dict(cls_head=dict(is_shift=True)) + # dataset settings dataset_type = 'RawframeDataset' data_root = 'data/kinetics400/rawframes_train' diff --git a/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py index 590a8d1ac9..7726593aac 100644 --- a/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py +++ b/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py @@ -10,11 +10,11 @@ # dataset settings dataset_type = 'RawframeDataset' -data_root = 'data/sthv1/rawframes' -data_root_val = 'data/sthv1/rawframes' -ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' -ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' -ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' +data_root = 'data/sth-v1/rawframes_train/' +data_root_val = 'data/sth-v1/rawframes_val/' +ann_file_train = 'data/sth-v1/sth-v1_train_list.txt' +ann_file_val = 'data/sth-v1/sth-v1_val_list.txt' +ann_file_test = 'data/sth-v1/sth-v1_val_list.txt' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ diff --git a/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py index 04bd982d77..cf75d5f46e 100644 --- a/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py +++ b/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py @@ -10,11 +10,11 @@ # dataset settings dataset_type = 'RawframeDataset' -data_root = 'data/sthv2/rawframes' -data_root_val = 'data/sthv2/rawframes' -ann_file_train = 'data/sthv2/sthv2_train_list_rawframes.txt' -ann_file_val = 'data/sthv2/sthv2_val_list_rawframes.txt' -ann_file_test = 'data/sthv2/sthv2_val_list_rawframes.txt' +data_root = 'data/sth-v2/rawframes_train/' +data_root_val = 'data/sth-v2/rawframes_val/' +ann_file_train = 'data/sth-v2/sth-v2_train_list.txt' +ann_file_val = 'data/sth-v2/sth-v2_val_list.txt' +ann_file_test = 'data/sth-v2/sth-v2_val_list.txt' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ diff --git a/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py index c216215eb9..026dd1997f 100644 --- a/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py +++ b/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py @@ -63,7 +63,7 @@ dict(type='ToTensor', keys=['imgs']) ] data = dict( - videos_per_gpu=8, + videos_per_gpu=6, workers_per_gpu=4, train=dict( type=dataset_type, @@ -83,9 +83,6 @@ evaluation = dict( interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy']) -# dataset settings -data = dict(videos_per_gpu=6, workers_per_gpu=4) - # optimizer optimizer = dict( lr=0.0075, # this lr is used for 8 gpus diff --git a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_action_rgb.py b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_action_rgb.py index 87ecd819cc..4d6c02dbbe 100644 --- a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_action_rgb.py +++ b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_action_rgb.py @@ -12,6 +12,7 @@ model = dict( backbone=dict(pretrained='torchvision://resnet18', depth=18), cls_head=dict( + in_channels=512, num_classes=category_nums[target_cate], multi_class=True, loss_cls=dict(type='BCELossWithLogits', loss_weight=333.))) diff --git a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_attribute_rgb.py b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_attribute_rgb.py index e6cf544972..dead102b3c 100644 --- a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_attribute_rgb.py +++ b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_attribute_rgb.py @@ -12,6 +12,7 @@ model = dict( backbone=dict(pretrained='torchvision://resnet18', depth=18), cls_head=dict( + in_channels=512, num_classes=category_nums[target_cate], multi_class=True, loss_cls=dict(type='BCELossWithLogits', loss_weight=333.))) diff --git a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_concept_rgb.py b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_concept_rgb.py index f3304ab275..66fa7b6743 100644 --- a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_concept_rgb.py +++ b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_concept_rgb.py @@ -12,6 +12,7 @@ model = dict( backbone=dict(pretrained='torchvision://resnet18', depth=18), cls_head=dict( + in_channels=512, num_classes=category_nums[target_cate], multi_class=True, loss_cls=dict(type='BCELossWithLogits', loss_weight=333.))) diff --git a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_event_rgb.py b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_event_rgb.py index f612e60387..867d548161 100644 --- a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_event_rgb.py +++ b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_event_rgb.py @@ -12,6 +12,7 @@ model = dict( backbone=dict(pretrained='torchvision://resnet18', depth=18), cls_head=dict( + in_channels=512, num_classes=category_nums[target_cate], multi_class=True, loss_cls=dict(type='BCELossWithLogits', loss_weight=333.))) diff --git a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_object_rgb.py b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_object_rgb.py index f3304ab275..66fa7b6743 100644 --- a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_object_rgb.py +++ b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_object_rgb.py @@ -12,6 +12,7 @@ model = dict( backbone=dict(pretrained='torchvision://resnet18', depth=18), cls_head=dict( + in_channels=512, num_classes=category_nums[target_cate], multi_class=True, loss_cls=dict(type='BCELossWithLogits', loss_weight=333.))) diff --git a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_scene_rgb.py b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_scene_rgb.py index 3ef8948b87..b571775b39 100644 --- a/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_scene_rgb.py +++ b/configs/recognition/tsn/hvu/tsn_r18_1x1x8_100e_hvu_scene_rgb.py @@ -12,6 +12,7 @@ model = dict( backbone=dict(pretrained='torchvision://resnet18', depth=18), cls_head=dict( + in_channels=512, num_classes=category_nums[target_cate], multi_class=True, loss_cls=dict(type='BCELossWithLogits', loss_weight=333.))) diff --git a/configs/recognition_audio/resnet/tsn_r50_64x1x1_100e_kinetics400_audio.py b/configs/recognition_audio/resnet/tsn_r50_64x1x1_100e_kinetics400_audio.py index ad11f1a1d2..f4e42f6cc4 100644 --- a/configs/recognition_audio/resnet/tsn_r50_64x1x1_100e_kinetics400_audio.py +++ b/configs/recognition_audio/resnet/tsn_r50_64x1x1_100e_kinetics400_audio.py @@ -1,7 +1,5 @@ _base_ = [ - '../../_base_/models/tsn_r50_audio.py', - '../../_base_/datasets/kinetics400_64x1x1_audio.py', - '../../_base_/default_runtime.py' + '../../_base_/models/tsn_r50_audio.py', '../../_base_/default_runtime.py' ] # dataset settings