##Установка и импорт библиотек\
Для наших экспериментов мы использовали mmaction2, поскольку он предоставляет готовое решение для многих задач компьютерного зрения, включая распознавание жестов.

In [None]:
!pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113
!pip install -U openmim
!mim install mmengine
!mim install 'mmcv >= 2.0.0'
! git clone https://github.com/open-mmlab/mmaction2.git
%cd mmaction2
! pip install -v -e .

Импорт библиотек

In [1]:
# Libraries import 
import os
import cv2
import pandas as pd
from tqdm import tqdm
from glob import glob
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams.update({'font.size': 10})

In [2]:
import torch
import mmaction2

Обьявляем пути до датасета, анатации и выборок.

In [3]:
DATA_DIR = r'C:\Users\User\Documents\МФТИ\Pyton\DATA_DIR'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
TEST_DIR = os.path.join(DATA_DIR, 'test')
ANNOTATIONS_DIR = os.path.join(DATA_DIR, 'annotations.csv')

In [4]:
ann = pd.read_csv(ANNOTATIONS_DIR, sep='\t')
ann.head()

Unnamed: 0,attachment_id,text,user_id,height,width,length,train,begin,end
0,44e8d2a0-7e01-450b-90b0-beb7400d2c1e,Ё,185bd3a81d9d618518d10abebf0d17a8,1920,1080,156.0,True,36,112
1,df5b08f0-41d1-4572-889c-8b893e71069b,А,185bd3a81d9d618518d10abebf0d17a8,1920,1080,150.0,True,36,76
2,17f53df4-c467-4aff-9f48-20687b63d49a,Р,185bd3a81d9d618518d10abebf0d17a8,1920,1080,133.0,True,40,97
3,e3add916-c708-4339-ad98-7e2740be29e9,Е,185bd3a81d9d618518d10abebf0d17a8,1920,1080,144.0,True,43,107
4,bd7272ed-1850-48f1-a2a8-c8fed523dc37,Ч,185bd3a81d9d618518d10abebf0d17a8,1920,1080,96.0,True,20,70


Берем имена файлов в тренировочные и обучающие выборки

In [5]:
train_files = sorted(glob(os.path.join(TRAIN_DIR, '*')))
test_files = sorted(glob(os.path.join(TEST_DIR, '*')))

train_sample = train_files[0]
test_sample = test_files[0]

In [6]:
train_sample

'C:\\Users\\User\\Documents\\МФТИ\\Pyton\\DATA_DIR\\train\\00019bad-4c36-4cc5-a940-2f994bc4037a.mp4'

In [7]:
def visualize_frames(video_path: str, title: str, num_frames=5):
    fig, axes_list = plt.subplots(nrows=1, ncols=5, figsize=(10, 3))
    vidcap = cv2.VideoCapture(video_path)
    
    STEP = 4 # sampling step (in frames)
    for i in range(num_frames):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, (i * STEP) - 1)
        success, frame = vidcap.read()
        if success:
            axes_list[i].imshow(frame[:, :, ::-1])
            axes_list[i].axis('off')
    plt.suptitle(title,
                 x=0.05, y=1.0,
                 horizontalalignment='left',
                 fontweight='semibold',
                 fontsize='large')
    plt.show()

In [None]:
#visualize_frames(train_sample, 'Training sample')
#visualize_frames(test_sample, 'Test sample')

Создание файлов аннотаций для обучения и тестирования.\
Чтобы обучить модель, нам нужно создать файлы аннотаций с путем к видео и метками.

In [9]:
NUM_CLASSES = len(ann['text'].unique()) # Including "no-action" class
classes = {label: label_id for label, label_id in zip(ann['text'].unique(), range(NUM_CLASSES))}

ann_train = []
ann_test = []

for file in tqdm(train_files + test_files):
    video_id = file.split('\\')[-1][:-4]
    label = ann[ann['attachment_id'] == video_id]['text'].to_string(index=False)
    class_id = classes[label]
    line = file + ' ' + str(class_id) + '\n'
    if ann[ann['attachment_id'] == video_id]['train'].bool():
        ann_train.append(line)
    else:
        ann_test.append(line)

100%|███████████████████████████████████████████████████████████████████████████| 20400/20400 [03:05<00:00, 109.97it/s]


In [None]:
with open(os.path.join(DATA_DIR, 'ann_train.txt'), 'w') as train_file, open(os.path.join(DATA_DIR, 'ann_test.txt'), 'w') as test_file:
    train_file.writelines(ann_train)
    test_file.writelines(ann_test)

Создаие файл конфигурации\
Следующим шагом является создание файла конфигурации для обучения модели. Чтобы продемонстрировать использование нашего набора данных, мы решили точно настроить MViT Small, предварительно обученный на наборе данных Kinetics400. Контрольная точка взята из репозитория mmaction2.

In [None]:
%%writefile -a mvit-slovo.py

# Model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='MViT',
        arch='small',
        drop_path_rate=0.2,
        init_cfg=dict(
            type='Pretrained',
            checkpoint=
            'https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth',
            prefix='backbone.')),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'),
    cls_head=dict(
        type='MViTHead',
        in_channels=768,
        num_classes=1001,
        label_smooth_eps=0.1,
        average_clips='prob'))

In [10]:
%%writefile -a mvit-slovo.py

# Model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='MViT',
        arch='small',
        drop_path_rate=0.2,
        init_cfg=dict(
            type='Pretrained',
            checkpoint=
            'https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth',
            prefix='backbone.')),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'),
    cls_head=dict(
        type='MViTHead',
        in_channels=768,
        num_classes=1001,
        label_smooth_eps=0.1,
        average_clips='prob'))

Writing mvit-slovo.py


In [None]:
%%writefile -a mvit-slovo.py

# Specify dataset paths
dataset_type = 'VideoDataset'
data_root = '/kaggle/input/slovo/train'
data_root_val = '/kaggle/input/slovo/test'
ann_file_train = '/kaggle/working/mmaction2/ann_train.txt'
ann_file_val = '/kaggle/working/mmaction2/ann_test.txt'
ann_file_test = '/kaggle/working/mmaction2/ann_test.txt'

In [2]:
%%writefile -a mvit-slovo.py

train_pipeline = [
    dict(type='DecordInit', io_backend='disk'),
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=1,
        out_of_bound_opt='repeat_last'),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(224, 224)),
    dict(type='Flip', flip_ratio=0.5, direction='horizontal'),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
val_pipeline = [
    dict(type='DecordInit', io_backend='disk'),
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=1,
        test_mode=True,
        out_of_bound_opt='repeat_last'),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(224, 224)),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
test_pipeline = [
    dict(type='DecordInit', io_backend='disk'),
    dict(
        type='SampleFrames',
        clip_len=32,
        frame_interval=2,
        num_clips=2,
        test_mode=True,
        out_of_bound_opt='repeat_last'),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(224, 224)),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

Appending to mvit-slovo.py


In [3]:
%%writefile -a mvit-slovo.py

train_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='VideoDataset',
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=2,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='VideoDataset',
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='VideoDataset',
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

Appending to mvit-slovo.py


In [4]:
%%writefile -a mvit-slovo.py

# Training settigns
val_evaluator = dict(type='AccMetric')
test_evaluator = dict(type='AccMetric')
train_cfg = dict(
    type='EpochBasedTrainLoop', max_epochs=15, val_begin=1, val_interval=3)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
base_lr = 0.0016
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW', lr=0.0016, betas=(0.9, 0.999), weight_decay=0.05),
    paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0))
param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.1,
        by_epoch=True,
        begin=0,
        end=15,
        convert_to_iter_based=True)
]
auto_scale_lr = dict(enable=False, base_batch_size=64)
dist_params = dict(backend='nccl')
launcher = 'pytorch'
work_dir = 'work_dirs/mvit-slovo'
randomness = dict(seed=None, diff_rank_seed=False, deterministic=False)

Appending to mvit-slovo.py


Обучаем модель

In [5]:
! python tools/train.py ./mvit-slovo.py

python: can't open file 'C:\\Users\\User\\Documents\\МФТИ\\Pyton\\tools\\train.py': [Errno 2] No such file or directory


Протестируем модель

In [None]:
! python tools/test.py mvit-slovo.py work_dirs/mvit-slovo/{put_your_chekpoint_here}.pth