##Установка и импорт библиотек\
Для наших экспериментов мы использовали mmaction2, поскольку он предоставляет готовое решение для многих задач компьютерного зрения, включая распознавание жестов.

In [None]:
!pip install torch==1.12.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu113
!pip install -U openmim
!mim install mmengine
!mim install 'mmcv >= 2.0.0'
! git clone https://github.com/open-mmlab/mmaction2.git
%cd mmaction2
! pip install -v -e .

In [1]:
# Libraries import 
import os
import cv2
import pandas as pd
from tqdm import tqdm
from glob import glob
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams.update({'font.size': 10})

In [2]:
import torch
import mmaction2

In [3]:
DATA_DIR = r'C:\Users\User\Documents\МФТИ\Pyton\DATA_DIR'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
TEST_DIR = os.path.join(DATA_DIR, 'test')
ANNOTATIONS_DIR = os.path.join(DATA_DIR, 'annotations.csv')

In [4]:
ann = pd.read_csv(ANNOTATIONS_DIR, sep='\t')
ann.head()

Unnamed: 0,attachment_id,text,user_id,height,width,length,train,begin,end
0,44e8d2a0-7e01-450b-90b0-beb7400d2c1e,Ё,185bd3a81d9d618518d10abebf0d17a8,1920,1080,156.0,True,36,112
1,df5b08f0-41d1-4572-889c-8b893e71069b,А,185bd3a81d9d618518d10abebf0d17a8,1920,1080,150.0,True,36,76
2,17f53df4-c467-4aff-9f48-20687b63d49a,Р,185bd3a81d9d618518d10abebf0d17a8,1920,1080,133.0,True,40,97
3,e3add916-c708-4339-ad98-7e2740be29e9,Е,185bd3a81d9d618518d10abebf0d17a8,1920,1080,144.0,True,43,107
4,bd7272ed-1850-48f1-a2a8-c8fed523dc37,Ч,185bd3a81d9d618518d10abebf0d17a8,1920,1080,96.0,True,20,70


In [5]:
train_files = sorted(glob(os.path.join(TRAIN_DIR, '*')))
test_files = sorted(glob(os.path.join(TEST_DIR, '*')))

train_sample = train_files[0]
test_sample = test_files[0]

In [6]:
train_sample

'C:\\Users\\User\\Documents\\МФТИ\\Pyton\\DATA_DIR\\train\\00019bad-4c36-4cc5-a940-2f994bc4037a.mp4'

In [7]:
def visualize_frames(video_path: str, title: str, num_frames=5):
    fig, axes_list = plt.subplots(nrows=1, ncols=5, figsize=(10, 3))
    vidcap = cv2.VideoCapture(video_path)
    
    STEP = 4 # sampling step (in frames)
    for i in range(num_frames):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, (i * STEP) - 1)
        success, frame = vidcap.read()
        if success:
            axes_list[i].imshow(frame[:, :, ::-1])
            axes_list[i].axis('off')
    plt.suptitle(title,
                 x=0.05, y=1.0,
                 horizontalalignment='left',
                 fontweight='semibold',
                 fontsize='large')
    plt.show()

In [None]:
#visualize_frames(train_sample, 'Training sample')
#visualize_frames(test_sample, 'Test sample')

**Создавайте файлы аннотаций для обучения и тестирования.**\
Чтобы обучить модель, нам нужно создать файлы аннотаций с путем к видео и метками.

In [8]:
ann.head()

Unnamed: 0,attachment_id,text,user_id,height,width,length,train,begin,end
0,44e8d2a0-7e01-450b-90b0-beb7400d2c1e,Ё,185bd3a81d9d618518d10abebf0d17a8,1920,1080,156.0,True,36,112
1,df5b08f0-41d1-4572-889c-8b893e71069b,А,185bd3a81d9d618518d10abebf0d17a8,1920,1080,150.0,True,36,76
2,17f53df4-c467-4aff-9f48-20687b63d49a,Р,185bd3a81d9d618518d10abebf0d17a8,1920,1080,133.0,True,40,97
3,e3add916-c708-4339-ad98-7e2740be29e9,Е,185bd3a81d9d618518d10abebf0d17a8,1920,1080,144.0,True,43,107
4,bd7272ed-1850-48f1-a2a8-c8fed523dc37,Ч,185bd3a81d9d618518d10abebf0d17a8,1920,1080,96.0,True,20,70


In [9]:
NUM_CLASSES = len(ann['text'].unique()) # Including "no-action" class
classes = {label: label_id for label, label_id in zip(ann['text'].unique(), range(NUM_CLASSES))}

ann_train = []
ann_test = []

for file in tqdm(train_files + test_files):
    video_id = file.split('\\')[-1][:-4]
    label = ann[ann['attachment_id'] == video_id]['text'].to_string(index=False)
    class_id = classes[label]
    line = file + ' ' + str(class_id) + '\n'
    if ann[ann['attachment_id'] == video_id]['train'].bool():
        ann_train.append(line)
    else:
        ann_test.append(line)

100%|███████████████████████████████████████████████████████████████████████████| 20400/20400 [03:05<00:00, 109.97it/s]


In [None]:
with open(os.path.join(DATA_DIR, 'ann_train.txt'), 'w') as train_file, open(os.path.join(DATA_DIR, 'ann_test.txt'), 'w') as test_file:
    train_file.writelines(ann_train)
    test_file.writelines(ann_test)

In [None]:
%%writefile -a mvit-slovo.py

# Model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='MViT',
        arch='small',
        drop_path_rate=0.2,
        init_cfg=dict(
            type='Pretrained',
            checkpoint=
            'https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth',
            prefix='backbone.')),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'),
    cls_head=dict(
        type='MViTHead',
        in_channels=768,
        num_classes=1001,
        label_smooth_eps=0.1,
        average_clips='prob'))

In [10]:
%%writefile -a mvit-slovo.py

# Model settings
model = dict(
    type='Recognizer3D',
    backbone=dict(
        type='MViT',
        arch='small',
        drop_path_rate=0.2,
        init_cfg=dict(
            type='Pretrained',
            checkpoint=
            'https://download.openmmlab.com/mmaction/v1.0/recognition/mvit/converted/mvit-small-p244_16x4x1_kinetics400-rgb_20221021-9ebaaeed.pth',
            prefix='backbone.')),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'),
    cls_head=dict(
        type='MViTHead',
        in_channels=768,
        num_classes=1001,
        label_smooth_eps=0.1,
        average_clips='prob'))

Writing mvit-slovo.py


In [None]:
%%writefile -a mvit-slovo.py

# Specify dataset paths
dataset_type = 'VideoDataset'
data_root = '/kaggle/input/slovo/train'
data_root_val = '/kaggle/input/slovo/test'
ann_file_train = '/kaggle/working/mmaction2/ann_train.txt'
ann_file_val = '/kaggle/working/mmaction2/ann_test.txt'
ann_file_test = '/kaggle/working/mmaction2/ann_test.txt'