In [None]:
!wget https://storage.googleapis.com/deepmind-media/Datasets/kinetics600.tar.gz
!tar -xvf /content/kinetics600.tar.gz

In [None]:
!pip install pytube

In [None]:
# Mount Google Drive at /content/drive. Later cells copy zip archives from it
# and write model checkpoints to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import torch.optim as optim
import numpy as np
import cv2
from imutils import paths
import albumentations as album
import glob
import random
import pandas as pd

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)

cuda:0


In [None]:
# Load the Kinetics-600 training annotations extracted from the archive above.
# The download cell below reads the 'youtube_id' column of this frame.
train_set = pd.read_csv('/content/kinetics600/train.csv')
# Preview only the first rows instead of rendering the whole table.
train_set.head()

In [None]:
from pytube import YouTube

# The CSV stores only the YouTube video id; the watch URL is this prefix plus the id.
youtube_video_url = 'https://www.youtube.com/watch?v='

# Best-effort download of every listed clip into /content/random_videos.
# A failure on one video (removed, private, region-locked, ...) is reported with
# its URL and the loop moves on to the next id.
for t in train_set['youtube_id']:
    video_url = '{}{}'.format(youtube_video_url, t)

    try:
        yt_obj = YouTube(video_url)

        # Restrict to progressive mp4 streams.
        filters = yt_obj.streams.filter(progressive=True, file_extension='mp4')

        # get_highest_resolution() yields None when the filtered query is empty;
        # report that explicitly instead of failing with an AttributeError on None.
        best_stream = filters.get_highest_resolution()
        if best_stream is None:
            print('{}: no progressive mp4 stream available, skipped'.format(video_url))
            continue

        # download the highest quality video
        best_stream.download(output_path='/content/random_videos')
    except Exception as e:
        # Keep going, but say which video failed so the gap can be traced later.
        print('{}: {}'.format(video_url, e))


In [None]:
# Bundle the downloaded clips into random_videos.zip. Both paths are relative, so this
# only picks up /content/random_videos when the current directory is /content.
# NOTE(review): the next cell copies *.zip from Drive — presumably this archive is
# uploaded there separately; confirm.
!zip -r random_videos.zip random_videos

In [None]:
!cp /content/drive/MyDrive/EE381K/*.zip .
!unzip jumping_jack_videos.zip
!unzip random_videos.zip

In [None]:
# Start from torchvision's pretrained 3D ResNet-18 video classifier.
model = models.video.r3d_18(pretrained=True)

num_classes = 2 # is jumping jack or not
class_names = ['Jumping Jack', 'Other']

# Freeze every pretrained weight; only the replacement head created below is trained.
for param in model.parameters():
    param.requires_grad = False

# Swap the classification head for a fresh 2-way layer (new layers require grad by
# default). The input width is read from the existing head instead of hard-coding 512.
model.fc = nn.Linear(model.fc.in_features, num_classes)

criterion = nn.CrossEntropyLoss()
# Only the new head is trainable, so the optimizer is given just those parameters.
# If backbone layers are unfrozen later, the optimizer must be rebuilt to include them.
optimizer = optim.SGD(model.fc.parameters(), lr=0.01)
# Optional: resume from previously saved weights.
# model.load_state_dict(torch.load('/content/drive/MyDrive/Colab_Notebooks/classfier-3d-3.pt'))
model.to(device)

In [None]:
# Every frame is resized to 320x320 before being stacked into a clip.
# NOTE(review): this rebinds `transforms`, hiding the torchvision module imported
# under the same name in the import cell.
transforms = album.Compose([album.Resize(320, 320, always_apply=True)])

VIDEO_PATH = '/content/drive/MyDrive/UCF12/'
# Positive clips live in /content/jumping_jack_videos, everything else in
# /content/random_videos. sorted() makes the pre-shuffle order independent of the
# order in which the filesystem lists the files.
jumping_jacks = sorted(glob.glob(os.path.join('/content/jumping_jack_videos', '*.mp4')))
random_videos = sorted(glob.glob(os.path.join('/content/random_videos', '*.mp4')))
video_files = jumping_jacks + random_videos

# A single shuffle already gives a uniformly random order; seeding makes the
# order repeatable across kernel restarts.
random.seed(42)
random.shuffle(video_files) # randomize order for training

# Sanity check: directory name of the first file (the training loop derives the
# label from it) and the total number of clips found.
if video_files:
    print(video_files[0].split('/')[2])
print(len(video_files))

jumping_jack_videos
82


In [None]:
# Re-randomize the order of video_files in place (presumably re-run before another
# training pass). One shuffle is already a uniformly random permutation, so repeating
# it adds nothing.
random.shuffle(video_files) # randomize order for training

In [None]:
# One training pass over video_files: each video is cut into consecutive,
# non-overlapping clips of `clip_length` frames and every clip is one SGD step
# (batch size 1). Leftover frames at the end of a video are discarded.
video_frames = []
clip_length = 10

running_loss = 0.0
clips_seen = 0  # clips that contributed to running_loss since the last checkpoint

# Be explicit about the mode in case an evaluation cell switched the model to
# eval mode earlier in the session.
model.train()

# NOTE(review): frames are fed to the network as raw 0-255 floats without
# scaling/normalization — confirm this matches what the pretrained weights expect.
for i, video in enumerate(video_files):
    # The label comes from the directory that contains the file:
    # class 0 = jumping jack, class 1 = anything else.
    is_jumping_jack = os.path.basename(os.path.dirname(video)) == 'jumping_jack_videos'
    labels = torch.tensor([0 if is_jumping_jack else 1], dtype=torch.long, device=device)

    cap = cv2.VideoCapture(video)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or unreadable frame): move on to the next video.
                break

            # OpenCV decodes to BGR; convert to RGB, then resize.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = transforms(image=frame)['image']
            video_frames.append(frame)

            if len(video_frames) < clip_length:
                continue

            # Stacked frames are [num_frames, height, width, 3].
            inputs = np.array(video_frames)
            # add an extra (batch) dimension
            inputs = np.expand_dims(inputs, axis=0)
            # transpose to get [1, 3, num_frames, height, width]
            inputs = np.transpose(inputs, (0, 4, 1, 2, 3))
            # convert the frames to tensor
            inputs = torch.tensor(inputs, dtype=torch.float32)
            inputs = inputs.to(device)
            video_frames.clear()

            optimizer.zero_grad()
            # forward pass to get the predictions
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            clips_seen += 1
    finally:
        # Always free the decoder handle and drop any partial clip, even if a
        # training step raised.
        cap.release()
        video_frames.clear()

    # Checkpoint every 50 videos and report the mean per-clip loss since the
    # previous checkpoint (the loss is accumulated per clip, not per video).
    if (i + 1) % 50 == 0:
        chkp_num = (i + 1) // 50
        torch.save(model.state_dict(), '/content/drive/MyDrive/Colab_Notebooks/classfier-chkp{}.pt'.format(chkp_num))
        print('Saved model. Running loss: {}'.format(running_loss / max(clips_seen, 1)))
        running_loss = 0.0
        clips_seen = 0


In [None]:
# Persist the final weights (state_dict only, not the whole module) to Drive.
# The file name is fixed, so an existing file at this path is overwritten.
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab_Notebooks/classfier-3d-6.pt')

In [None]:
# Clip-level accuracy: each video is cut into consecutive, non-overlapping clips
# of `clip_length` frames and each clip's predicted class is compared with the
# label derived from the video's path.
# NOTE(review): this iterates the same `video_files` list the training cell uses,
# so unless the list is rebuilt in between this is not a held-out test score.
video_frames = []
clip_length = 16

num_correct = 0
num_total = 0

# NOTE(review): this rebinds the module-level `transforms`; re-running the training
# cell afterwards would pick up this 180x320 + flip pipeline. VerticalFlip(True)
# passes True as the first positional argument — presumably `always_apply`, i.e.
# every evaluated frame is flipped upside down; confirm that is intended.
transforms = album.Compose([album.Resize(180, 320, always_apply=True), album.VerticalFlip(True)])

# Directory names that mark a positive (class 0) video: the name the training cell
# checks for, plus the 'jumping_jack' name this cell originally compared against.
jumping_jack_dirs = {'jumping_jack_videos', 'jumping_jack'}

# Evaluate in eval mode so layers such as BatchNorm use their stored statistics and
# do not update them; the previous mode is restored afterwards.
was_training = model.training
model.eval()

try:
    with torch.no_grad():
        for i, video in enumerate(video_files):
            # Label from any directory component of the path instead of a fixed
            # index, which raised IndexError for /content/<dir>/<file> paths.
            is_jumping_jack = not jumping_jack_dirs.isdisjoint(video.split('/'))
            labels = torch.tensor([0 if is_jumping_jack else 1], dtype=torch.long, device=device)

            cap = cv2.VideoCapture(video)
            try:
                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        # End of stream (or unreadable frame): next video.
                        break

                    # OpenCV decodes to BGR; convert to RGB, then apply the transforms.
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = transforms(image=frame)['image']
                    video_frames.append(frame)

                    if len(video_frames) < clip_length:
                        continue

                    # Stacked frames are [num_frames, height, width, 3].
                    inputs = np.array(video_frames)
                    # add an extra (batch) dimension
                    inputs = np.expand_dims(inputs, axis=0)
                    # transpose to get [1, 3, num_frames, height, width]
                    inputs = np.transpose(inputs, (0, 4, 1, 2, 3))
                    # convert the frames to tensor
                    inputs = torch.tensor(inputs, dtype=torch.float32)
                    inputs = inputs.to(device)
                    video_frames.clear()

                    # forward pass to get the predictions
                    outputs = model(inputs)
                    result = outputs.argmax(dim=1)

                    num_correct += (result == labels).sum().item()
                    num_total += 1
            finally:
                # Always free the decoder handle and drop any partial clip.
                cap.release()
                video_frames.clear()
finally:
    model.train(was_training)

# Guard against an empty file list or videos shorter than one clip.
if num_total:
    print('testing results: {}'.format(num_correct / num_total))
else:
    print('testing results: no complete clips were evaluated')