In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd
import tqdm
import time
import cv2
import pickle
import torch

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import SVC
from scipy.stats import mode
from sklearn.decomposition import PCA


In [None]:
# Parameters provided for reaching the baseline score.
# Resolution handed to video_to_frame as `size` when frames are read.
arg_img_size = (128, 128)
# Handed to computeSIFT as `dense`: True -> DenseSIFT, False -> regular SIFT.
arg_dense_sift = True
# Number of clusters for the codebook built from the local descriptors.
args_local_cluster = 200
# Number of clusters for the codebook built from the frame features.
args_global_cluster = 200
# Number of frames requested from every video.
num_frame = 5
# Aggregation method used to turn descriptors into a feature vector.
args_aggr = "vlad" # "vlad" or "bow"
# n_components of the PCA applied when args_aggr is "vlad".
pca_vlad = 128

### Video preprocessing and frame-specific feature point extraction

In [None]:
# Root directory of the competition data.
root = "/kaggle/input/2021-ml-tp4/"

# Action labels of the train videos, as a DataFrame and as a plain array.
train_csv = pd.read_csv(os.path.join(root, "train_label.csv"))
train_csv_arr = np.asarray(train_csv)

# Catalogue of the action classes present in the data set.
classinfo = pd.read_csv(os.path.join(root, "class_info.csv"))
classinfo_arr = np.asarray(classinfo)

train_path = os.path.join(root, "train")
test_path = os.path.join(root, "test")

# Full paths of the train videos, ordered by file name.
train_list = [os.path.join(train_path, name) for name in sorted(os.listdir(train_path))]

# Full paths of the test videos, ordered by file name.
test_list = [os.path.join(test_path, name) for name in sorted(os.listdir(test_path))]

In [None]:
def video_to_frame(video_path, size, num_frame):
    """Sample evenly spaced grayscale frames from one video.

    Parameters
    ----------
    video_path : str
        Path of the video to read.
    size : tuple
        Target resolution handed to ``cv2.resize`` for every kept frame.
    num_frame : int
        Number of frame indices to sample between the first and last frame.

    Returns
    -------
    numpy.ndarray
        The kept frames, resized and converted to grayscale, stacked along
        axis 0. Fewer than ``num_frame`` frames are returned when the sampled
        indices collide (very short video) or when decoding stops early.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        sel_ind = np.linspace(0, total_frame - 1, num_frame).astype("int")
        # Set membership is cheaper than scanning the index array per frame.
        wanted = set(sel_ind.tolist())

        for i in range(total_frame):
            if i in wanted:
                # Selected index: decode the frame and keep it.
                res, frame = cap.read()
                if not res or frame is None:
                    # The reported frame count is only an estimate for some
                    # containers; stop instead of crashing in cv2.resize.
                    break
                # Resize to the requested resolution and convert to grayscale.
                frame = cv2.resize(frame, size, interpolation=cv2.INTER_CUBIC)
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                frames.append(frame)
            else:
                # Not selected: advance the stream without decoding.
                cap.grab()
    finally:
        # Release the capture even when resizing/conversion raises.
        cap.release()

    return np.asarray(frames)

In [None]:
def computeSIFT(data, dense=False):
    """Extract SIFT or DenseSIFT descriptors (visual words) from every frame.

    Parameters
    ----------
    data : sequence of 2-D arrays
        Frames read from one video.
    dense : bool
        True computes descriptors on a regular keypoint grid (DenseSIFT);
        False lets SIFT detect its own keypoints.

    Returns
    -------
    dict
        Maps frame index -> descriptor array returned by OpenCV for that
        frame (``x[0]`` holds the descriptors of the first frame). The value
        is whatever OpenCV returns, which may be None when nothing is found.
    """
    # One extractor is enough for all frames.
    sift = cv2.SIFT_create()
    step_size = 8

    x = {}
    for i in range(0, len(data)):
        img = data[i]
        if dense:
            height, width = img.shape[:2]
            # KeyPoint takes (x, y) = (column, row): x must span the width and
            # y the height, otherwise non-square frames get keypoints outside
            # the image. The x-outer / y-inner order is kept so square frames
            # yield their descriptors in the same order as before.
            kp = [cv2.KeyPoint(float(px), float(py), float(step_size))
                  for px in range(0, width, step_size)
                  for py in range(0, height, step_size)]
            # DenseSIFT only describes the given keypoints; no detection step.
            kp, desc = sift.compute(img, kp)
        else:
            kp, desc = sift.detectAndCompute(img, None)
        x[i] = desc

    return x

In [None]:
# Sample frames from every train video and extract their visual words.
# train_local_desc[video path] holds the per-frame descriptors of that video.
train_local_desc = {}
train_bar = tqdm.tqdm(train_list, desc="Extract {} in train data".format("dsift" if arg_dense_sift else "sift"))
for vid_path in train_bar:
    curr_frame = video_to_frame(vid_path, arg_img_size, num_frame)
    train_local_desc[vid_path] = computeSIFT(curr_frame, arg_dense_sift)

# Same for every test video.
# test_local_desc[video path] holds the per-frame descriptors of that video.
test_local_desc = {}
test_bar = tqdm.tqdm(test_list, desc="Extract {} in test data".format("dsift" if arg_dense_sift else "sift"))
for vid_path in test_bar:
    curr_frame = video_to_frame(vid_path, arg_img_size, num_frame)
    test_local_desc[vid_path] = computeSIFT(curr_frame, arg_dense_sift)


In [None]:
def _flatten_local_desc(local_desc):
    """Stack the per-frame descriptors of every video into one array.

    Parameters
    ----------
    local_desc : dict
        Maps video path -> {frame index -> descriptor array or None}.

    Returns
    -------
    desc_total : numpy.ndarray
        All descriptors concatenated along axis 0, in iteration order
        (presumably [n, 128] for SIFT descriptors).
    frame_total : numpy.ndarray
        For every row of ``desc_total``, the "<video path>, <frame index>"
        key of the frame it came from.
    local_info : dict
        Maps "<video path>, <frame index>" -> number of descriptors of that
        frame (0 when the frame has no descriptors).
    """
    desc_chunks = []
    frame_keys = []
    local_info = {}
    for vid_path, frame_desc in local_desc.items():
        for frame_idx, desc in frame_desc.items():
            key = vid_path + ", " + str(frame_idx)
            n_desc = 0
            if desc is not None:
                n_desc = len(desc)
                desc_chunks.append(desc)
                frame_keys.extend([key] * n_desc)
            local_info[key] = n_desc
    # One concatenate instead of converting a huge list of single rows.
    if desc_chunks:
        desc_total = np.concatenate(desc_chunks, axis=0)
    else:
        desc_total = np.asarray(desc_chunks)
    return desc_total, np.asarray(frame_keys), local_info


print("\n\nAggregate SIFT descriptor")
start = time.time()

# Collect the visual words of all train videos into one array. The row-aligned
# train_frame_total tells which frame of which video each visual word came from,
# and train_local_info gives the number of visual words per frame.
train_local_desc_total, train_frame_total, train_local_info = _flatten_local_desc(train_local_desc)

# Same for the test videos.
test_local_desc_total, test_frame_total, test_local_info = _flatten_local_desc(test_local_desc)

print("\t{:3.2f}s\n\n".format(time.time()-start))

### Create Frame Feature for Video Feature

In [None]:
def clustering(train_desc, test_desc=None, n_clusters = 200):
    """Pick representative feature points (a codebook) with MiniBatchKMeans.

    Parameters
    ----------
    train_desc : array-like
        Descriptors the codebook is fitted on.
    test_desc : array-like or None
        Descriptors that are only assigned to the fitted codebook.
    n_clusters : int
        Number of codebook entries.

    Returns
    -------
    train_pred : cluster index assigned to every row of ``train_desc``
    test_pred : cluster index assigned to every row of ``test_desc``,
        or None when ``test_desc`` is None
    clusters : the fitted cluster centres (the codebook)
    kmeans : the fitted MiniBatchKMeans instance
    """
    # Fit the codebook on the train descriptors only.
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(train_desc)

    # Assign every descriptor to its nearest codebook entry.
    train_pred = kmeans.predict(train_desc)
    test_pred = None if test_desc is None else kmeans.predict(test_desc)

    return train_pred, test_pred, kmeans.cluster_centers_, kmeans

In [None]:
# Build the local codebook from the visual words of all train frames, then
# assign every train and every test visual word to its codebook entry.
# Outputs: per-descriptor cluster indices for train and test, the codebook
# itself, and the fitted k-means instance.
train_local_alloc, test_local_alloc, local_codebook, local_kmeans = clustering(train_local_desc_total, test_local_desc_total, args_local_cluster)

In [None]:
def VLAD(X, alloc, centers):
    """Describe one set of descriptors as a VLAD feature.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Descriptors (one per row).
    alloc : numpy.ndarray
        Codebook index assigned to every row of ``X``.
    centers : numpy.ndarray, 2-D
        The codebook (one centre per row).

    Returns
    -------
    numpy.ndarray
        Flattened residual sums per centre, signed-square-root normalised and
        then L2 normalised (left as is when the norm is zero).
    """
    _, dim = X.shape
    n_centers = centers.shape[0]

    # One residual accumulator row per codebook entry.
    residuals = np.zeros([n_centers, dim])
    for idx in range(n_centers):
        members = alloc == idx
        if np.sum(members) > 0:
            # Sum of (descriptor - centre) over the descriptors assigned here.
            residuals[idx] = np.sum(X[members, :] - centers[idx], axis=0)

    flat = residuals.flatten()
    # Signed square-root (power) normalisation.
    flat = np.sign(flat) * np.sqrt(np.abs(flat))
    # L2 normalisation, skipped for an all-zero vector.
    norm = np.sqrt(np.dot(flat, flat))
    if norm != 0:
        flat = flat / norm
    return flat


def BoW(alloc, n_cluster):
    """Describe one set of descriptors as a bag-of-words histogram.

    Parameters
    ----------
    alloc : array-like
        Codebook index assigned to every descriptor.
    n_cluster : int
        Number of codebook entries (= number of histogram bins).

    Returns
    -------
    numpy.ndarray
        Count of descriptors per codebook entry.
    """
    # Integer bin edges 0, 1, ..., n_cluster give one bin per codebook entry.
    edges = range(n_cluster + 1)
    counts, _edges = np.histogram(alloc, bins=edges)
    return counts

In [None]:
def _describe_frames(local_info, desc_total, alloc, codebook, n_cluster, aggr):
    """Build one image feature (BoW or VLAD) per frame.

    Parameters
    ----------
    local_info : dict
        Maps frame key -> number of descriptors of that frame, in the same
        order in which the rows of ``desc_total`` / ``alloc`` were collected.
    desc_total : numpy.ndarray
        All descriptors stacked along axis 0.
    alloc : numpy.ndarray
        Codebook index of every row of ``desc_total``.
    codebook : numpy.ndarray
        Cluster centres, used by VLAD.
    n_cluster : int
        Number of codebook entries, used by BoW.
    aggr : str
        "bow" or "vlad".

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The frame features and, row-aligned, the key of each described frame.
        Frames without descriptors are skipped.

    Raises
    ------
    ValueError
        If ``aggr`` is neither "bow" nor "vlad".
    """
    if aggr not in ("bow", "vlad"):
        raise ValueError('aggregation must be "bow" or "vlad", got {!r}'.format(aggr))

    frame_desc = []
    frame_keys = []
    offset = 0
    for key, n_desc in local_info.items():
        if n_desc == 0:
            # No descriptors were extracted for this frame.
            continue
        rows = slice(offset, offset + n_desc)
        if aggr == "bow":
            feat = BoW(alloc[rows], n_cluster)
        else:
            feat = VLAD(desc_total[rows], alloc[rows], codebook)
        offset += n_desc
        frame_desc.append(feat)
        frame_keys.append(key)
    return np.asarray(frame_desc), np.asarray(frame_keys)


print("\n\nAllocate center & Descript local histogram")
start = time.time()

# Image feature of every train frame -> train_global_desc
# Key of every image feature (video path, frame index) -> train_global_desc_key
train_global_desc, train_global_desc_key = _describe_frames(
    train_local_info, train_local_desc_total, train_local_alloc,
    local_codebook, args_local_cluster, args_aggr)

# Image feature of every test frame -> test_global_desc
# Key of every image feature (video path, frame index) -> test_global_desc_key
test_global_desc, test_global_desc_key = _describe_frames(
    test_local_info, test_local_desc_total, test_local_alloc,
    local_codebook, args_local_cluster, args_aggr)

print("\t{:3.2f}s\n\n".format(time.time()-start))


In [None]:
# VLAD features are high dimensional and can exhaust memory, so they are
# projected to fewer dimensions with a PCA fitted on the train features.
if args_aggr == "vlad":
    print("\n\nReduce dim of descriptor of the frames with PCA")
    start = time.time()
    pca = PCA(n_components=pca_vlad, random_state=0)
    pca.fit(train_global_desc)
    train_global_desc, test_global_desc = (
        pca.transform(train_global_desc),
        pca.transform(test_global_desc),
    )
    print("\t{:3.2f}s\n\n".format(time.time()-start))

In [None]:
print("\n\nProcessing label")
start = time.time()

# Lookup tables built once: video id -> class name (columns 0 and 1 of the
# train label table) and class name -> label index (columns 1 and 0 of the
# class table). A missing id or class now raises KeyError instead of silently
# producing a ragged / misaligned label array.
video_to_class = {int(row[0]): row[1] for row in train_csv_arr}
class_to_index = {row[1]: int(row[0]) for row in classinfo_arr}

# Video id of every train frame, parsed from the file name in its key.
train_global_id = np.array([int(i.split("/")[-1].split(".")[0]) for i in train_global_desc_key])
# Label index of every train frame.
train_global_label = np.array(
    [class_to_index[video_to_class[int(fid)]] for fid in train_global_id], dtype="int")

# Video id of every test frame.
test_global_id = np.array([int(i.split("/")[-1].split(".")[0]) for i in test_global_desc_key])

print("\t{:3.2f}s\n\n".format(time.time()-start))

In [None]:
def saveFile(predict, predict_id, name, best_params=None):
    """Write predictions to ``<name>.csv`` and optionally pickle an object.

    Parameters
    ----------
    predict : numpy.ndarray
        Behaviour prediction for every test video.
    predict_id : numpy.ndarray
        Id of every test video, row-aligned with ``predict``.
    name : str
        Output file name without extension.
    best_params : object, optional
        When truthy, it is pickled (protocol 2) to ``<name>.pickle``.
    """
    # Two string columns: Id, Category.
    data = np.column_stack((predict_id.astype("str"), predict.astype("str")))
    csv = pd.DataFrame(data, columns=['Id', 'Category'])
    csv.to_csv(name + ".csv", index=False)

    if best_params:
        # Context manager closes the file handle even if pickling fails.
        with open(name + ".pickle", "wb") as f:
            pickle.dump(best_params, f, 2)

### Create and classify video features by averaging the features obtained from all frames of the video

In [None]:
print("\n\nSVM global averaging in frame")
start = time.time()

# The frame features of one video are stored contiguously, so a change in the
# per-frame video id marks the start of the next video. Grouping on these
# boundaries (instead of assuming exactly 5 frames per video) keeps features
# and labels aligned even when a frame was dropped for having no descriptors.
train_bounds = np.flatnonzero(np.diff(train_global_id)) + 1
test_bounds = np.flatnonzero(np.diff(test_global_id)) + 1

# Video feature = mean of the frame features of that video.
train_video_feature = np.array(
    [chunk.mean(axis=0) for chunk in np.split(train_global_desc, train_bounds)])
# Every frame of a video carries the same label; take the first one.
train_video_label = train_global_label[np.concatenate(([0], train_bounds))]

test_video_feature = np.array(
    [chunk.mean(axis=0) for chunk in np.split(test_global_desc, test_bounds)])

# The submission below numbers the test videos 0..len(test_list)-1, so every
# test video must have produced a feature.
if len(test_video_feature) != len(test_list):
    raise ValueError("expected one feature per test video ({}), got {}".format(
        len(test_list), len(test_video_feature)))

# Declare the model and train it on the averaged video features.
clf = SVC(random_state = 0, class_weight = 'balanced')
clf.fit(train_video_feature, train_video_label)
# Predict the behaviour of every test video.
svm_predict = clf.predict(test_video_feature)

# baseline(bow) : 0.20990
# random_state = 0 // kaggle : 0.17227
# random_state = 0, C = 1 // kaggle: 0.17227
# random_state = 0, C = 10, 100 // kaggle: 0.16831
# random_state = 0, class_weight = 'balanced' // kaggle: 0.20594

# baseline(vlad) : 0.28118
# random_state = 0, class_weight = 'balanced' // kaggle: 0.28118

saveFile(classinfo_arr[svm_predict][:,1], np.arange(len(test_list)), "svm_global_averaging")
print("\t{:3.2f}s\n\n".format(time.time()-start))

### Classify every frame feature of a video, then select the most frequently predicted behavior (majority vote)

In [None]:
print("\n\nSVM global voting in frame")
start = time.time()

# Declare the model and train it on the individual frame features.
clf = SVC(random_state = 0, class_weight = 'balanced')
clf.fit(train_global_desc, train_global_label)
# Predict a behaviour for every test frame.
y_pred = clf.predict(test_global_desc)

# The frames of one video are stored contiguously, so a change in the
# per-frame video id marks the start of the next video. Grouping on these
# boundaries (instead of reshaping to exactly 5 frames per video) still works
# when a frame was dropped for having no descriptors.
test_bounds = np.flatnonzero(np.diff(test_global_id)) + 1

votes = []
for frame_votes in np.split(y_pred, test_bounds):
    # Most frequent label of the video; np.unique returns sorted labels and
    # argmax takes the first maximum, so ties go to the smallest label.
    labels, counts = np.unique(frame_votes, return_counts=True)
    votes.append(labels[np.argmax(counts)])
svm_predict = np.asarray(votes)

# The submission below numbers the test videos 0..len(test_list)-1, so every
# test video must have produced a prediction.
if len(svm_predict) != len(test_list):
    raise ValueError("expected one prediction per test video ({}), got {}".format(
        len(test_list), len(svm_predict)))

# baseline(bow) : 0.20990
# random_state = 0, class_weight = 'balanced' // kaggle: 0.20990

# baseline(vlad) : 0.28118
# random_state = 0, class_weight = 'balanced' // kaggle: 0.26732

saveFile(classinfo_arr[svm_predict][:,1], np.arange(len(test_list)), "svm_global_voting")
print("\t{:3.2f}s\n\n".format(time.time()-start))

### Build a codebook of representative frame features, then describe each video feature with the BoW or VLAD method

In [None]:
# Build a second codebook from the train frame features and assign every
# train and test frame feature to one of its args_global_cluster entries.
train_global_alloc, test_global_alloc, global_codebook, global_kmeans = clustering(train_global_desc, test_global_desc, args_global_cluster)

In [None]:
def _describe_videos(vid_names, vid_names_u, frame_desc, frame_alloc, codebook, n_cluster, aggr):
    """Build one video feature (BoW or VLAD) from the frame features of each video.

    Parameters
    ----------
    vid_names : numpy.ndarray
        Video name of every frame feature, row-aligned with ``frame_desc``.
    vid_names_u : iterable
        The distinct video names to describe, in output order.
    frame_desc : numpy.ndarray
        Frame features, one per row.
    frame_alloc : numpy.ndarray
        Codebook index of every frame feature.
    codebook : numpy.ndarray
        Cluster centres, used by VLAD.
    n_cluster : int
        Number of codebook entries, used by BoW.
    aggr : str
        "bow" or "vlad".

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The video features and, row-aligned, the name of each video.

    Raises
    ------
    ValueError
        If ``aggr`` is neither "bow" nor "vlad".
    """
    if aggr not in ("bow", "vlad"):
        raise ValueError('aggregation must be "bow" or "vlad", got {!r}'.format(aggr))

    video_desc = []
    video_keys = []
    for vid_name in vid_names_u:
        # Rows of the frame features that belong to this video.
        cind = np.where(vid_name == vid_names)[0]
        if aggr == "bow":
            feat = BoW(frame_alloc[cind], n_cluster)
        else:
            feat = VLAD(frame_desc[cind], frame_alloc[cind], codebook)
        video_desc.append(feat)
        video_keys.append(vid_name)
    return np.asarray(video_desc), np.asarray(video_keys)


print("\n\nAllocate center & Descript global histogram")
start = time.time()

# Describe every train video by aggregating (BoW or VLAD once more) the image
# features of its frames. Videos are visited in np.unique (sorted name) order.
train_vid_names = np.asarray([i.split(", ")[0] for i in train_global_desc_key])
train_vid_names_u = np.unique(train_vid_names)
train_video_desc, train_video_desc_key = _describe_videos(
    train_vid_names, train_vid_names_u, train_global_desc, train_global_alloc,
    global_codebook, args_global_cluster, args_aggr)

# Same for every test video.
test_vid_names = np.asarray([i.split(", ")[0] for i in test_global_desc_key])
test_vid_names_u = np.unique(test_vid_names)
test_video_desc, test_video_desc_key = _describe_videos(
    test_vid_names, test_vid_names_u, test_global_desc, test_global_alloc,
    global_codebook, args_global_cluster, args_aggr)

print("\t{:3.2f}s\n\n".format(time.time()-start))


In [None]:
print("\n\nProcessing label")
start = time.time()

# Lookup tables built once: video id -> class name (columns 0 and 1 of the
# train label table) and class name -> label index (columns 1 and 0 of the
# class table). A missing id or class now raises KeyError instead of silently
# producing a ragged / misaligned label array.
video_to_class = {int(row[0]): row[1] for row in train_csv_arr}
class_to_index = {row[1]: int(row[0]) for row in classinfo_arr}

# Video id of every train video, parsed from its file name.
train_video_id = np.array([int(i.split("/")[-1].split(".")[0]) for i in train_video_desc_key])
# Label index of every train video.
train_video_label = np.array(
    [class_to_index[video_to_class[int(fid)]] for fid in train_video_id], dtype="int")

# Video id of every test video.
test_video_id = np.array([int(i.split("/")[-1].split(".")[0]) for i in test_video_desc_key])

print("\t{:3.2f}s\n\n".format(time.time()-start))

In [None]:
# Describing the video feature with VLAD a second time on top of the image
# features yields very high-dimensional vectors that can exhaust memory, so
# they are projected to fewer dimensions with a PCA fitted on the train features.
if args_aggr == "vlad":
    print("\n\nReduce dim of descriptor of the frames with PCA")
    start = time.time()
    pca = PCA(n_components=pca_vlad, random_state=0)
    pca.fit(train_video_desc)
    train_video_desc, test_video_desc = (
        pca.transform(train_video_desc),
        pca.transform(test_video_desc),
    )
    print("\t{:3.2f}s\n\n".format(time.time()-start))


In [None]:
print("\n\nSVM video descriptor")
start = time.time()

# NOTE(review): the logged results below mention class_weight = 'balanced',
# but this classifier does not set it - confirm which setting is intended.
clf = SVC(random_state = 0)
clf.fit(train_video_desc, train_video_label)
# test_video_desc is already row-aligned with test_video_id (both follow
# test_video_desc_key), so it is predicted as is. Indexing its rows with the
# video ids would pair predictions with the wrong Id whenever the ids differ
# from the row positions.
svm_predict = clf.predict(test_video_desc)

# baseline(bow) : 0.20990
# random_state = 0, class_weight = 'balanced' // kaggle: 0.07128

# baseline(vlad) : 0.28118
# random_state = 0, class_weight = 'balanced' // kaggle: 0.07524

saveFile(classinfo_arr[svm_predict][:,1], test_video_id, "svm_video")
print("\t{:3.2f}s\n\n".format(time.time()-start))