In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import tensorflow as tf
import os
from tqdm import tqdm

2024-03-14 02:54:50.713003: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 02:54:50.713127: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 02:54:50.844891: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import csv

In [3]:
IMG_SIZE=224
test_transforms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE,IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    ]
)

In [4]:
use_cuda = torch.cuda.is_available()
device = 'cuda' if use_cuda else 'cpu'

In [5]:
PATH="/kaggle/input/efficientnet-weights-on-vgg/enet_b0_8_best_afew.pt"

In [6]:
feature_extractor_model = torch.load(PATH)
feature_extractor_model.classifier=torch.nn.Identity()
feature_extractor_model.eval()
feature_extractor_model=feature_extractor_model.to(device)

In [7]:
emotion_to_index={
    "Positive":0,
    "Neutral":1,
    "Negative":2
}

In [8]:
DATA_DIR='/kaggle/input/vgaf-aligned/VGAF_aligned'

In [9]:
def get_features(data_dir,cnt_frame,face_cnt):
    videomane2features={}
    for videoname in tqdm(os.listdir(data_dir)):
        frame_dict=os.path.join(data_dir,videoname)
        video_features=[]
        cnt=0
        for frame in os.listdir(frame_dict):
            cnt=cnt + 1
            if cnt==cnt_frame:
                break;
            face_dict=os.path.join(frame_dict,frame)
            frame_features=[]
            faces=[]
            for face in os.listdir(face_dict):
                img = Image.open(os.path.join(face_dict,face))
                img_tensor = test_transforms(img)
                faces.append(img_tensor)
                
                if img.size:
                    faces.append(img_tensor)
                    if len(faces)>=face_cnt:        
                        scores = feature_extractor_model(torch.stack(faces, dim=0).to(device))
                        scores=scores.data.cpu().numpy()

                        if len(frame_features)==0:
                            frame_features=scores
                        else:
                            frame_features=np.concatenate((frame_features,scores),axis=0)

                        faces=[]
                    
            if len(faces)>0:        
                scores = feature_extractor_model(torch.stack(faces, dim=0).to(device))
                scores=scores.data.cpu().numpy()

                if len(frame_features)==0:
                    frame_features=scores
                else:
                    frame_features=np.concatenate((frame_features,scores),axis=0)
            
            mean_features = (np.mean(frame_features, axis=0))
            std_features = (np.std(frame_features, axis=0))
#             max_features = (np.max(frame_features, axis=0))
#             min_features = (np.min(frame_features, axis=0))

            overall_frame_feature=np.concatenate((mean_features,std_features),axis=None)
            
            video_features.append(overall_frame_feature)
        
        video_features=np.array(video_features)
        mean_features = (np.mean(video_features, axis=0))
        std_features = (np.std(video_features, axis=0))
#         max_features = (np.max(video_features, axis=0))
#         min_features = (np.min(video_features, axis=0))
        overall_video_features=np.concatenate((mean_features,std_features),axis=None)
        videomane2features[videoname]=overall_video_features
    return videomane2features

In [12]:
train_dir=os.path.join(DATA_DIR,"Train_faces")
val_dir=os.path.join(DATA_DIR,"Val_faces")

In [13]:
videoname2features_train=get_features(train_dir,50,32)

100%|██████████| 2659/2659 [1:54:38<00:00,  2.59s/it]  


In [14]:
videoname2features_val=get_features(val_dir,50,32)

100%|██████████| 766/766 [45:06<00:00,  3.53s/it]  


In [15]:
train_labelfilename='/kaggle/input/vgaf-frames-faces/Train_labels.txt'
val_labelfilename='/kaggle/input/vgaf-frames-faces/Val_labels.txt'

In [16]:
def create_dataset(videoname2features,filename):
    x=[]
    y=[]
    has_face=[]
  
    with open(filename, mode='r') as file:
        labels = csv.reader(file, delimiter=' ')
        for i,row in enumerate(labels):
            if i==0:
                continue
            videoname,videolabel=row[0],int(row[1])
            
            if videoname in videoname2features.keys():
                video_features=videoname2features[videoname]
                x.append(video_features)
                has_face.append(1)
            else:
                x.append(np.zeros((5120,)))
                has_face.append(0)
            y.append(videolabel-1)
    x=np.array(x)
    y=np.array(y)
    has_face=np.array(has_face)
    print(x.shape,y.shape)
    return x,y,has_face

In [17]:
x_train, y_train, has_faces_train = create_dataset(videoname2features_train,train_labelfilename)
x_test, y_test, has_faces_test = create_dataset(videoname2features_val,val_labelfilename)

(2661, 5120) (2661,)
(766, 5120) (766,)


In [18]:
from sklearn import svm,metrics,preprocessing

In [19]:
x_train_norm=preprocessing.normalize(x_train,norm='l2')
x_test_norm=preprocessing.normalize(x_test,norm='l2')

In [20]:
clf = svm.SVC(kernel='rbf',C=1.9)
if True:    
    clf.fit(x_train_norm[has_faces_train==1], y_train[has_faces_train==1])
    y_pred = clf.predict(x_test_norm)
else:
    clf.fit(x_train[has_faces_train==1], y_train[has_faces_train==1])
    y_pred = clf.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test[has_faces_test==1], y_pred[has_faces_test==1]))
print("Complete accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6096605744125326
Complete accuracy: 0.6096605744125326


In [21]:
from sklearn.metrics import classification_report
def confusion_matrix(y_pred,y_val):
    yhat_valid = y_pred
    val_labels_onehot = tf.keras.utils.to_categorical(y_val)
    print(f'total wrong validation predictions: {np.sum(np.argmax(val_labels_onehot, axis=1) != yhat_valid)}\n\n')
    print(classification_report(np.argmax(val_labels_onehot, axis=1), yhat_valid))

In [22]:
print("Confusion matrix:\n")
confusion_matrix(y_pred[has_faces_test==1], y_test[has_faces_test==1])

Confusion matrix:

total wrong validation predictions: 299


              precision    recall  f1-score   support

           0       0.78      0.61      0.69       302
           1       0.60      0.57      0.59       280
           2       0.46      0.67      0.55       184

    accuracy                           0.61       766
   macro avg       0.62      0.62      0.61       766
weighted avg       0.64      0.61      0.62       766



In [23]:
print("Overall Confusion matrix:\n")
confusion_matrix(y_pred, y_test)

Overall Confusion matrix:

total wrong validation predictions: 299


              precision    recall  f1-score   support

           0       0.78      0.61      0.69       302
           1       0.60      0.57      0.59       280
           2       0.46      0.67      0.55       184

    accuracy                           0.61       766
   macro avg       0.62      0.62      0.61       766
weighted avg       0.64      0.61      0.62       766

