## Imports

In [1]:
import pandas as pd
import torch
import torch
import torch.nn as nn
from torchvision import models
import os
from utils import process_video_frames_tchw, count_predictions
import numpy as np

# Use the first CUDA device when one is visible to PyTorch; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

#### Directory Setup

In [2]:
# Root directory of the dataset; the videos are read from its `video/` subdirectory.
ROOT_DIR = '../../data/BiteCount'
# Per-fold test annotation CSVs are read from here and per-fold result CSVs
# are written back to the same directory.
FOLDS_DIR = './folds/'
VIDEO_DIR = os.path.join(ROOT_DIR, 'video')

# Directory holding the per-fold checkpoints (`best_resnet152_fold<k>.pth`).
# The default is a machine-specific absolute path, so it can be overridden
# with the BITECOUNT_CHECKPOINT_DIR environment variable instead of editing
# the notebook; when the variable is unset the original path is used.
CHECKPOINT_DIR = os.environ.get('BITECOUNT_CHECKPOINT_DIR',
                                '/media/sadat/sadat/resnet_benchmarks')

#### Action Trigger Module Settings

In [3]:
# Tunables forwarded unchanged to `count_predictions` for every video.
ENTER_THRESHOLD, EXIT_THRESHOLD = 0.78, 0.4
MOMENTUM = 0.4

# Number of slices along the first dimension of the video tensor
# (presumably frames) sent through the network per forward pass.
BATCH_SIZE = 64

#### Model Initialization

In [4]:
# Build the ResNet-152 backbone with a single-output head (no optimizer is
# created here: this notebook only runs inference).
#
# ImageNet weights are deliberately NOT requested. The cross-validation cell
# calls `model.load_state_dict(...)` (strict by default) for every fold before
# any prediction, which overwrites every parameter and buffer. The former
# `pretrained=True` (a deprecated argument in recent torchvision releases)
# therefore only triggered a large download that was immediately discarded.
model = models.resnet152()
num_ftrs = model.fc.in_features
# Swap the stock classification head for one producing a single logit per input.
model.fc = nn.Linear(num_ftrs, 1)
model = model.to(device)



### Helper Function

In [5]:
def predict_video(row):
    """Run the model over one video and score the result.

    Returns prediction, mae, and obo (i.e. whatever ``count_predictions``
    returns for this video).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``'name'``
            (video file name, joined onto ``VIDEO_DIR``) and ``'count'``
            (ground-truth value passed through to ``count_predictions``).

    Returns:
        The return value of ``count_predictions`` called with the flat array
        of per-slice sigmoid scores, the ground truth and the module-level
        ``ENTER_THRESHOLD`` / ``EXIT_THRESHOLD`` / ``MOMENTUM`` settings.
    """
    video_path = os.path.join(VIDEO_DIR, row['name'])
    ground_truth = row['count']

    # Leave the full video wherever the loader put it and move only one batch
    # at a time to `device`, so device memory use is bounded by BATCH_SIZE
    # rather than by the video length.
    frames = process_video_frames_tchw(video_path)
    num_frames = len(frames)

    scores = []
    model.eval()
    with torch.no_grad():
        # Stop at `num_frames` (not `num_frames + 1`): the extra step produced
        # an empty trailing batch whenever the length was a multiple of
        # BATCH_SIZE.
        for start in range(0, num_frames, BATCH_SIZE):
            # `.to(device)` honours the CPU fallback; a hard-coded `.cuda()`
            # would fail on machines without a GPU.
            batch = frames[start:start + BATCH_SIZE].to(device)
            scores.append(torch.sigmoid(model(batch)).cpu().numpy())

    if scores:
        # reshape(-1) always yields a 1-D array; squeeze() would collapse a
        # one-element result to a 0-d scalar array.
        Y = np.concatenate(scores, axis=0).reshape(-1)
    else:
        # No frames were produced for this video: hand an empty score array to
        # the scorer instead of failing inside np.concatenate.
        Y = np.empty(0, dtype=np.float32)

    return count_predictions(Y,
                             ground_truth,
                             ENTER_THRESHOLD,
                             EXIT_THRESHOLD,
                             MOMENTUM)

### Cross Validation

In [6]:
# Per-fold mean metrics; re-initialised here so re-running the cell does not
# append to results from a previous run.
oboas = []
maes = []

for fold in range(1, 11):
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f'best_resnet152_fold{fold}.pth')
    # map_location remaps the stored tensors onto the active device, so
    # checkpoints saved from a GPU still load when `device` is the CPU.
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source (consider weights_only=True on PyTorch versions that
    # support it).
    state_dict = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(state_dict)

    test_annotation_file = os.path.join(FOLDS_DIR, f'test_fold_{fold}.csv')
    df = pd.read_csv(test_annotation_file)
    # predict_video returns three values per video; wrapping them in a Series
    # lets apply() expand them into the three result columns.
    df[['prediction', 'mae', 'obo']] = df.apply(lambda row: pd.Series(predict_video(row)), axis=1)
    oboas.append(df['obo'].mean())
    maes.append(df['mae'].mean())

    # Persist the per-video results next to the fold's annotation file.
    save_file = os.path.join(FOLDS_DIR, f'results_fold_{fold}.csv')
    df.to_csv(save_file, index=False)

  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))
  model.load_state_dict(torch.load(checkpoint_path))


### Results

In [7]:
# Mean of the 'obo' column for each test fold, in fold order (folds 1-10).
oboas

[0.85,
 0.75,
 0.75,
 0.8,
 0.8,
 0.7,
 0.631578947368421,
 0.5789473684210527,
 0.631578947368421,
 1.0]

In [8]:
# Mean of the 'mae' column for each test fold, in fold order (folds 1-10).
maes

[0.12463980460994502,
 0.2342815370450965,
 0.1776803751497577,
 0.09929924240043422,
 0.17252908335190623,
 0.17286130534842264,
 0.24060262060040702,
 0.2543859648817495,
 0.20681248573836747,
 0.03482112142621823]