# 1. Data Download and Preprocessing

This notebook uses only the Cholec80 dataset for the experiments.

In [None]:
# Download the Cholec80 archive into the working directory; `-c` resumes a partial download.
# NOTE(review): the dataset credentials are hard-coded in this command — consider reading them
# from environment variables before sharing or committing the notebook.
!wget -c --http-user=camma_cholec80 --http-password=cholec80_unistra http://camma.u-strasbg.fr/datasets/cholec80/cholec80.zip

In [None]:
#!unzip -l ./cholec80.zip

In [None]:
# Extract the archive into ./cholec80; `-n` never overwrites files that already exist.
# NOTE(review): this reads the zip from ./drive/MyDrive/, whereas the wget cell saves it to
# ./cholec80.zip — presumably the archive was copied to Drive in between; confirm the path.
!unzip -d ./cholec80 -n ./drive/MyDrive/cholec80.zip

After downloading and unzipping the Cholec80 dataset, the video frames are extracted at 1 fps.

After extraction, the `change_size` function removes the black margin of each frame, and the frames are then resized to 250×250.

Finally, the original phase_annotation files are subsampled to the same rate to produce the annotations for the model.

In [None]:
import cv2
import os
import PIL
import pandas as pd
import numpy as np

In [None]:
def change_size(image):
    """Crop away the black border that surrounds the visible content of a frame.

    The frame is converted to grayscale, thresholded at intensity 15 and
    median-filtered (kernel 19) to suppress isolated bright pixels; the crop is
    the bounding box of the remaining non-zero pixels. The 10 left-most and 10
    right-most columns are ignored when searching for content.

    Args:
        image: BGR image as a numpy array of shape (rows, cols, 3).

    Returns:
        The cropped image (a view into ``image``), or ``image`` unchanged when
        no content pixel is found.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 15, 255, cv2.THRESH_BINARY)
    binary = cv2.medianBlur(binary, 19)  # filter the noise, need to adjust the parameter based on the dataset

    # Vectorised search for content pixels (replaces a per-pixel Python loop).
    n_cols = binary.shape[1]
    rows, cols = np.nonzero(binary[:, 10:n_cols - 10])
    if rows.size == 0:
        return image
    cols = cols + 10  # shift back to full-image column coordinates

    first_row, last_row = rows.min(), rows.max()
    first_col, last_col = cols.min(), cols.max()

    # The bounding box is inclusive: the previous slicing dropped the last
    # content row/column and yielded an empty crop when the content occupied a
    # single row or column.
    return image[first_row:last_row + 1, first_col:last_col + 1]

In [None]:
# Keep one frame out of every FRAME_STRIDE; assumes the source videos are 25 fps so this yields 1 fps.
FRAME_STRIDE = 25

for video in range(1,81):
  frame_path = './cholec80/frames/{}'.format(video)
  print(frame_path)
  os.makedirs(frame_path,exist_ok=True)

  cap = cv2.VideoCapture('./cholec80/videos/video{:02d}.mp4'.format(video))
  if not cap.isOpened():
    # Previously an unreadable video silently produced an empty frame folder.
    print('WARNING: could not open video {:02d}, skipping'.format(video))
    cap.release()
    continue

  i = 0
  while True:
    ret, frame = cap.read()
    if not ret:
      break
    if i % FRAME_STRIDE == 0:
      # Resize to a height of 300 px (keeping the aspect ratio) before cropping the border.
      dim = (int(frame.shape[1]/frame.shape[0]*300), 300)
      frame = cv2.resize(frame,dim)
      frame = change_size(frame)
      img_result = cv2.resize(frame,(250,250))

      # cv2.imwrite expects a BGR numpy array. The previous code converted the
      # frame to an RGB PIL image first, which cv2.imwrite cannot write (and
      # which would have swapped the colour channels).
      imsave_path = os.path.join(frame_path,'{}.jpg'.format(i//FRAME_STRIDE))
      cv2.imwrite(imsave_path,img_result)
    i += 1

  cap.release()

In [None]:
# Subsample the per-frame phase annotations to match the extracted frames
# (every 25th row) and renumber them 0..N-1 so row j labels frame j.jpg.
annotation_dir = './cholec80/annotations'
os.makedirs(annotation_dir,exist_ok=True)  # to_csv does not create missing directories

for i in range(1,81):
  phase_path = './cholec80/phase_annotations/video{:02d}-phase.txt'.format(i)
  df = pd.read_csv(phase_path,sep='\t')
  # .copy() so the renumbering below writes to an independent frame, not a slice of df
  result_df = df.iloc[::25].copy()
  result_df.iloc[:,0] = np.arange(0,len(result_df))
  result_df.to_csv(os.path.join(annotation_dir,'{}.txt'.format(i)),sep='\t',index=False,header=False)

# 2. Feature Extractor Network Training (ResNet50)

According to the paper, a ResNet50 model is trained to extract features from frames. The dataset is split into 60 videos for training and validation and 20 videos for testing. In this part, I use 5-fold cross-validation to train the ResNet50 model and extract the frame features using the best model of each fold.

The training and testing videos are selected randomly.

In [None]:
# This zip file is the extracted frames from cholec80 dataset.
#!cp ./drive/MyDrive/cholec_extracted.zip ./
#!unzip cholec_extracted.zip

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.nn.init as init
from torchvision import models,transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets.folder import default_loader

from sklearn.model_selection import KFold

import os
import math
import random
import numpy as np
import pandas as pd
import copy
from tqdm import tqdm

In [2]:
# Show the GPU attached to this runtime and whether PyTorch can see it.
!nvidia-smi
print(torch.cuda.is_available())

Thu Nov  4 02:50:08 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    31W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
class ResNet50(nn.Module):
  """Pretrained torchvision ResNet-50 trunk followed by a new linear classifier.

  The trunk (everything up to and including the average pool) is stored in
  ``self.share``; ``self.fc`` maps the 2048-d pooled feature to ``num_classes``.
  """

  # Sub-modules copied from the torchvision model, in execution order.
  _TRUNK_LAYERS = ('conv1','bn1','relu','maxpool','layer1','layer2','layer3','layer4','avgpool')

  def __init__(self,num_classes=1000):
    super().__init__()
    backbone = models.resnet50(pretrained=True)
    self.share = nn.Sequential()
    for layer_name in self._TRUNK_LAYERS:
      self.share.add_module(layer_name,getattr(backbone,layer_name))
    self.fc = nn.Linear(2048,num_classes)

  def forward(self,x):
    """Return ``(feature, logits)`` for a batch reshaped to (-1, 3, 224, 224)."""
    batch = x.view(-1,3,224,224)
    pooled = self.share.forward(batch)
    feature = pooled.view(pooled.size(0),-1)  # flatten the pooled map to one vector per image
    logits = self.fc(feature)
    return feature,logits

In [14]:
# Per-dataset mapping from phase name (as written in the annotation files)
# to the integer class index used as the training label.
phase2label_dict = {
  'cholec80':{
    'Preparation':0,
    'CalotTriangleDissection':1,
    'ClippingCutting':2,
    'GallbladderDissection':3,
    'GallbladderPackaging':4,
    'CleaningCoagulation':5,
    'GallbladderRetraction':6
  }
}

In [15]:
def phase2label(phases,phase2label_dict):
  """Translate phase names into integer labels.

  Args:
    phases: iterable of phase names.
    phase2label_dict: mapping from phase name to class index.

  Returns:
    numpy array of labels; a name missing from the mapping gets the index
    ``len(phase2label_dict)``.
  """
  unknown_label = len(phase2label_dict)
  return np.array([phase2label_dict.get(name,unknown_label) for name in phases])

In [None]:
# to see if the number of frames is equal to the number of labels
# (the counts were previously computed but never compared or reported)
root_path = './cholec80'
mismatched_videos = []
for i in range(1,81):
  frame_path = os.path.join(root_path,'frames/{}'.format(i))
  len1 = len(os.listdir(frame_path))
  phase_path = os.path.join(root_path,'annotations/{}.txt'.format(i))
  phase = pd.read_csv(phase_path,sep='\t',header=None)
  len2 = len(phase)
  if len1 != len2:
    mismatched_videos.append(i)
    print('video {}: {} frames but {} labels'.format(i,len1,len2))
print('{} video(s) with mismatched frame/label counts'.format(len(mismatched_videos)))

In [35]:
# generate frame dataset
class FrameData(Dataset):
  """Frame-level dataset: one item per extracted frame of the selected videos.

  Args:
    dataset: key into ``phase2label_dict`` (e.g. 'cholec80').
    root: directory containing the frame and label folders.
    label_folder: sub-folder with one ``<video>.txt`` annotation file per video.
    frame_folder: sub-folder with one directory of ``<idx>.jpg`` frames per video.
    load_list: video ids to load.
  """

  def __init__(self,dataset,root,label_folder='annotations',frame_folder='frames',load_list=[]):
    self.dataset = dataset
    self.root = root
    self.imgs = []
    self.labels = []

    frames_root = os.path.join(root,frame_folder)
    labels_root = os.path.join(root,label_folder)

    for video in load_list:
      video_dir = os.path.join(frames_root,str(video))
      n_frames = len(os.listdir(video_dir))

      annotation = pd.read_csv(os.path.join(labels_root,'{}.txt'.format(video)),sep='\t',header=None)
      phase_names = np.array(annotation.loc[:,1])  # second column holds the phase name
      video_labels = phase2label(phase_names,phase2label_dict[self.dataset])
      # every frame on disk must have exactly one label
      assert n_frames == len(video_labels)

      # frames are addressed by index, so imgs[j] and labels[j] always refer to the same frame
      self.imgs.extend(os.path.join(video_dir,'{}.jpg'.format(idx)) for idx in range(n_frames))
      self.labels.extend(video_labels[idx] for idx in range(n_frames))

    self.transform = self.get_transform()
    print('Load Dataset {} with {} frames'.format(self.dataset,len(self.imgs)))

  def __len__(self):
    """Number of frames across all loaded videos."""
    return len(self.imgs)

  def __getitem__(self,item):
    """Return ``(transformed image, label, image path)`` for frame ``item``."""
    path = self.imgs[item]
    img = self.transform(default_loader(path))
    return img,self.labels[item],path

  def get_transform(self):
    """Resize to 224x224 and convert to a tensor."""
    steps = [transforms.Resize((224,224)),transforms.ToTensor()]
    return transforms.Compose(steps)


In [None]:
####unit test###########
#black_list = np.arange(1,80)
#load_list = [1]
#dataset = FrameData('cholec80','./cholec80',load_list=load_list)

In [None]:
# Hyper-parameters and shared objects for the ResNet50 feature-extractor training.
learning_rate = 1e-4  # initial Adam learning rate; cross_valid_train halves it on every even epoch
epochs = 3  # training epochs per cross-validation fold
batch_size= 128  # frames per mini-batch for the training/validation loaders
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use the GPU when one is visible
loss_layer = nn.CrossEntropyLoss()  # unweighted cross-entropy used by cross_valid_train / cross_valid_test

In [None]:
def cross_valid_train(model,save_dir,feature_dir,train_loader,test_loader):
  """Train ``model`` for the global ``epochs`` and validate after every epoch.

  Args:
    model: network returning ``(feature, logits)``.
    save_dir: directory receiving one checkpoint per epoch plus ``best.pth``
      (the checkpoint with the highest validation accuracy).
    feature_dir: forwarded to ``cross_valid_test`` (unused while
      ``save_feature`` is False).
    train_loader: loader yielding ``(imgs, labels, img_path)`` batches.
    test_loader: validation loader with the same batch layout.

  Reads the module-level ``learning_rate``, ``epochs``, ``device`` and
  ``loss_layer``.
  """
  lr = learning_rate
  os.makedirs(save_dir,exist_ok=True)
  model.to(device)

  # One optimizer for the whole run: re-creating Adam every epoch discards its
  # running moment estimates. The learning rate is updated in place instead.
  optimizer = torch.optim.Adam(model.parameters(),lr,weight_decay=1e-5)
  # cross_valid_test requires a CSV path; keep it next to the checkpoints so it
  # does not end up among the per-split error files.
  valid_err_path = os.path.join(save_dir,'valid_err.csv')
  best_acc = -1.0

  for epoch in range(1,epochs+1):
    torch.cuda.empty_cache()

    if epoch%2==0:
      lr = lr*0.5
      for group in optimizer.param_groups:
        group['lr'] = lr
    model.train()

    correct = 0
    total = 0
    loss_sum = 0.0

    for (imgs,labels,img_path) in tqdm(train_loader):
      imgs,labels = imgs.to(device),labels.to(device)
      feature,y = model(imgs) # shape 224*224*3
      loss = loss_layer(y,labels)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # loss is a batch mean: weight it by the batch size so that dividing by
      # the sample count below gives the true per-sample average.
      loss_sum += loss.item()*len(labels)
      prediction = y.detach().argmax(dim=1)
      correct += (prediction==labels).sum().item()
      total += len(prediction)

    train_acc = correct/total
    train_loss = loss_sum/total

    # The call previously omitted the required err_dir argument (TypeError).
    test_acc,test_loss = cross_valid_test(model,feature_dir,test_loader,valid_err_path,save_feature=False)

    print('Training Epoch {}: train_acc:{:.4f},train_loss:{:.4f},test_acc:{:.4f},test_loss:{:.4f}'.format(epoch,train_acc,train_loss,test_acc,test_loss))
    torch.save(model.state_dict(),os.path.join(save_dir,'train_{}_{:.0f}_{:.0f}.pth'.format(epoch,test_acc*10000,test_loss*10000)))
    # The feature-extraction cells load '<save_dir>/best.pth', so write it here.
    if test_acc > best_acc:
      best_acc = test_acc
      torch.save(model.state_dict(),os.path.join(save_dir,'best.pth'))

In [None]:
def cross_valid_test(model,feature_dir,test_loader,err_dir=None,save_feature=True):
  """Evaluate ``model`` and optionally dump per-frame features and losses.

  Args:
    model: network returning ``(feature, logits)``.
    feature_dir: root directory for the per-frame ``.npy`` features
      (``<feature_dir>/<video>/<frame>.npy``); used only when ``save_feature``.
    test_loader: loader yielding ``(imgs, labels, img_path)`` batches.
    err_dir: path of the CSV file (despite the name) receiving one
      ``video_num,frame_num,err`` row per frame; ``None`` skips the file.
    save_feature: when True, save the feature of every frame; requires
      ``batch_size == 1``.

  Returns:
    ``(accuracy, mean per-frame cross-entropy)`` over the whole loader.
  """
  print('Testing, Saving feature:{}'.format(save_feature))
  err_rows = []  # collected in a list: appending to a DataFrame row by row is quadratic
  model.eval()
  model.to(device)
  correct = 0
  total = 0
  loss_sum = 0.0
  with torch.no_grad():
    torch.cuda.empty_cache()
    for (imgs,labels,img_path) in tqdm(test_loader):
      imgs,labels = imgs.to(device),labels.to(device)
      feature,y = model(imgs)
      # Per-sample unweighted cross-entropy (same loss as the module-level
      # loss_layer) so every frame gets its own error, whatever the batch size.
      frame_losses = F.cross_entropy(y,labels,reduction='none')
      loss_sum += frame_losses.sum().item()
      prediction = y.argmax(dim=1)
      correct += (prediction==labels).sum().item()
      total += len(prediction)

      # paths look like <root>/<frames>/<video>/<frame>.jpg
      for path,frame_loss in zip(img_path,frame_losses.tolist()):
        video_num = os.path.basename(os.path.dirname(path))
        frame_num = os.path.splitext(os.path.basename(path))[0]
        err_rows.append((int(video_num),int(frame_num),frame_loss))

      if save_feature == True:
        assert len(imgs)==1 # set batch_size=1 to extract features
        feature = feature.to('cpu').numpy()
        video_num = os.path.basename(os.path.dirname(img_path[0]))
        frame_name = os.path.basename(img_path[0])
        save_path = os.path.join(feature_dir,video_num)
        os.makedirs(save_path,exist_ok=True)
        np.save(os.path.join(save_path,frame_name.split('.')[0]+'.npy'),feature)
    test_acc = correct/total
    test_loss = loss_sum/total

  if err_dir is not None:
    parent = os.path.dirname(err_dir)
    if parent:
      os.makedirs(parent,exist_ok=True)  # to_csv does not create missing directories
    err_df = pd.DataFrame(err_rows,columns=['video_num','frame_num','err'])
    err_df.to_csv(err_dir,sep=',',index=False)
  return test_acc,test_loss


In [None]:
# training progress
dataset = 'cholec80'  # key into phase2label_dict and part of the output paths
k = 5  # number of cross-validation folds
random_seed = 1024
np.random.seed(random_seed) # set numpy random seed
random.seed(random_seed) # set python random seed
# NOTE(review): torch is not seeded in this cell, so weight init / shuffling are presumably not reproducible — confirm.

dataset_list = np.arange(1,81)  # video ids 1..80
train_list = np.array(random.sample(dataset_list.tolist(),60)) # get training list randomly
test_list = np.setdiff1d(dataset_list,train_list) # get remaining test list


In [None]:
##################### unit test ########################
#k = 2
#dataset_list = np.arange(1,4)
#train_list = np.array(random.sample(dataset_list.tolist(),3)) # get training list randomly
#test_list = np.setdiff1d(dataset_list,train_list) # get remaining test list
###########################################################


# Train one ResNet50 per cross-validation fold of the 60 training videos.
kf = KFold(k,shuffle=True,random_state=random_seed)
valid_split = kf.split(train_list)

# `fold` (not `k`) is the loop variable so the fold count in `k` is not overwritten.
for fold, (train_idx,valid_idx) in enumerate(valid_split):
  resnet = ResNet50(num_classes=7)
  train_dataset = FrameData(dataset,'./drive/MyDrive/cholec80',load_list=train_list[train_idx])
  valid_dataset = FrameData(dataset,'./drive/MyDrive/cholec80',load_list=train_list[valid_idx])

  train_dataloader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,drop_last=False)
  # validation metrics do not depend on order, so keep the loader deterministic
  valid_dataloader = DataLoader(valid_dataset,batch_size=batch_size,shuffle=False,drop_last=False)

  model_save_dir = './drive/MyDrive/OperA/{}/cross_valid/{}'.format(dataset,fold)
  feature_save_dir = './drive/MyDrive/OperA/{}/cross_valid_feature/'.format(dataset)
  print('Cross Valid {}: model saved at -> {}'.format(fold,model_save_dir))

  cross_valid_train(resnet,model_save_dir,feature_save_dir,train_dataloader,valid_dataloader)

In [None]:
# Video ids held out from the training list (the test split).
print(test_list)

[ 2  4 10 16 17 21 26 27 32 35 37 43 51 56 59 60 65 69 77 79]


In [None]:
# Extract frame features and per-frame losses for the held-out test videos
# with the fold-4 model.
resnet = ResNet50(num_classes=7)

model_path = './drive/MyDrive/OperA/{}/cross_valid/4/best.pth'.format(dataset)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well
resnet.load_state_dict(torch.load(model_path,map_location=device))

test_dataset = FrameData(dataset,'./cholec80',load_list=test_list)
test_dataloader = DataLoader(test_dataset,batch_size=1,shuffle=False,drop_last=False)

feature_save_dir = './OperA/{}/cross_valid_feature/'.format(dataset)
err_dir = './drive/MyDrive/OperA/{}/err_dir/test_err.csv'.format(dataset)
os.makedirs(os.path.dirname(err_dir),exist_ok=True)  # the CSV writer does not create directories
cross_valid_test(resnet,feature_save_dir,test_dataloader,err_dir)

Load Dataset cholec80 with 44042 frames
Testing, Saving feature:True


100%|██████████| 44042/44042 [11:03<00:00, 66.39it/s]


(0.8095000227055992, 0.6373712011547304)

After training, the frame-wise features are extracted with the ResNet50 model and merged into per-video features. In addition, I save the cross-entropy (CE) loss of each frame, which is used by the next model.

In [None]:
# Re-create the exact train/validation split used for training (same seeds)
# and extract features for each fold's validation videos with that fold's model.
dataset = 'cholec80'
k = 5
random_seed = 1024
np.random.seed(random_seed) # set numpy random seed
random.seed(random_seed) # set python random seed

dataset_list = np.arange(1,81)
train_list = np.array(random.sample(dataset_list.tolist(),60)) # get training list randomly
#test_list = np.setdiff1d(dataset_list,train_list) # get remaining test list

kf = KFold(k,shuffle=True,random_state=random_seed)
valid_split = kf.split(train_list)

# `fold` (not `k`) is the loop variable so the fold count in `k` is not overwritten.
for fold, (train_idx,valid_idx) in enumerate(valid_split):
  print('cross valid fold {} is extracting'.format(fold))
  print(train_list[valid_idx])
  resnet = ResNet50(num_classes=7)
  model_path = './drive/MyDrive/OperA/{}/cross_valid/{}/best.pth'.format(dataset,fold)
  # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well
  resnet.load_state_dict(torch.load(model_path,map_location=device))

  valid_dataset = FrameData(dataset,'./{}'.format(dataset),load_list=train_list[valid_idx])
  valid_dataloader = DataLoader(valid_dataset,batch_size=1,shuffle=False,drop_last=False)

  feature_save_dir = './OperA/{}/cross_valid_feature/'.format(dataset)
  err_dir = './drive/MyDrive/OperA/{}/err_dir/valid_{}_err.csv'.format(dataset,fold)
  os.makedirs(os.path.dirname(err_dir),exist_ok=True)  # the CSV writer does not create directories
  cross_valid_test(resnet,feature_save_dir,valid_dataloader,err_dir)

cross valid fold 0 is extracting
[50 67 78 75 14 34 54  9 11 15  8 74]
Load Dataset cholec80 with 23385 frames
Testing, Saving feature:True


100%|██████████| 23385/23385 [05:49<00:00, 66.95it/s]


cross valid fold 1 is extracting
[ 3 66 47  6 41 52 45 49 30 46 73  5]
Load Dataset cholec80 with 30481 frames
Testing, Saving feature:True


100%|██████████| 30481/30481 [07:33<00:00, 67.19it/s]


cross valid fold 2 is extracting
[57 18 53 29 71 58 76  1 64 25 68 44]
Load Dataset cholec80 with 32735 frames
Testing, Saving feature:True


100%|██████████| 32735/32735 [08:08<00:00, 66.99it/s]


cross valid fold 3 is extracting
[62 13 48 31 23 40 80 22 12 19 63 38]
Load Dataset cholec80 with 25944 frames
Testing, Saving feature:True


100%|██████████| 25944/25944 [06:18<00:00, 68.55it/s]


cross valid fold 4 is extracting
[42 70 20 28 39  7 24 72 55 61 33 36]
Load Dataset cholec80 with 27991 frames
Testing, Saving feature:True


100%|██████████| 27991/27991 [06:48<00:00, 68.44it/s]


In [None]:
def video_feature_gen(feature_dir,target_dir):
  """Merge per-frame feature files into one array per video.

  Args:
    feature_dir: directory with one numerically named sub-directory per video,
      each holding ``0.npy, 1.npy, ...``.
    target_dir: output directory; receives ``<video>.npy``, the frame features
      concatenated along axis 0 in frame order.
  """
  os.makedirs(target_dir,exist_ok=True)
  # Only numerically named directories are videos. Parsing with int() (rather
  # than casting the names to uint8) tolerates stray entries such as notebook
  # checkpoints and ids above 255.
  video_list = sorted(
      int(name) for name in os.listdir(feature_dir)
      if name.isdigit() and os.path.isdir(os.path.join(feature_dir,name)))
  for video_idx in tqdm(video_list):
    frame_path = os.path.join(feature_dir,str(video_idx))
    # count only feature files so an unrelated file cannot shift the frame range
    num_of_frames = sum(1 for name in os.listdir(frame_path) if name.endswith('.npy'))
    if num_of_frames == 0:
      print('video {} has no frame features, skipped'.format(video_idx))
      continue
    video_feature = [np.load(os.path.join(frame_path,'{}.npy'.format(i))) for i in range(num_of_frames)]
    video_feature = np.concatenate(video_feature,axis=0)

    np.save(os.path.join(target_dir,'{}.npy'.format(video_idx)),video_feature)
    print('video {} have been saved'.format(video_idx))

In [None]:
# Merge the per-frame features into per-video arrays on Drive.
# NOTE(review): relies on `dataset` and `feature_save_dir` left over from the extraction cells above.
video_feature_dir = './drive/MyDrive/OperA/{}/video_feature'.format(dataset)
video_feature_gen(feature_save_dir,video_feature_dir)

# 3. OperA model
In this part, I implement the rest of the OperA model with self-attention, following the paper.
The model uses 11 self-attention layers and a custom loss function, both of which are implemented in the code below.

However, I could not obtain good results with this code. Parts of my implementation may be wrong, or some parameters may differ from the authors' code.

In [3]:
class FeadForwardNetwork(nn.Module):
  """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

  Args:
    hidden_size: width of the inner layer.
    filter_size: input and output width.
    dropout: dropout probability applied after the ReLU.
  """

  def __init__(self,hidden_size,filter_size,dropout=0.1):
    super().__init__()

    self.layer1 = nn.Linear(filter_size,hidden_size)
    self.layer2 = nn.Linear(hidden_size,filter_size)
    self.dropout = nn.Dropout(dropout)

    # Xavier-normal weights and zero biases for both linear layers
    for linear in (self.layer1,self.layer2):
      init.xavier_normal_(linear.weight)
      init.constant_(linear.bias,0)

  def forward(self,x):
    """Apply the block to the last dimension of ``x``."""
    hidden = self.dropout(F.relu(self.layer1(x)))
    return self.layer2(hidden)

In [4]:
class Attention(nn.Module):
  """Causal self-attention layer followed by a feed-forward block.

  Queries, keys and values are the input itself (no learned projections).
  When ``is_first`` is True the input is first projected from
  ``feature_size`` to ``d_model`` and the layer also returns the normalized
  frame-wise attention.

  Args:
    feature_size: width of the raw input features (used only when ``is_first``).
    d_model: width of the attention layer.
    dropout: dropout probability applied to the attention weights.
    is_first: whether this is the first layer of the stack.
  """

  def __init__(self,feature_size,d_model,dropout=0.1,is_first=False):
    super(Attention,self).__init__()
    self.is_first = is_first
    self.d_model = d_model
    self.d = 64  # constant used to scale the dot products in calculation()
    self.linear_f = nn.Linear(feature_size,d_model)
    self.dropout = nn.Dropout(dropout)
    self.norm = nn.LayerNorm(d_model)  # shared by both residual connections
    self.fc = FeadForwardNetwork(512,d_model)

    init.xavier_normal_(self.linear_f.weight)

  def forward(self,x):
    """Run the layer on a sequence of frames.

    Args:
      x: tensor whose elements are reshaped to ``(T, d_model)`` (after the
        input projection when ``is_first``).

    Returns:
      ``output`` of shape ``(T, d_model)``; when ``is_first``, the tuple
      ``(output, n)`` where ``n`` has shape ``(T,)``.
    """
    if self.is_first==True:
      x = self.linear_f(x)
    x = x.view(-1,self.d_model)
    q,k,v = (x,x,x)
    seq_len = q.size(0)

    # Causal mask: frame i may attend to frames 0..i, itself included. The
    # mask is built on the input's device so the layer also runs on CPU.
    # The earlier numpy mask excluded the diagonal, which left frame 0 with
    # no allowed position (softmax then spread uniformly over all frames,
    # future ones included) and disagreed with the normaliser below.
    mask = torch.tril(torch.ones(seq_len,seq_len,dtype=torch.bool,device=x.device))

    attention,scores_sm = self.calculation(q,k,v,mask)
    attention = self.norm(x+attention)
    fc_output = self.fc(attention)
    output = self.norm(attention+fc_output)

    if self.is_first==True:
      # Normalized Frame-Wise Attention: total attention received by frame j,
      # divided by the number of frames allowed to attend to it (T - j).
      n_up = torch.sum(scores_sm,dim=0)
      n_down = torch.arange(seq_len,0,-1,device=x.device)
      n = n_up/n_down
      return output,n
    else:
      return output

  def calculation(self,q,k,v,mask=None):
    """Scaled dot-product attention.

    Positions where ``mask`` is zero/False are excluded before the softmax.

    Returns:
      ``(attended values, attention weights after dropout)``.
    """
    scores = torch.matmul(q,k.transpose(-1,-2))/math.sqrt(self.d)

    if mask is not None:
      scores = scores.masked_fill(mask==0,-1e9)
    scores_sm = F.softmax(scores,dim=-1)

    if self.dropout is not None:
      scores_sm = self.dropout(scores_sm)

    output = torch.matmul(scores_sm,v)
    return output,scores_sm

In [5]:
class OperA(nn.Module):
  """Stack of 11 ``Attention`` layers (``layer1`` .. ``layer11``) plus a linear classifier.

  ``layer1`` projects the 2048-d frame features to 192 dimensions and also
  produces the normalized frame-wise attention; the remaining layers keep
  the width at 192.
  """

  def __init__(self,num_classes=7):
    super().__init__()
    self.layer1 = Attention(2048,192,is_first=True)
    # layer2 .. layer11 are identical 192 -> 192 attention layers
    for idx in range(2,12):
      setattr(self,'layer{}'.format(idx),Attention(192,192))
    self.fc = nn.Linear(192,num_classes)
    init.xavier_normal_(self.fc.weight)

  def forward(self,x):
    """Return ``(logits, n)``: per-frame class scores and layer1's frame-wise attention."""
    x,n = self.layer1(x)
    for idx in range(2,12):
      x = getattr(self,'layer{}'.format(idx))(x)
    logits = self.fc(x)
    output = logits.view(-1,logits.size(-1))
    return output,n


In [6]:
class VideoFeature(Dataset):
  """Video-level dataset: one item per video.

  Each item is ``(feature, label, err, freq_list)`` — the stacked frame
  features of the video, its per-frame labels, the per-frame error values
  read from ``err_path``, and the class frequencies over all loaded videos.

  Args:
    dataset: key into ``phase2label_dict``.
    root: directory containing the feature and label folders.
    err_path: CSV file with a ``video_num`` column and the error in its third column.
    feature_folder: sub-folder with one ``<video>.npy`` per video.
    label_folder: sub-folder with one ``<video>.txt`` annotation file per video.
    load_list: video ids to load.
  """

  def __init__(self,dataset,root,err_path,feature_folder='video_feature',label_folder='annotations',load_list=[]):
    self.dataset = dataset
    self.root = root
    self.features = []
    self.labels = []
    self.err = []

    features_root = os.path.join(root,feature_folder)
    labels_root = os.path.join(root,label_folder)

    err_table = pd.read_csv(err_path,sep=',')

    for video in load_list:
      video_feature = np.load(os.path.join(features_root,'{}.npy'.format(video)))

      annotation = pd.read_csv(os.path.join(labels_root,'{}.txt'.format(video)),sep='\t',header=None)
      phase_names = np.array(annotation.loc[:,1])  # second column holds the phase name
      video_labels = phase2label(phase_names,phase2label_dict[self.dataset])

      # rows of this video; the third column holds the error value
      video_rows = err_table[err_table['video_num']==video]
      video_err = np.array(video_rows.iloc[:,2],dtype='float32')

      self.features.append(video_feature)
      self.labels.append(video_labels)
      self.err.append(video_err)

    self.freq_list = self.get_class_freq()
    print('Load Dataset {} with {} video features'.format(self.dataset,len(self.features)))

  def __len__(self):
    """Number of loaded videos."""
    return len(self.features)

  def __getitem__(self,item):
    """Return ``(feature, label, err, freq_list)`` for video ``item``."""
    return self.features[item],self.labels[item],self.err[item],self.freq_list

  def get_class_freq(self):
    """Relative frequency of classes 0..6 over all loaded labels, rounded to 4 decimals."""
    all_labels = np.concatenate(self.labels,axis=0)
    n_labels = len(all_labels)
    return [round((all_labels==cls).sum()/n_labels,4) for cls in range(7)]


In [None]:
######################### unit test ###################
# Smoke test: load videos 1-4 and print their class frequencies.
# NOTE(review): this reads err_dir/dataset_err.csv, which is written by concat_err_file in a
# cell further down — on a fresh run that cell presumably has to be executed first.
root_path = './drive/MyDrive/OperA/cholec80'
err_path = os.path.join(root_path,'err_dir/dataset_err.csv')
load_list = np.arange(1,5)
test_vf = VideoFeature('cholec80',root_path,err_path,load_list=load_list)
freq_list = test_vf.get_class_freq()
print(freq_list)
##########################################################

Load Dataset cholec80 with 4 video features
[0.0695, 0.54, 0.0619, 0.1956, 0.0233, 0.0759, 0.0338]


In [36]:
# this function is used to generate the loss error file of the whole dataset
def concat_err_file(err_dir):
  """Merge every per-split error CSV in ``err_dir`` into ``dataset_err.csv``.

  Each input file must have the columns ``video_num, frame_num, err``. The
  merged rows are sorted by video and frame and written to
  ``<err_dir>/dataset_err.csv``.

  Args:
    err_dir: directory containing the per-split CSV files.
  """
  output_name = 'dataset_err.csv'
  # Skip the merged file itself: otherwise re-running this function would feed
  # the previous output back in and duplicate every row.
  err_file_list = sorted(
      name for name in os.listdir(err_dir)
      if name.endswith('.csv') and name != output_name)

  frames = []
  for file_ in err_file_list:
    err_file = os.path.join(err_dir,file_)
    print(err_file)
    frames.append(pd.read_csv(err_file,sep=',',dtype={'video_num':int,'frame_num':int}))

  if frames:
    # a single concat instead of growing a DataFrame inside the loop
    df = pd.concat(frames,axis=0,ignore_index=True)
  else:
    df = pd.DataFrame(columns=['video_num','frame_num','err'])
  df = df.sort_values(by=['video_num','frame_num'])
  df.to_csv(os.path.join(err_dir,output_name),sep=',',index=False)

In [None]:
# Build dataset_err.csv from the per-split error files written during feature extraction.
err_dir = './drive/MyDrive/OperA/{}/err_dir'.format(dataset)
concat_err_file(err_dir)

./drive/MyDrive/OperA/cholec80/err_dir/test_err.csv
./drive/MyDrive/OperA/cholec80/err_dir/valid_0_err.csv
./drive/MyDrive/OperA/cholec80/err_dir/valid_1_err.csv
./drive/MyDrive/OperA/cholec80/err_dir/valid_2_err.csv
./drive/MyDrive/OperA/cholec80/err_dir/valid_3_err.csv
./drive/MyDrive/OperA/cholec80/err_dir/valid_4_err.csv


In [19]:
def opera_train(model,model_save_dir,result_dir,train_loader,test_loader):
  """Train the OperA model for 30 epochs, validating after every epoch.

  Args:
    model: network returning ``(logits, n)`` for one video, where ``n`` is the
      normalized frame-wise attention.
    model_save_dir: directory receiving one checkpoint per epoch plus
      ``best.pth`` (highest validation accuracy).
    result_dir: forwarded to ``opera_test``.
    train_loader: loader yielding ``(feature, label, err, freq_list)`` with
      ``batch_size=1`` (one video per step).
    test_loader: validation loader with the same layout.

  Uses the module-level ``device``.
  """
  lr = 1e-6
  epochs = 30
  os.makedirs(model_save_dir,exist_ok=True)
  model.to(device)

  # One optimizer for the whole run: re-creating Adam every epoch discards its
  # running moment estimates.
  optimizer = torch.optim.Adam(model.parameters(),lr,weight_decay=1e-5)
  best_acc = -1.0

  for epoch in range(1,epochs+1):
    torch.cuda.empty_cache()
    model.train()

    correct = 0
    total = 0
    loss_sum = 0.0
    num_videos = 0

    for (feature,label,err,freq_list) in tqdm(train_loader):
      feature = feature.to(device)
      label = label.to(device).view(-1)  # (1, T) -> (T,)
      err = err.to(device).view(-1)      # (1, T) -> (T,)
      output,n = model(feature)

      # Class-frequency balanced cross-entropy: weight = 1 / class frequency.
      # The loss is evaluated over all frames at once: with the default mean
      # reduction, a weighted loss of a single frame divides by that frame's
      # own weight, so the earlier frame-by-frame loop cancelled the weights.
      class_freq = torch.tensor([float(freq) for freq in freq_list],dtype=torch.float32,device=device)
      class_weight = 1.0/class_freq.clamp(min=1e-4)  # clamp guards classes whose frequency rounds to 0
      ce_loss = F.cross_entropy(output,label,weight=class_weight)
      # attention regularisation: per-frame attention times the frame's stored error
      attention_loss = (n*err).mean()
      loss = ce_loss+attention_loss

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      loss_sum += loss.item()
      num_videos += 1
      prediction = output.detach().argmax(dim=1)
      correct += (prediction==label).sum().item()
      total += len(prediction)

    train_acc = correct/total
    train_loss = loss_sum/max(num_videos,1)  # mean loss per video (each loss is already a per-frame mean)
    test_acc,test_loss = opera_test(model,result_dir,test_loader)
    print('Training Epoch {}: train_acc:{:.4f},train_loss:{},test_acc:{:.4f},test_loss:{}'.format(
        epoch,train_acc,train_loss,test_acc,test_loss))
    torch.save(model.state_dict(),os.path.join(model_save_dir,'train_{}_{:.0f}_{:.0f}.pth'.format(epoch,test_acc*10000,test_loss*10000)))
    # The final test cell loads '<model_save_dir>/best.pth', so write it here.
    if test_acc > best_acc:
      best_acc = test_acc
      torch.save(model.state_dict(),os.path.join(model_save_dir,'best.pth'))

In [10]:
def opera_test(model,result_dir,test_loader,save_result=False):
  """Evaluate the OperA model.

  Args:
    model: network returning ``(logits, n)`` for one video.
    result_dir: currently unused; kept for interface compatibility.
    test_loader: loader yielding ``(feature, label, err, freq_list)`` with
      ``batch_size=1`` (one video per step).
    save_result: when True, print the predicted label sequence of every video.

  Returns:
    ``(frame accuracy, mean per-video loss)``; ``(0.0, 0.0)`` for an empty loader.

  Uses the module-level ``device``.
  """
  print('OperA model is testing ,Saving result: {}'.format(save_result))
  model.eval()
  model.to(device)
  correct = 0
  total = 0
  loss_sum = 0.0
  num_videos = 0

  with torch.no_grad():
    torch.cuda.empty_cache()
    for (feature,label,err,freq_list) in tqdm(test_loader):
      feature = feature.to(device)
      label = label.to(device).view(-1)  # (1, T) -> (T,)
      err = err.to(device).view(-1)      # (1, T) -> (T,)
      output,n = model(feature)

      # Class-frequency balanced cross-entropy over all frames at once: with
      # mean reduction a weighted loss of a single frame divides by that
      # frame's own weight, so a frame-by-frame loop cancels the weights.
      class_freq = torch.tensor([float(freq) for freq in freq_list],dtype=torch.float32,device=device)
      class_weight = 1.0/class_freq.clamp(min=1e-4)  # clamp guards classes whose frequency rounds to 0
      loss = F.cross_entropy(output,label,weight=class_weight)+(n*err).mean()

      loss_sum += loss.item()
      num_videos += 1
      prediction = output.argmax(dim=1)
      correct += (prediction==label).sum().item()
      total += len(prediction)

      if save_result == True:
        print(prediction)

  # Computed after the loop so an empty loader no longer leaves them undefined.
  test_acc = correct/total if total else 0.0
  test_loss = loss_sum/num_videos if num_videos else 0.0
  return test_acc,test_loss

In [11]:
# Re-create the same 60/20 split and 5-fold partition used for the feature extractor
# (same seeds as the ResNet50 cells).
dataset = 'cholec80'  # key into phase2label_dict and part of the output paths
k = 5  # number of cross-validation folds
random_seed = 1024
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use the GPU when one is visible
np.random.seed(random_seed) # set numpy random seed
random.seed(random_seed) # set python random seed

dataset_list = np.arange(1,81)  # video ids 1..80
train_list = np.array(random.sample(dataset_list.tolist(),60)) # get training list randomly
test_list = np.setdiff1d(dataset_list,train_list) # get remaining test list

kf = KFold(k,shuffle=True,random_state=random_seed)
valid_split = kf.split(train_list)  # generator: it can be iterated only once

In [12]:
# Show the training and test video ids of the split.
print(train_list,test_list)

[ 3 62 50 42 67 13 57 66 47 48 78 75 18 70 14 53 20 28 31 29 71 34 54 23
 40 39  6 80 58 41 76 52  7 24 45 22  1 72 64 12  9 11 25 19 55 49 15  8
 30 46 61 68 44 74 73 63 38 33  5 36] [ 2  4 10 16 17 21 26 27 32 35 37 43 51 56 59 60 65 69 77 79]


In [None]:
##################### unit test ########################
#dataset_list = np.arange(1,7)
#train_list = np.array(random.sample(dataset_list.tolist(),6)) # get training list randomly
#test_list = np.setdiff1d(dataset_list,train_list) # get remaining test list
###########################################################

# Train one OperA model per cross-validation fold.
# The split is re-created here because kf.split() returns a one-shot generator:
# iterating the `valid_split` made in an earlier cell works only the first time
# this cell runs. `fold` (not `k`) is the loop variable so the fold count in
# `k` is not overwritten.
valid_split = kf.split(train_list)

for fold, (train_idx,valid_idx) in enumerate(valid_split):
  print('fold {} is training'.format(fold))
  opera = OperA()
  root_path = './drive/MyDrive/OperA/cholec80'
  err_path = os.path.join(root_path,'err_dir/dataset_err.csv')
  train_dataset = VideoFeature(dataset,root_path,err_path,load_list=train_list[train_idx])
  valid_dataset = VideoFeature(dataset,root_path,err_path,load_list=train_list[valid_idx])

  # batch_size=1: every step processes one whole video
  train_dataloader = DataLoader(train_dataset,batch_size=1,shuffle=False,drop_last=False)
  valid_dataloader = DataLoader(valid_dataset,batch_size=1,shuffle=False,drop_last=False)

  model_save_dir = './drive/MyDrive/OperA/{}/opera_cross_valid/{}'.format(dataset,fold)
  result_dir = './drive/MyDrive/OperA/{}/opera_result/'.format(dataset)
  print('Cross Valid {}: model saved at -> {}'.format(fold,model_save_dir))

  opera_train(opera,model_save_dir,result_dir,train_dataloader,valid_dataloader)

In [None]:
# Evaluate the fold-0 OperA model on the held-out test videos.
# Relies on root_path, err_path and result_dir defined in the training cell above.
opera = OperA()
model_path = './drive/MyDrive/OperA/{}/opera_cross_valid/0/best.pth'.format(dataset)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime as well
opera.load_state_dict(torch.load(model_path,map_location=device))

test_dataset = VideoFeature(dataset,root_path,err_path,load_list=test_list)
test_dataloader = DataLoader(test_dataset,batch_size=1,shuffle=False,drop_last=False)
opera_test(opera,result_dir,test_dataloader,save_result=True)