In [1]:
import argparse
import csv
import os
import random
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import yaml
from PIL import Image
from tabulate import tabulate
from torchvision import transforms

from Libs.Datasets.SUN397Dataset import SUN397Dataset
from Libs.Utils import utils
from Libs.Utils.torchsummary import torchsummary
from RGBBranch import RGBBranch
from SASceneNet import SASceneNet
from SemBranch import SemBranch

In [2]:
# Load the YAML configuration. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
config_path = './Config/config_SUN397_original.yaml'
with open(config_path, 'r') as config_file:
    CONFIG = yaml.safe_load(config_file)
USE_CUDA = torch.cuda.is_available()

# Instantiate network: the MODEL.ONLY_RGB / MODEL.ONLY_SEM flags select a
# single branch; with neither flag set the complete SASceneNet is built.
if CONFIG['MODEL']['ONLY_RGB']:
    print('Evaluating ONLY RGB branch')
    print('Selected RGB backbone architecture: ' + CONFIG['MODEL']['ARCH'])
    model = RGBBranch(arch=CONFIG['MODEL']['ARCH'], scene_classes=CONFIG['DATASET']['N_CLASSES_SCENE'])
elif CONFIG['MODEL']['ONLY_SEM']:
    print('Evaluating ONLY SEM branch')
    model = SemBranch(scene_classes=CONFIG['DATASET']['N_CLASSES_SCENE'], semantic_classes=CONFIG['DATASET']['N_CLASSES_SEM'])
else:
    print('Evaluating complete model')
    print('Selected RG backbone architecture: ' + CONFIG['MODEL']['ARCH'])
    model = SASceneNet(arch=CONFIG['MODEL']['ARCH'], scene_classes=CONFIG['DATASET']['N_CLASSES_SCENE'], semantic_classes=CONFIG['DATASET']['N_CLASSES_SEM'])

# Load the trained model
completePath = CONFIG['MODEL']['PATH'] + CONFIG['MODEL']['NAME'] + '.pth.tar'
if not os.path.isfile(completePath):
    # Raise instead of calling quit(): inside a notebook quit() shuts the
    # kernel down, whereas an exception stops the run and shows the reason.
    raise FileNotFoundError(
        "No checkpoint found at '{}'. Check configuration file MODEL field".format(completePath))

print("Loading model {} from path {}...".format(CONFIG['MODEL']['NAME'], completePath))
# map_location='cpu' lets a checkpoint saved from a GPU be read on a CPU-only
# host; the weights are copied into `model` by load_state_dict below.
# NOTE(security): torch.load unpickles the file - only load trusted checkpoints.
checkpoint = torch.load(completePath, map_location='cpu')
best_prec1 = checkpoint['best_prec1']
model.load_state_dict(checkpoint['state_dict'])

Evaluating complete model
Selected RG backbone architecture: ResNet-50
Loading model SAScene_ResNet50_SUN from path ./Data/Model Zoo/SUN397/SAScene_ResNet50_SUN.pth.tar...


In [3]:
# Move the model to the GPU only when one is available; an unconditional
# model.cuda() raises on CPU-only hosts even though USE_CUDA is already known.
if USE_CUDA:
    model.cuda()
cudnn.benchmark = USE_CUDA
# Switch the model to evaluation mode before running inference.
model.eval()

# Read the scene-class names, one per line; the inference cell indexes this
# list with the predicted class index. The context manager closes the file.
with open("SUN_sceneList.txt", "r") as f:
    classes = f.read().splitlines()
classes

['abbey',
 'airplane_cabin',
 'airport_terminal',
 'alley',
 'amphitheater',
 'amusement_arcade',
 'amusement_park',
 'anechoic_chamber',
 'apartment_building-outdoor',
 'apse-indoor',
 'aquarium',
 'aqueduct',
 'arch',
 'archive',
 'arrival_gate-outdoor',
 'art_gallery',
 'art_school',
 'art_studio',
 'assembly_line',
 'athletic_field-outdoor',
 'atrium-public',
 'attic',
 'auditorium',
 'auto_factory',
 'badlands',
 'badminton_court-indoor',
 'baggage_claim',
 'bakery-shop',
 'balcony-exterior',
 'balcony-interior',
 'ball_pit',
 'ballroom',
 'bamboo_forest',
 'banquet_hall',
 'bar',
 'barn',
 'barndoor',
 'baseball_field',
 'basement',
 'basilica',
 'basketball_court-outdoor',
 'bathroom',
 'batters_box',
 'bayou',
 'bazaar-indoor',
 'bazaar-outdoor',
 'beach',
 'beauty_salon',
 'bedroom',
 'berth',
 'biology_laboratory',
 'bistro-indoor',
 'boardwalk',
 'boat_deck',
 'boathouse',
 'bookstore',
 'booth-indoor',
 'botanical_garden',
 'bow_window-indoor',
 'bow_window-outdoor',
 'bowl

In [6]:
def sceneRecognition(root_path, file_name):
    """Run ten-crop scene recognition on a single frame.

    Loads ``raw/<file_name>.jpg``, ``sem_label/<file_name>.png`` and
    ``sem_score/<file_name>.png`` from ``root_path``, builds ten 224x224 crops
    of each, feeds them through the module-level ``model`` and averages the
    scene scores over the ten crops.

    Args:
        root_path: Directory holding the ``raw``, ``sem_label`` and
            ``sem_score`` sub-folders.
        file_name: Frame name without its extension.

    Returns:
        The value of ``utils.obtainPredictedClasses`` applied to the
        crop-averaged scene output.
        NOTE(review): the inference cell treats element 0 as the top-1 class
        index - confirm against ``Libs.Utils.utils``.

    Raises:
        FileNotFoundError: If one of the three input images is missing
            (raised by ``Image.open``).
        AssertionError: If a transformed input does not consist of ten
            crops of the expected spatial size.
    """
    img_path = os.path.join(root_path, 'raw', '{}.jpg'.format(file_name))
    sem_path = os.path.join(root_path, 'sem_label', '{}.png'.format(file_name))
    semScore_path = os.path.join(root_path, 'sem_score', '{}.png'.format(file_name))

    img = Image.open(img_path)
    # Convert it to RGB if gray-scale. Compare by value: `is not` tests object
    # identity, which is not guaranteed to hold for equal strings.
    if img.mode != "RGB":
        img = img.convert("RGB")
    sem = Image.open(sem_path)
    semScore = Image.open(semScore_path)

    # Per-channel mean / std handed to transforms.Normalize for the RGB crops.
    mean = [0.485, 0.456, 0.406]
    STD = [0.229, 0.224, 0.225]
    resizeSize = 256
    outputSize = 224

    val_transforms_img = transforms.Compose([
        transforms.Resize(resizeSize),
        transforms.TenCrop(outputSize),
        transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])),
        transforms.Lambda(
            lambda crops: torch.stack([transforms.Normalize(mean, STD)(crop) for crop in crops])),
    ])
    # Label maps are resized with nearest-neighbour interpolation so that no
    # new label values are blended in. Every label is shifted by +1 and the
    # last axis is moved first.
    # NOTE(review): permute(2, 0, 1) assumes the label PNG has a channel axis - confirm.
    val_transforms_sem = transforms.Compose([
        transforms.Resize(resizeSize, interpolation=Image.NEAREST),
        transforms.TenCrop(outputSize),
        transforms.Lambda(lambda crops: torch.stack(
            [torch.from_numpy(np.asarray(crop) + 1).long().permute(2, 0, 1) for crop in crops])),
    ])

    val_transforms_scores = transforms.Compose([
        transforms.Resize(resizeSize),
        transforms.TenCrop(outputSize),
        transforms.Lambda(lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])),
    ])

    img = val_transforms_img(img)
    sem = val_transforms_sem(sem)
    semScore = val_transforms_scores(semScore)

    assert img.shape[0] == 10 and img.shape[2] == outputSize and img.shape[3] == outputSize
    assert sem.shape[0] == 10 and sem.shape[2] == outputSize and sem.shape[3] == outputSize
    assert semScore.shape[0] == 10 and semScore.shape[2] == outputSize and semScore.shape[3] == outputSize

    # Follow whatever device the model's parameters live on instead of
    # hard-coding .cuda(), so the function also works for a CPU-resident model.
    device = next(model.parameters()).device

    with torch.no_grad():
        # Add a leading batch axis of size 1.
        RGB_image = torch.unsqueeze(img.to(device), 0)
        semantic_mask = torch.unsqueeze(sem.to(device), 0)
        semantic_scores = torch.unsqueeze(semScore.to(device), 0)

        # Fuse batch size and ncrops to set the input for the network
        bs, ncrops, c_img, h, w = RGB_image.size()
        RGB_image = RGB_image.view(-1, c_img, h, w)

        _, _, c_sem, h, w = semantic_mask.size()
        semantic_mask = semantic_mask.view(-1, c_sem, h, w)

        _, _, c_score, h, w = semantic_scores.size()
        semantic_scores = semantic_scores.view(-1, c_score, h, w)

        # Create tensor of probabilities from semantic_mask
        semanticTensor = utils.make_one_hot(semantic_mask, semantic_scores, C=CONFIG['DATASET']['N_CLASSES_SEM'])

        # Compute the final output. Only the fused scene output is used here.
        # NOTE(review): unpacking four outputs from a two-input call presumably
        # only fits the complete SASceneNet - confirm for ONLY_RGB / ONLY_SEM.
        outputSceneLabel, _feature_conv, _outputRGB, _outputSEM = model(RGB_image, semanticTensor)

        # Average results over the 10 crops
        outputSceneLabel = outputSceneLabel.view(bs, ncrops, -1).mean(1)

        return utils.obtainPredictedClasses(outputSceneLabel)

def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    Counting is done in a single pass. When several elements share the highest
    count, the one that appears first in ``List`` is returned, so the result
    does not depend on set iteration order.

    Args:
        List: Non-empty sequence of hashable items.

    Returns:
        The most common item.

    Raises:
        ValueError: If ``List`` is empty.
    """
    if not List:
        raise ValueError("most_frequent() requires a non-empty sequence")
    return Counter(List).most_common(1)[0][0]

In [8]:
#inference
# Classify a random subset of the video's frames and report the scene class
# predicted most often across that subset.
SEED = 0  # fixed seed so the sampled frames (and the reported scene) are reproducible
time_start = time.perf_counter()  # monotonic clock, intended for measuring durations
root_path = './demo/basketball_01'
raw_dir = os.path.join(root_path,'raw')
files = os.listdir(raw_dir)

# Keep only .jpg frames BEFORE sampling so that other files cannot dilute the
# sample; sort because os.listdir order is arbitrary and would defeat the seed.
frame_names = sorted(os.path.splitext(entry)[0] for entry in files if entry.endswith('.jpg'))
if not frame_names:
    raise FileNotFoundError("No .jpg frames found in '{}'".format(raw_dir))

sampling_ratio = 0.3
# Always evaluate at least one frame, otherwise small folders produce an empty
# result list and most_frequent() has nothing to vote on.
sampling = max(1, int(sampling_ratio * len(frame_names)))
results = []
for file_name in random.Random(SEED).sample(frame_names, sampling):
    predictions_index = sceneRecognition(root_path,file_name)
    top1 = predictions_index[0]
    results.append(top1)
video_results = most_frequent(results)

time_end = time.perf_counter()
print('Detected {} frames. The most possible scene category is {}'.format(len(results),classes[video_results]))
print('Runing time: {}'.format(time_end - time_start))
        
        

Detected 27 frames. The most possible scene category is volleyball_court-indoor
Runing time: 4.169652462005615
