### Installation des dépendances

In [249]:
%pip install numpy
%pip install opencv-python
%pip install -U matplotlib
%pip install einops
import numpy as np
import cv2
from matplotlib import pyplot as plt
import math
import os
import time
import torch
import torchvision.transforms as transforms
import torchvision.models as models
from einops import rearrange
from PIL import Image # Using here to convert openCV image to PIL image



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [250]:
# Notebook-wide configuration constants
STEP_TO_NEXT_FRAME = 16  # sampling stride: one frame is indexed every STEP_TO_NEXT_FRAME frames of a video
DEBUG = False  # when True, print debug information
NUMBER_OF_VIDEO = 100  # number of videos to index
VIDEOS_SIZE_ON_DISK = 444039168  # size of the videos on disk, in bytes
MAX_FRAME_NUMBER = 37895  # frame count used to size the index (original note: "maximum number of frames in all video")
N = MAX_FRAME_NUMBER // STEP_TO_NEXT_FRAME  # index length (number of descriptor rows allocated)
TENSOR_SIZE = 2048  # number of values stored per descriptor row

In [251]:
# Global state shared by the cells below
imageList = []  # file names of the query images
videoList = []  # file names of the videos to index
indexationTable = []  # one (frame number, video file name, timestamp in seconds) tuple per indexed frame
# One descriptor row per indexed frame. Zero-initialised on purpose: the index
# may fill fewer than N rows, and np.empty would leave arbitrary memory in the
# unused ones, which the search would then compare against.
resnet_descriptor = np.zeros((N, TENSOR_SIZE), dtype=np.float16)

### Lecture des fichiers

In [252]:
# Locations of the query images and of the videos, relative to the working directory
imagePath = "/data/jpeg/"
videoPath = "/data/mp4/"
currDirectory = os.getcwd()
# os.listdir returns entries in arbitrary order; sorting keeps the index layout
# and the order of the result rows identical from one run to the next.
imageList = sorted(os.listdir(currDirectory + imagePath))
videoList = sorted(os.listdir(currDirectory + videoPath))

In [253]:
def getVideoParameter(videoPath:str):
    """Return the frame rate and the number of frames of a video.

    Args:
        videoPath: path of the video file to inspect.

    Returns:
        A ``(fps, frame_count)`` tuple, exactly as reported by OpenCV for the
        ``CAP_PROP_FPS`` and ``CAP_PROP_FRAME_COUNT`` properties.
    """
    cap = cv2.VideoCapture(videoPath)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Always free the underlying file/decoder handle, even if a query fails
        cap.release()
    return fps, frame_count
    

In [254]:
def cv2_to_pil(cv2_img):
    """Turn an OpenCV image into a PIL image so it can go through the ResNet preprocessing.

    The colour channels are converted from BGR to RGB before the PIL image is built.
    """
    return Image.fromarray(cv2.cvtColor(cv2_img, cv2.COLOR_BGR2RGB))

In [255]:
# ResNet-50 feature extractor: the network is loaded with its default
# pre-trained weights and its last layer is removed, so the model outputs a
# descriptor instead of class scores.
model = torch.nn.Sequential(
    *list(models.resnet50(weights=models.ResNet50_Weights.DEFAULT).children())[:-1]
)
model = model.eval()  # inference mode
if torch.cuda.is_available():
    model = model.cuda()  # move the model to the GPU

In [256]:
def ImageVectorResnet_CPU(imagePath, needToRead:bool = True):
    """Compute the sum-normalised ResNet descriptor of an image.

    Despite its name, the function runs on the GPU whenever CUDA is available.

    Args:
        imagePath: path of an image file when ``needToRead`` is True; otherwise
            an already decoded OpenCV (BGR) image, e.g. a video frame.
        needToRead: whether ``imagePath`` must first be loaded with ``cv2.imread``.

    Returns:
        The flattened model output divided by the sum of its components
        (2048 values for the ResNet-50 backbone loaded in this notebook).
        It is a ``torch.Tensor`` when ``needToRead`` is True (used for the image
        search) and a plain Python list when it is False (used to fill the index).

    Raises:
        OSError: if ``needToRead`` is True and the file cannot be decoded.
        ValueError: if ``needToRead`` is False and no frame was supplied.
    """
    if needToRead:
        image = cv2.imread(imagePath)
        # cv2.imread signals failure by returning None instead of raising
        if image is None:
            raise OSError(f"Could not read image: {imagePath}")
    else:
        image = imagePath
        if image is None:
            raise ValueError("No image data supplied (got None instead of a decoded frame)")

    new_image = cv2_to_pil(image)

    # Pre-processing: fixed input size, tensor conversion, per-channel normalisation
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    input_tensor = preprocess(new_image)     # 3 x 224 x 224
    input_batch = input_tensor.unsqueeze(0)  # add the batch dimension: 1 x 3 x 224 x 224

    if torch.cuda.is_available():
        input_batch = input_batch.cuda()  # send the batch to the GPU, where the model lives

    # Computing descriptor
    with torch.no_grad():
        output = model(input_batch)  # 1 x 2048 x 1 x 1 with the ResNet-50 backbone

    output = rearrange(output, 'b d h w -> (b d h w)')  # flatten to a 1-D vector
    output_normalized = output / torch.sum(output)  # components now sum to 1

    if needToRead:
        return output_normalized  # tensor, consumed by the image search
    return output_normalized.tolist()  # list, stored in the descriptor index

In [257]:
def plotingResnetDescriptor(descriptor:list, index_desc:int = 0):
    """Plot one descriptor taken from a collection of descriptors.

    Args:
        descriptor: indexable collection of descriptors.
        index_desc: position of the descriptor to draw (first one by default).
    """
    selected = descriptor[index_desc]
    plt.plot(selected)
    plt.title("Resnet descriptor")
    plt.show()

In [258]:
def createIndexTableResnet():
    """Build the frame index of every video in ``videoList``.

    For each video, one frame every ``STEP_TO_NEXT_FRAME`` frames is decoded.
    Its descriptor is written to the next free row of ``resnet_descriptor`` and
    a ``(frame number, video file name, timestamp in seconds)`` tuple is appended
    to ``indexationTable`` at the same position, so both stay aligned row by row.

    The table is rebuilt from scratch on every call, which makes re-running the
    cell safe (no duplicated entries, no overflow of the descriptor array).

    Raises:
        IndexError: if more frames are sampled than ``resnet_descriptor`` has rows.
    """
    indexationTable.clear()
    capacity = len(resnet_descriptor)

    for video in videoList:
        path = currDirectory + videoPath + video
        cap = cv2.VideoCapture(path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
            # Same sampling as before: frames 0, STEP, 2*STEP, ... stopping one
            # step short of floor(frame_count / STEP) samples.
            sampleCount = math.floor(frame_count / STEP_TO_NEXT_FRAME) - 1

            for sampleIndex in range(sampleCount):
                row = len(indexationTable)
                if row >= capacity:
                    raise IndexError(
                        f"resnet_descriptor only has {capacity} rows; "
                        f"cannot index more frames (video: {video})"
                    )

                frameNumber = STEP_TO_NEXT_FRAME * sampleIndex
                cap.set(cv2.CAP_PROP_POS_FRAMES, frameNumber)
                ret, currImage = cap.read()
                if not ret:
                    # The frame could not be decoded: stop sampling this video
                    # rather than passing None to the descriptor computation.
                    break

                resnet_descriptor[row] = ImageVectorResnet_CPU(currImage, False)
                # Guard against containers that report a frame rate of 0
                timestamp = frameNumber / fps if fps > 0 else 0.0
                indexationTable.append((frameNumber, video, timestamp))
        finally:
            cap.release()


createIndexTableResnet()
# plotingResnetDescriptor(resnet_descriptor, 1)

In [259]:
def CalculateCompressionRate():
    """Print how much smaller the descriptor index is than the videos on disk.

    The index size is the number of indexed frames times ``TENSOR_SIZE`` values
    of 2 bytes each (descriptors are stored as float16).
    """
    bytesPerValue = 2  # float16
    indexSizeInBytes = TENSOR_SIZE * len(indexationTable) * bytesPerValue
    rate = 1 - indexSizeInBytes / VIDEOS_SIZE_ON_DISK
    print("Le taux de compression est: ", rate)


CalculateCompressionRate()

Le taux de compression est:  0.9795033576857797


In [269]:
def searchImage(threshold: float = 0.346):
    """Find, for every query image, the closest indexed video frame.

    Each image descriptor is compared with every indexed frame descriptor using
    the distance ``-log(sum(sqrt(p * q)))``; the frame with the smallest
    distance wins.

    Args:
        threshold: largest distance still accepted as a match. Images whose
            best distance is not below it are reported as ``"out"``.

    Returns:
        A list of ``(image name, video name, time)`` tuples, one per entry of
        ``imageList``, with file extensions stripped from the names. ``time``
        is the timestamp stored in ``indexationTable`` for the matched frame,
        or ``""`` when the image is reported as ``"out"``.
    """
    SearchResult = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Only the rows actually filled by the indexing step are searched: the
    # descriptor array is allocated with N rows but the index may use fewer,
    # and a hit on an unused row would point outside indexationTable.
    indexedCount = min(len(indexationTable), len(resnet_descriptor))
    descriptors = torch.tensor(resnet_descriptor[:indexedCount]).to(device)

    for image in imageList:
        imageName = image.split(".")[0]

        if indexedCount == 0:
            # Nothing to compare against: every image is unmatched
            SearchResult.append((imageName, "out", ""))
            continue

        path = currDirectory + imagePath + image
        imageVector = ImageVectorResnet_CPU(path).to(device)

        # Broadcasting compares the image descriptor with every indexed row at once
        distances = -torch.log(torch.sum(torch.sqrt(imageVector * descriptors), dim=1))
        bestDistance, bestIndex = torch.min(distances, dim=0)

        if bestDistance < threshold:
            _, bestVideo, bestTime = indexationTable[int(bestIndex)]
        else:
            bestVideo = "out"
            bestTime = ""

        SearchResult.append((imageName, bestVideo.split(".")[0], bestTime))

    return SearchResult

SearchResult = searchImage()

i000   tensor(0.0752, device='cuda:0')
i001   tensor(0.1305, device='cuda:0')
i003   tensor(0.2262, device='cuda:0')
i004   tensor(0.1132, device='cuda:0')
i005   tensor(0.2794, device='cuda:0')
i006   tensor(0.1311, device='cuda:0')
i009   tensor(0.1501, device='cuda:0')
i010   tensor(0.1517, device='cuda:0')
i011   tensor(0.0823, device='cuda:0')
i012   tensor(0.2603, device='cuda:0')
i013   tensor(0.1405, device='cuda:0')
i016   tensor(0.1917, device='cuda:0')
i017   tensor(0.1654, device='cuda:0')
i018   tensor(0.2895, device='cuda:0')
i020   tensor(0.1843, device='cuda:0')
i023   tensor(0.1184, device='cuda:0')
i024   tensor(0.1297, device='cuda:0')
i025   tensor(0.1154, device='cuda:0')
i026   tensor(0.1455, device='cuda:0')
i027   tensor(0.2099, device='cuda:0')
i028   tensor(0.3395, device='cuda:0')
i029   tensor(0.3190, device='cuda:0')
i030   tensor(0.1930, device='cuda:0')
i031   tensor(0.2153, device='cuda:0')
i032   tensor(0.2926, device='cuda:0')
i033   tensor(0.3386, dev

In [270]:
def writeResultInCSVFile():
    """Dump the search results to ``src/test.csv`` under the working directory.

    The file starts with a header line, followed by one comma-separated line
    per entry of ``SearchResult``.
    """
    with open(currDirectory+'/src/test.csv', 'w') as csvFile:
        csvFile.write("image,video_pred,minutage_pred\n")
        for imageName, video, timestamp in SearchResult:
            csvFile.write(",".join((imageName, video, str(timestamp))) + "\n")
    # the with-statement has already closed the file at this point

writeResultInCSVFile()

In [271]:
# Display the results: run the evaluation script on the generated CSV file
import subprocess
import sys

# Same "src" folder the CSV was written to, resolved from the working directory
# instead of a machine-specific absolute path.
directory = os.path.join(currDirectory, 'src')
output = subprocess.run(
    # sys.executable runs the script with the interpreter (and packages) of this kernel
    [sys.executable, 'evaluate.py', '--file', 'test.csv', '--file_gt', '../data/gt.csv'],
    cwd=directory,
    capture_output=True,
    text=True,
    # Make both ends agree on UTF-8 so accented characters are displayed correctly
    encoding='utf-8',
    errors='replace',
    env={**os.environ, 'PYTHONIOENCODING': 'utf-8'},
)

print(output.stdout)
if output.returncode != 0:
    # Surface the reason of a failed evaluation instead of printing nothing
    print(output.stderr)

Taux de bonnes réponses : 92.5% (925/1000)
Ecart temporel moyen : 1.56 sec

