In [1]:
import torch, torchvision
import os, json
from shutil import copy, move, rmtree
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random 

# Seed Python's built-in `random` module so that the random.sample() call
# used further down to pick the test images yields the same draw on every
# run (given the same list of images). Only `random` is seeded here; numpy
# and torch generators are left untouched.
random.seed(0)

# Data Preparation

Le jeu de données contient plus de 12 000 images, issues de différentes situations (groupe, entretien, concert, etc.).

Faute de capacité de calcul (je n'ai pas de GPU), je vais me limiter à 3000 images :

* 2300 images pour l'entrainement.
* 700 images pour le test.

Afin d'obtenir un échantillon bien représentatif du jeu de données complet, je vais choisir ces 3000 images en les répartissant sur toutes les situations possibles.



In [2]:
from config import IMAGE_FOLDER, ANNOTATION_FOLDER, DATAPATH

# Make sure the working dataset tree exists: one sub-folder for the images
# and one for the annotations. `exist_ok=True` makes the cell safe to re-run.
for dataset_subfolder in (IMAGE_FOLDER, ANNOTATION_FOLDER):
    dataset_subfolder_path = os.path.join(DATAPATH, dataset_subfolder)
    os.makedirs(dataset_subfolder_path, exist_ok=True)

In [3]:
from config import RAW_DATAPATH, NB_IMAGES

# Build a reduced dataset of NB_IMAGES images by taking (roughly) the same
# number of images from every "situation" sub-folder of the raw dataset, then
# copy the raw annotation file next to it.
#
# Every directory listing is sorted: os.listdir() returns entries in an
# arbitrary, platform-dependent order, so without sorting the selected subset
# would differ from one machine (or one run) to the next.

# path to raw images
raw_image_folder = os.path.join(RAW_DATAPATH, IMAGE_FOLDER)
# one sub-folder per situation
situations = sorted(os.listdir(raw_image_folder))
nb_situations = len(situations)
if nb_situations == 0:
    raise ValueError(f"No situation folder found in {raw_image_folder!r}")
# base number of images to take from each situation
nb_images_per_situation = NB_IMAGES // nb_situations
# images still missing after the even split (remainder of the division)
rest = NB_IMAGES - nb_images_per_situation * nb_situations

# number of raw images available in each situation
data_distribution = {}
for situation in situations:
    folder_path = os.path.join(raw_image_folder, situation)
    data_distribution[situation] = len(os.listdir(folder_path))

# Preferred source for the remainder: the first situation that can provide
# its base share plus the whole remainder on its own.
rest_origin = None
for situation in situations:
    if data_distribution[situation] >= nb_images_per_situation + rest:
        rest_origin = situation
        break

# Number of images actually taken from each situation. A situation never
# contributes more images than it contains.
quotas = {
    situation: min(data_distribution[situation], nb_images_per_situation)
    for situation in situations
}
if rest_origin is not None:
    quotas[rest_origin] += rest

# If some situations are smaller than the base share, or no single situation
# can absorb the remainder, top up from the situations that still have spare
# images so that exactly NB_IMAGES images are selected.
missing = NB_IMAGES - sum(quotas.values())
for situation in situations:
    if missing <= 0:
        break
    extra = min(data_distribution[situation] - quotas[situation], missing)
    quotas[situation] += extra
    missing -= extra
if missing > 0:
    raise ValueError(
        f"The raw dataset only provides {NB_IMAGES - missing} images, "
        f"but NB_IMAGES={NB_IMAGES} were requested"
    )

# Copy the selected images into the dataset folder.
image_folder = os.path.join(DATAPATH, IMAGE_FOLDER)
if len(os.listdir(image_folder)) < NB_IMAGES:
    # The images have not been (fully) copied yet, so do it here.
    for situation in situations:
        folder_path = os.path.join(raw_image_folder, situation)
        images = sorted(os.listdir(folder_path))[:quotas[situation]]
        for image_name in images:
            image_path = os.path.join(folder_path, image_name)
            new_image_path = os.path.join(image_folder, image_name)
            copy(image_path, new_image_path)

# copy the annotations file
annot_filename = "annotations.txt"
raw_annot_filepath = os.path.join(RAW_DATAPATH, ANNOTATION_FOLDER, annot_filename)
annot_filepath = os.path.join(DATAPATH, ANNOTATION_FOLDER, annot_filename)
# Last expression of the cell: shutil.copy returns the destination path,
# which the notebook displays as the cell output.
copy(raw_annot_filepath, annot_filepath)

'dataset/annotations/annotations.txt'

Maintenant, je vais diviser le jeu de données en deux:
* jeu de données d'entrainement (2300 images).
* jeu de données de test (700 images).

In [4]:
from config import TRAIN_DATAPATH, TEST_DATAPATH

# Create the image and annotation sub-folders of both splits
# (train first, then test). Re-running the cell is harmless.
for split_root in (TRAIN_DATAPATH, TEST_DATAPATH):
    for split_subfolder in (IMAGE_FOLDER, ANNOTATION_FOLDER):
        os.makedirs(os.path.join(split_root, split_subfolder), exist_ok=True)

In [5]:
from config import NB_IMAGES_TRAIN, NB_IMAGES_TEST

# Split the reduced dataset: NB_IMAGES_TEST images drawn at random go to the
# test folder, every remaining image goes to the training folder.

image_folder = os.path.join(DATAPATH, IMAGE_FOLDER)
images = sorted(os.listdir(image_folder)) # sort to have same order in any OS.

if len(images) < NB_IMAGES_TEST:
    raise ValueError(
        f"Cannot draw {NB_IMAGES_TEST} test images out of the "
        f"{len(images)} images found in {image_folder!r}"
    )
if len(images) - NB_IMAGES_TEST != NB_IMAGES_TRAIN:
    # Not fatal: the split still works, but the sizes differ from the config.
    print(
        f"Warning: {len(images) - NB_IMAGES_TEST} training images will be "
        f"produced, NB_IMAGES_TRAIN={NB_IMAGES_TRAIN} were expected"
    )

# Select the indices of test images at random.
# The population is the number of images actually present in the folder, so
# an index can never fall outside `images`.
indices = random.sample(range(len(images)), NB_IMAGES_TEST)
test_image_names = {images[i] for i in indices}

# Move every image to the folder of the split it belongs to.
for image_name in images:
    if image_name in test_image_names:
        split_root = TEST_DATAPATH
    else:
        split_root = TRAIN_DATAPATH
    image_path = os.path.join(image_folder, image_name)
    new_image_path = os.path.join(split_root, IMAGE_FOLDER, image_name)
    move(image_path, new_image_path)

# Delete the (now empty) image directory
os.rmdir(image_folder)

Il ne reste qu'à créer les deux fichiers d'annotations pour les deux jeux de données.

In [7]:
# Build the per-split annotation lists (train_annot / test_annot) from the
# global annotation file.
#
# The parser expects a sequence of records laid out as follows:
#   <path/to/image>          -> only the last path component is kept
#   <number of boxes>
#   <one line per box>       -> the first four integers are kept
# A record announcing 0 boxes is still followed by one line, which is read
# like a regular box line.
# NOTE(review): this looks like the WIDER FACE ground-truth layout
# (x, y, w, h first) - confirm against the dataset documentation.

# TRAIN FOLDERS PATH
TRAIN_IMAGE_FOLDER = os.path.join(TRAIN_DATAPATH, IMAGE_FOLDER)

# TEST FOLDERS PATH
TEST_IMAGE_FOLDER = os.path.join(TEST_DATAPATH, IMAGE_FOLDER)

# LISTS OF IMAGES FOR EACH DATASET
train_images = os.listdir(TRAIN_IMAGE_FOLDER)
test_images = os.listdir(TEST_IMAGE_FOLDER)
# Set copies: membership is tested once per record of the annotation file,
# which is far larger than the two image lists.
train_image_names = set(train_images)
test_image_names = set(test_images)

# INITIALIZE EACH DATASET ANNOTATIONS
train_annot = []
test_annot = []

# READ THE FILE ONCE; THE CONTEXT MANAGER CLOSES IT EVEN IF PARSING FAILS
with open(annot_filepath, "r") as fp:
    annot_lines = fp.readlines()

count = len(annot_lines)
print('Total Lines', count)

# ITERATE OVER THE LINES AND GET THE ANNOTATIONS
c = 0  # index of the next line to consume
while c < count:
    # get filename
    filename = annot_lines[c].split("/")[-1].strip()
    c += 1
    if not filename:
        # blank line (e.g. at the end of the file): nothing to parse
        continue
    # get number of faces/boxes
    nb_boxes = int(annot_lines[c])
    c += 1
    # A record with 0 boxes is still followed by one line that must be
    # consumed to stay aligned with the next record.
    # NOTE(review): that line is stored as a box, like for any other record,
    # so such images end up with a single box (presumably all zeros) -
    # confirm this is what the training code expects.
    if nb_boxes == 0:
        nb_boxes = 1
    # collect the boxes
    boxes = []
    for _ in range(nb_boxes):
        info = annot_lines[c].split(" ")
        c += 1
        boxes.append([int(info[k]) for k in range(4)])
    # check whether it is a training image,
    # a testing image, or neither one of the two.
    if filename in train_image_names:
        train_annot.append({
            "filename": filename,
            "boxes": boxes
        })
    elif filename in test_image_names:
        # The record goes to the annotation list, not to the list of
        # file names.
        test_annot.append({
            "filename": filename,
            "boxes": boxes
        })

Total Lines 185184


In [11]:
# Serialise the two annotation lists as JSON inside their split folder, then
# drop the temporary annotation folder of the intermediate dataset.
import shutil

# Annotation folder of each split
TRAIN_ANNOT_FOLDER = os.path.join(TRAIN_DATAPATH, ANNOTATION_FOLDER)
TEST_ANNOT_FOLDER = os.path.join(TEST_DATAPATH, ANNOTATION_FOLDER)

# Output file of each split
train_annot_filename = "train.json"
test_annot_filename = "test.json"
train_annot_filepath = os.path.join(TRAIN_ANNOT_FOLDER, train_annot_filename)
test_annot_filepath = os.path.join(TEST_ANNOT_FOLDER, test_annot_filename)

# Write the training annotations first, then the test ones
annotation_outputs = (
    (train_annot_filepath, train_annot),
    (test_annot_filepath, test_annot),
)
for output_path, records in annotation_outputs:
    with open(output_path, "w") as f:
        f.write(json.dumps(records))

# The intermediate annotation folder is not needed any more
rmtree(os.path.join(DATAPATH, ANNOTATION_FOLDER))