In [1]:
# Import requirement libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

import random
import os

from globals import CLASSES_NAME

In [None]:
# Define name of images and their annotation bbox
train_images_path = "dataset\\train-images.csv"
train_bbox_path = "dataset\\train-annotations-bbox.csv"

train_images_df = pd.read_csv(train_images_path)
train_bbox_df = pd.read_csv(train_bbox_path)

# Print first 5 of dataframe
print(train_images_df.head())
print(train_bbox_df.head())


In [None]:
print(train_images_df.columns)
print(np.size(train_images_df))

In [None]:
# Changing dataframes for comfortable usage
train_images_df = train_images_df[["ImageID"]]
train_bbox_df = train_bbox_df[["ImageID", "LabelName"]]
train_bbox_df.rename(columns={"LabelName": "LabelCode"}, inplace=True)

# Print changed dataframes
print(train_images_df.head())
print(train_bbox_df.head())

In [25]:
class_description_path = "dataset\\class-descriptions.csv"
class_description_df = pd.read_csv(class_description_path)
class_description_df.columns = ["LabelCode", "LabelName"]
class_description_df = class_description_df.loc[class_description_df["LabelName"].isin(CLASSES_NAME), "LabelCode"].tolist()
print(class_description_df)

['/m/015p6', '/m/01dws', '/m/01yrx']


In [74]:
# === Downloading dataset ===
# Creating function for writing to file 10% of images from dataset
def write_data_part_to_file(data_split, classes_name_df, percent_of_data=None):
    """
    Function for creating list of images which will be downloaded and write that to txt file;
    :param data_split: is string value (can be set as 'train', 'test' or 'validation';
    :param percent_of_data: is the decimal value of per cent dataset which will be written to txt file;
    :return: written file with a list of images to be downloaded.
    """

    # Initializing file name for this part of split
    images_name_path = "dataset\\"+data_split+"-images.csv"
    images_bbox_path = "dataset\\"+data_split+"-annotations-bbox.csv"

    # Defining DataFrame for this files
    images_name_df = pd.read_csv(images_name_path, usecols=["ImageID"])
    images_bbox_df = pd.read_csv(images_bbox_path, usecols=["ImageID", "LabelName"])

    # Changing dataframes for comfortable usage
    # images_name_df = images_name_df[["ImageID"]]
    # images_bbox_df = images_bbox_df[["ImageID", "LabelName"]]
    images_bbox_df.rename(columns={"LabelName": "LabelCode"}, inplace=True)

    # Determining which image we need depending on whether our class is there
    images_bbox_df = images_bbox_df.loc[images_bbox_df["LabelCode"].isin(classes_name_df), "ImageID"]
    images_name_df = images_name_df.loc[images_name_df["ImageID"].isin(images_bbox_df.tolist())]
    images_name_df = images_name_df.drop_duplicates()

    # Split dataframe to percent_of_data for downloading not all images
    print(f"Before splitting {data_split} size: {np.size(images_name_df)}")
    images_name_list = []
    if percent_of_data is not None:
        images_name_df_index = images_name_df.index.tolist()
        images_name_df_size = np.size(images_name_df_index)-1
        images_name_df_index = images_name_df_index[:int(images_name_df_size * percent_of_data)]
        for index in images_name_df_index:
            if data_split == "test":
                print(index)
                print(images_name_df.loc[index].tolist()[0])
            images_name_list.append(images_name_df.iloc[index].tolist()[0])
        print(f"Total {data_split} size: {np.size(images_name_list)}")

    # Writing list of images names to file
    file_name = "dataset\\images_name_lists\\"+data_split+"_images_list.txt"

    with open(file_name, "w") as file:
        for image_id in images_name_list:
            file.write(f"{data_split}/{image_id}\n")


In [75]:
# Using function for creating images name list file for downloading images
write_data_part_to_file("train", class_description_df, percent_of_data=0.01)
write_data_part_to_file("test", class_description_df, percent_of_data=0.1)
write_data_part_to_file("validation", class_description_df, percent_of_data=1)


Before splitting train size: 31382
Total train size: 313
Before splitting test size: 2942
86
a6929d36cab14179
99
123b018bf069b759
115
a0ec2abf92636bdc
155
7869e1c856faf8c7
172
1e4d6d39aeb79611
184
26e3af433e3b258c
243
55e3fb35856e5b88
264
fb5291907c2ef672
279
cc4945655ce79a89
336
64103c4b84b9a880
368
89f24ade918c841b
484
3075e899a8d2ef86
486
d1d5ff637df50278
513
85ec1299d3ff234a
572
5618f72bdbac4529
581
3d101925007e26eb
588
62c9261e91e53009
610
5e4722478ab9c699
656
a834c103d1b009a2
713
157b05990f244c43
743
b83f0d431e35ab4d
797
7e03c4ec8575210e
812
4a108ea57c9d467b
850
bc1d08232c455d2a
895
65d707a37d182f21
966
ec34461c3b5cd072
1021
506a490a8486cc3f
1095
5ee994ee070b7a72
1174
3dae24fb08cfcd91
1223
c78e57055b0aa808
1249
38375791b9062603
1252
f257d651d2cd0e7d
1296
bb64a7e92f951b56
1314
4484c491cea3c7e9
1319
4ed9088fa5b4dda7
1404
947aa77e6d0fa828
1425
bb32d4035b4776ad
1431
66394e8bfd8a46b3
1455
15f31225779efbe6
1492
1ae6e1e3826f8702
1506
fa6fa452e8be7985
1510
41a68265ea64b5a4
1524
1ac22bddc

IndexError: single positional indexer is out-of-bounds