In [7]:
#Get the dataset
!wget https://zenodo.org/record/3587843/files/TACO.zip?download=1

--2021-02-25 02:30:26--  https://zenodo.org/record/3587843/files/TACO.zip?download=1
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2719810615 (2.5G) [application/octet-stream]
Saving to: ‘TACO.zip?download=1’


2021-02-25 02:34:51 (9.88 MB/s) - ‘TACO.zip?download=1’ saved [2719810615/2719810615]



In [150]:
#Import packages
import os
import zipfile
import numpy as np
import json
import pandas as pd
import tensorflow as tf
import pathlib
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [12]:
#Unzip the files
local_zip = 'TACO.zip?download=1'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall()
zip_ref.close()

In [129]:
#Look at the files in the directory 
print('Current directory:', pathlib.Path().absolute())
print('In the directory:', os.listdir())
print('Within TACO:' , os.listdir('TACO'))
print('Within data:', os.listdir('TACO/data'),'\n')
print('File count per batch:')
for i in range(1,15):
    print('batch_'+str(i)+':',len(os.listdir('TACO/data/batch_'+str(i))))

Current directory: /content
In the directory: ['.config', 'TACO.zip?download=1', 'TACO', '__MACOSX', 'sample_data']
Within TACO: ['download.py', 'LICENSE', 'requirements.txt', 'demo.ipynb', '.ipynb_checkpoints', '.git', '.DS_Store', 'data', 'detector', '.gitignore', 'README.md']
Within data: ['batch_1', 'batch_7', 'batch_10', 'batch_6', 'batch_12', 'batch_5', 'annotations.json', 'batch_11', 'batch_13', 'batch_4', 'batch_3', 'batch_15', 'batch_9', 'batch_8', 'batch_2', '.DS_Store', 'batch_14'] 

File count per batch:
batch_1: 102
batch_2: 93
batch_3: 98
batch_4: 90
batch_5: 113
batch_6: 98
batch_7: 128
batch_8: 101
batch_9: 101
batch_10: 101
batch_11: 102
batch_12: 101
batch_13: 101
batch_14: 101


In [130]:
#Read annotations
print('See values in dictionary:')
with open('/content/TACO/data/batch_1/annotations.json') as f:
  data = json.load(f)
for key in data:
  print(key)

See values in dictionary:
info
images
annotations
scene_annotations
licenses
categories
scene_categories


In [131]:
#Image information
df_images = pd.DataFrame(data['images'])
df_images.head(3)

Unnamed: 0,id,width,height,file_name,license,flickr_url,coco_url,date_captured,flickr_640_url
0,0,1537,2049,000006.jpg,,https://farm66.staticflickr.com/65535/33978196...,,,https://farm66.staticflickr.com/65535/33978196...
1,1,1537,2049,000008.jpg,,https://farm66.staticflickr.com/65535/47803331...,,,https://farm66.staticflickr.com/65535/47803331...
2,2,1537,2049,000010.jpg,,https://farm66.staticflickr.com/65535/40888872...,,,https://farm66.staticflickr.com/65535/40888872...


In [132]:
#Category information
df_annotations = pd.DataFrame(data['annotations'])
df_annotations.head(3)

Unnamed: 0,id,image_id,category_id,segmentation,area,bbox,iscrowd
0,1,0,6,"[[561.0, 1238.0, 568.0, 1201.0, 567.0, 1175.0,...",403954.0,"[517.0, 127.0, 447.0, 1322.0]",0
1,2,1,18,"[[928.0, 1876.0, 938.0, 1856.0, 968.0, 1826.0,...",1071259.5,"[1.0, 457.0, 1429.0, 1519.0]",0
2,3,1,14,"[[617.0, 383.0, 703.0, 437.0, 713.0, 456.0, 72...",99583.5,"[531.0, 292.0, 1006.0, 672.0]",0


In [135]:
#Get all images and details in one list
x = range(1,15)
df_all = pd.DataFrame()
for i in x:
    directory = '/content/TACO/data/batch_'+str(i)+'/annotations.json'
    with open(directory) as f:
        data = json.load(f)
    df_images = pd.DataFrame(data['images'])
    df_images['batch'] = 'batch_'+str(i)
    df_images['image_id'] = df_images['id']
    #add a new column for joining with same name
    df_anotations = pd.DataFrame(data['annotations'])
    df_categories = pd.DataFrame(data['categories'])
    df_1 = df_images.join(df_annotations,on='image_id',rsuffix='_anno')
    df_2 = df_1.join(df_categories,on='category_id',rsuffix='_cat')
    df_3 = df_2.append(df_images)
    df_all = df_all.append(df_3)

df_all.head(3)

Unnamed: 0,id,width,height,file_name,license,flickr_url,coco_url,date_captured,flickr_640_url,batch,image_id,id_anno,image_id_anno,category_id,segmentation,area,bbox,iscrowd,supercategory,id_cat,name
0,0,1537,2049,000006.jpg,,https://farm66.staticflickr.com/65535/33978196...,,,https://farm66.staticflickr.com/65535/33978196...,batch_1,0,1.0,0.0,6.0,"[[561.0, 1238.0, 568.0, 1201.0, 567.0, 1175.0,...",403954.0,"[517.0, 127.0, 447.0, 1322.0]",0.0,Bottle,6.0,Glass bottle
1,1,1537,2049,000008.jpg,,https://farm66.staticflickr.com/65535/47803331...,,,https://farm66.staticflickr.com/65535/47803331...,batch_1,1,2.0,1.0,18.0,"[[928.0, 1876.0, 938.0, 1856.0, 968.0, 1826.0,...",1071259.5,"[1.0, 457.0, 1429.0, 1519.0]",0.0,Carton,18.0,Meal carton
2,2,1537,2049,000010.jpg,,https://farm66.staticflickr.com/65535/40888872...,,,https://farm66.staticflickr.com/65535/40888872...,batch_1,2,3.0,1.0,14.0,"[[617.0, 383.0, 703.0, 437.0, 713.0, 456.0, 72...",99583.5,"[531.0, 292.0, 1006.0, 672.0]",0.0,Carton,14.0,Other carton


In [138]:
#Get shape of information
df_all.shape

(2830, 21)

In [156]:
#count ids to split
df_imageurls = df_all['flickr_url'].drop_duplicates()
df_imageurls_test = np.array(df_imageurls[0:550])
df_imageurls_train = np.array(df_imageurls[550:])
print('Testing imnages:',len(df_imageurls_test))
df_all_test = df_all[df_all['flickr_url'].isin(df_imageurls_test)]
print('Rows and columns:',df_all_test.shape)
print('Validate unique images within file:',df_all_test['flickr_url'].nunique())

print('\nTraining imnages:',len(df_imageurls_train))
df_all_train = df_all[df_all['flickr_url'].isin(df_imageurls_train)]
print('Rows and columns:',df_all_train.shape)
print('Validate unique images within file:',df_all_train['flickr_url'].nunique())

Testing imnages: 550
Rows and columns: (1100, 21)
Validate unique images within file: 550

Training imnages: 865
Rows and columns: (1730, 21)
Validate unique images within file: 865


In [None]:
#Next step is to move images into folders and begin the CNN!