#### Imports

In [1]:
import pandas as pd
import numpy as np
import os
from google.cloud import storage
from io import BytesIO

### Paths

In [2]:
RAW_DATA_PATH = os.path.join(os.getcwd(), 'raw_data')
DATA_PATH = os.path.join(os.getcwd(), 'data')

In [3]:
storage_client = storage.Client(project = 'ih-final-project-277919')
bucket = storage_client.get_bucket('final_project_ih_data_dl')

In [4]:
#list(bucket.list_blobs())

In [5]:
blob = storage.blob.Blob('data/class-descriptions-boxable.csv',bucket)
cnt = blob.download_as_string() 
colnames=['LabelName','Label']
df_class = pd.read_csv(BytesIO(cnt),names=colnames, header=None)

In [6]:
df_class.head()

Unnamed: 0,LabelName,Label
0,LabelKey,Label
1,/m/011k07,Tortoise
2,/m/011q46kg,Container
3,/m/012074,Magpie
4,/m/0120dh,Sea turtle


In [7]:
df_class['Label'].unique()[:50]

array(['Label', 'Tortoise', 'Container', 'Magpie', 'Sea turtle',
       'Football', 'Ambulance', 'Ladder', 'Toothbrush', 'Syringe', 'Sink',
       'Toy', 'Organ (Musical Instrument)', 'Cassette deck', 'Apple',
       'Human eye', 'Cosmetics', 'Paddle', 'Snowman', 'Beer',
       'Chopsticks', 'Human beard', 'Bird', 'Parking meter',
       'Traffic light', 'Croissant', 'Cucumber', 'Radish', 'Towel',
       'Doll', 'Skull', 'Washing machine', 'Glove', 'Tick', 'Belt',
       'Sunglasses', 'Banjo', 'Cart', 'Ball', 'Backpack', 'Bicycle',
       'Home appliance', 'Centipede', 'Boat', 'Surfboard', 'Boot',
       'Headphones', 'Hot dog', 'Shorts', 'Fast food'], dtype=object)

In [8]:
classes = ['Car']

In [9]:
classes_keys = df_class['LabelName'][df_class['Label'].isin(classes)].unique()

In [10]:
blob = storage.blob.Blob('data/oidv6-train-annotations-bbox.csv',bucket)
cnt = blob.download_as_string() 
iter_csv = pd.read_csv(BytesIO(cnt), iterator=True, chunksize=10000)
df_boxes = pd.concat([chunk[chunk['LabelName'].isin(classes_keys)] for chunk in iter_csv])

In [11]:
df_boxes.shape

(248075, 21)

In [12]:
df_boxes.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,...,IsDepiction,IsInside,XClick1X,XClick2X,XClick3X,XClick4X,XClick1Y,XClick2Y,XClick3Y,XClick4Y
29,0000048549557964,xclick,/m/0k4j,1,0.194375,0.32625,0.72,0.758333,1,0,...,0,0,0.32625,0.194375,0.28375,0.238125,0.753333,0.744167,0.758333,0.72
30,0000048549557964,xclick,/m/0k4j,1,0.435,0.725625,0.698333,0.944167,0,0,...,0,0,0.435,0.725625,0.609375,0.585625,0.871667,0.815,0.698333,0.944167
31,0000048549557964,xclick,/m/0k4j,1,0.70625,0.936875,0.685833,0.865,1,0,...,0,0,0.70625,0.936875,0.78375,0.78875,0.763333,0.724167,0.865,0.685833
32,0000048549557964,xclick,/m/0k4j,1,0.82875,0.999375,0.6925,0.908333,0,1,...,0,0,0.958125,0.82875,0.958125,0.999375,0.908333,0.830833,0.6925,0.7325
503,0000575f5a03db70,xclick,/m/0k4j,1,0.026563,0.982813,0.164127,0.83939,0,0,...,0,0,0.982813,0.026563,0.273438,0.572656,0.574443,0.627198,0.164127,0.83939


In [13]:
df_boxes.columns

Index(['ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin',
       'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction',
       'IsInside', 'XClick1X', 'XClick2X', 'XClick3X', 'XClick4X', 'XClick1Y',
       'XClick2Y', 'XClick3Y', 'XClick4Y'],
      dtype='object')

In [14]:
ID_list = df_boxes['ImageID'].unique()

In [15]:
blob = storage.blob.Blob('data/train-images-boxable-with-rotation.csv',bucket)
cnt = blob.download_as_string() 
iter_csv = pd.read_csv(BytesIO(cnt), iterator=True, chunksize=10000)
df_image = pd.concat([chunk[chunk['ImageID'].isin(ID_list)] for chunk in iter_csv])

In [16]:
df_image.shape

(89465, 12)

In [17]:
df_image.columns

Index(['ImageID', 'Subset', 'OriginalURL', 'OriginalLandingURL', 'License',
       'AuthorProfileURL', 'Author', 'Title', 'OriginalSize', 'OriginalMD5',
       'Thumbnail300KURL', 'Rotation'],
      dtype='object')

In [18]:
df_image.head()

Unnamed: 0,ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
56,cd329b922daa73bb,train,https://c4.staticflickr.com/6/5443/18231828633...,https://www.flickr.com/photos/nzcarfreak/18231...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/nzcarfreak/,Riley,Contrast...,1578003,SwdtLIiGTA8PPJ6TCSTjhA==,https://c6.staticflickr.com/6/5443/18231828633...,0.0
79,c9c0a72fe1816773,train,https://farm4.staticflickr.com/6067/6120245663...,https://www.flickr.com/photos/drivingthenorthe...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/drivingthenorthe...,DrivingtheNortheast,An old Buick police paddy wagon at the Joint B...,2897866,OdPn1Q7ZQYrn/Q1NrlgOpA==,https://c2.staticflickr.com/7/6067/6120245663_...,0.0
170,d3be59dfcca3d208,train,https://c1.staticflickr.com/4/3951/15729204092...,https://www.flickr.com/photos/bartosz/15729204092,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/bartosz/,Bartosz Brzezinski,DSCF9378,1488662,h//N8QdHIOZ2LriUoEEDuA==,https://c2.staticflickr.com/4/3951/15729204092...,0.0
179,2f79b4556eb584ee,train,https://c7.staticflickr.com/9/8151/7501925902_...,https://www.flickr.com/photos/greggjerdingen/7...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/greggjerdingen/,Greg Gjerdingen,68 Mercury Park Lane Brougham,4325376,hy/BU/D7iD++Is8BpFwj7A==,https://c1.staticflickr.com/9/8151/7501925902_...,0.0
190,4d2886c115c83e1c,train,https://c5.staticflickr.com/4/3399/3517769845_...,https://www.flickr.com/photos/300tdorg/3517769...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/300tdorg/,Jurgen,Buscamping in Bassari Country (Senegal),1757406,3HnsiTVrWjOtCgTWuncyYA==,https://c5.staticflickr.com/4/3399/3517769845_...,0.0


In [19]:
df_all_data = df_image.merge(df_boxes, left_on = 'ImageID', right_on = 'ImageID',how = 'left')

In [20]:
df_all_data = df_all_data.merge(df_class, left_on = 'LabelName', right_on = 'LabelName', how = 'left')

In [21]:
df_all_data.shape

(248075, 33)

In [22]:
df_all_data.head()

Unnamed: 0,ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,...,IsInside,XClick1X,XClick2X,XClick3X,XClick4X,XClick1Y,XClick2Y,XClick3Y,XClick4Y,Label
0,cd329b922daa73bb,train,https://c4.staticflickr.com/6/5443/18231828633...,https://www.flickr.com/photos/nzcarfreak/18231...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/nzcarfreak/,Riley,Contrast...,1578003,SwdtLIiGTA8PPJ6TCSTjhA==,...,0,0.0,0.0,0.1875,0.131875,0.578333,0.578333,0.7925,0.926667,Car
1,cd329b922daa73bb,train,https://c4.staticflickr.com/6/5443/18231828633...,https://www.flickr.com/photos/nzcarfreak/18231...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/nzcarfreak/,Riley,Contrast...,1578003,SwdtLIiGTA8PPJ6TCSTjhA==,...,0,0.07125,0.3125,0.62375,0.395625,0.668333,0.511667,0.725,0.881667,Car
2,cd329b922daa73bb,train,https://c4.staticflickr.com/6/5443/18231828633...,https://www.flickr.com/photos/nzcarfreak/18231...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/nzcarfreak/,Riley,Contrast...,1578003,SwdtLIiGTA8PPJ6TCSTjhA==,...,0,0.471875,0.655,0.979375,0.77375,0.561667,0.358333,0.67,0.775,Car
3,c9c0a72fe1816773,train,https://farm4.staticflickr.com/6067/6120245663...,https://www.flickr.com/photos/drivingthenorthe...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/drivingthenorthe...,DrivingtheNortheast,An old Buick police paddy wagon at the Joint B...,2897866,OdPn1Q7ZQYrn/Q1NrlgOpA==,...,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Car
4,c9c0a72fe1816773,train,https://farm4.staticflickr.com/6067/6120245663...,https://www.flickr.com/photos/drivingthenorthe...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/drivingthenorthe...,DrivingtheNortheast,An old Buick police paddy wagon at the Joint B...,2897866,OdPn1Q7ZQYrn/Q1NrlgOpA==,...,0,0.336875,0.266875,0.335625,0.4025,0.458333,0.516667,0.52,0.4825,Car


In [23]:
df_all_data.shape

(248075, 33)

In [24]:
df_all_data.columns

Index(['ImageID', 'Subset', 'OriginalURL', 'OriginalLandingURL', 'License',
       'AuthorProfileURL', 'Author', 'Title', 'OriginalSize', 'OriginalMD5',
       'Thumbnail300KURL', 'Rotation', 'Source', 'LabelName', 'Confidence',
       'XMin', 'XMax', 'YMin', 'YMax', 'IsOccluded', 'IsTruncated',
       'IsGroupOf', 'IsDepiction', 'IsInside', 'XClick1X', 'XClick2X',
       'XClick3X', 'XClick4X', 'XClick1Y', 'XClick2Y', 'XClick3Y', 'XClick4Y',
       'Label'],
      dtype='object')

In [25]:
df_all_data = df_all_data[['ImageID','OriginalURL','ImageID','XMin', 'XMax', 'YMin', 'YMax','Label','Confidence']]

In [26]:
df_all_data.to_csv(f'{DATA_PATH}/df_all_data.csv',index = False)

In [27]:
list_urls = df_all_data['OriginalURL'].unique()

In [28]:
#crear el .tsv para poder descargar las imagenes

In [29]:
#BytesIO(storage.blob.Blob(f'data/open-images-dataset-train{i[0]}.tsv',bucket).download_as_string())

df_tsv = pd.concat([pd.read_csv(BytesIO(storage.blob.Blob(f'data/open-images-dataset-train{i}.tsv',bucket).download_as_string()), sep='\t').reset_index() 
                    for i in range(0,10)])

In [30]:
df_tsv.head()

Unnamed: 0,level_0,level_1,TsvHttpData-1.0
0,https://c7.staticflickr.com/6/5499/10245691204...,141489,XKWUMkcNib71Ge6Yq1/0/g==
1,https://farm1.staticflickr.com/5615/1533586145...,7054258,CyvuYIfw90WAr8y0a1PaOw==
2,https://c7.staticflickr.com/8/7590/17048042861...,812483,cFlLPftwE0e7PznqGLwxKA==
3,https://farm5.staticflickr.com/5582/1823300949...,1282717,bIoO16vdUB8MpG+Wbe1bxw==
4,https://farm6.staticflickr.com/4126/5145819744...,3129329,nauUgLObUcXqCUxuVabxbw==


In [31]:
df_class_tsv = df_tsv[df_tsv['level_0'].isin(list_urls)]

In [32]:
df_class_tsv.shape

(89465, 3)

.tsv file to upload to google cloud bucket

In [33]:
df_class_tsv.to_csv(f'{DATA_PATH}/images_{classes}_tsv_file.tsv',sep='\t',index = False)