#### Imports

In [7]:
import pandas as pd
import numpy as np
import os
from google.cloud import storage
from io import BytesIO

### Paths

In [8]:
RAW_DATA_PATH = os.path.join(os.getcwd(), 'raw_data')
DATA_PATH = os.path.join(os.getcwd(), 'data')

In [9]:
storage_client = storage.Client(project = 'ih-final-project-277919')
bucket = storage_client.get_bucket('final_project_ih_data_dl')

In [10]:
#list(bucket.list_blobs())

In [11]:
blob = storage.blob.Blob('data/class-descriptions-boxable.csv',bucket)
cnt = blob.download_as_string() 
colnames=['LabelName','Label']
df_class = pd.read_csv(BytesIO(cnt),names=colnames, header=None)

In [12]:
df_class.head()

Unnamed: 0,LabelName,Label
0,LabelKey,Label
1,/m/011k07,Tortoise
2,/m/011q46kg,Container
3,/m/012074,Magpie
4,/m/0120dh,Sea turtle


In [13]:
df_class['Label'].unique()[:50]

array(['Label', 'Tortoise', 'Container', 'Magpie', 'Sea turtle',
       'Football', 'Ambulance', 'Ladder', 'Toothbrush', 'Syringe', 'Sink',
       'Toy', 'Organ (Musical Instrument)', 'Cassette deck', 'Apple',
       'Human eye', 'Cosmetics', 'Paddle', 'Snowman', 'Beer',
       'Chopsticks', 'Human beard', 'Bird', 'Parking meter',
       'Traffic light', 'Croissant', 'Cucumber', 'Radish', 'Towel',
       'Doll', 'Skull', 'Washing machine', 'Glove', 'Tick', 'Belt',
       'Sunglasses', 'Banjo', 'Cart', 'Ball', 'Backpack', 'Bicycle',
       'Home appliance', 'Centipede', 'Boat', 'Surfboard', 'Boot',
       'Headphones', 'Hot dog', 'Shorts', 'Fast food'], dtype=object)

In [14]:
classes = ['Boat']

In [15]:
classes_keys = df_class['LabelName'][df_class['Label'].isin(classes)].unique()

In [16]:
blob = storage.blob.Blob('data/oidv6-train-annotations-bbox.csv',bucket)
cnt = blob.download_as_string() 
iter_csv = pd.read_csv(BytesIO(cnt), iterator=True, chunksize=10000)
df_boxes = pd.concat([chunk[chunk['LabelName'].isin(classes_keys)] for chunk in iter_csv])

In [17]:
df_boxes.shape

(79113, 21)

In [18]:
df_boxes.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,...,IsDepiction,IsInside,XClick1X,XClick2X,XClick3X,XClick4X,XClick1Y,XClick2Y,XClick3Y,XClick4Y
78,00001bc2c4027449,xclick,/m/019jd,1,0.2675,0.765,0.240833,0.8025,1,0,...,0,0,0.59375,0.2675,0.47125,0.765,0.240833,0.6775,0.8025,0.641667
1208,0000c035a08c3770,xclick,/m/019jd,1,0.0,0.996248,0.0,0.9725,1,1,...,0,0,0.448405,0.0,0.332083,0.996248,0.0,0.4725,0.9725,0.5275
2663,00019f5540822905,xclick,/m/019jd,1,0.526875,0.9275,0.628518,0.757974,1,0,...,0,0,0.893125,0.9275,0.851875,0.526875,0.628518,0.717636,0.757974,0.679174
6431,000477fbd201b1ff,xclick,/m/019jd,1,0.578889,0.7,0.216875,0.453125,1,0,...,0,0,0.654444,0.578889,0.656667,0.7,0.216875,0.424375,0.453125,0.433125
8438,0005d3d4f802db0a,xclick,/m/019jd,1,0.0,0.4225,0.238999,0.831752,0,0,...,0,0,0.274375,0.0,0.33875,0.4225,0.238999,0.764452,0.831752,0.732528


In [50]:
df_boxes.columns

Index(['ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin',
       'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction',
       'IsInside', 'XClick1X', 'XClick2X', 'XClick3X', 'XClick4X', 'XClick1Y',
       'XClick2Y', 'XClick3Y', 'XClick4Y'],
      dtype='object')

In [51]:
ID_list = df_boxes['ImageID'].unique()

In [53]:
blob = storage.blob.Blob('data/train-images-boxable-with-rotation.csv',bucket)
cnt = blob.download_as_string() 
iter_csv = pd.read_csv(BytesIO(cnt), iterator=True, chunksize=10000)
df_image = pd.concat([chunk[chunk['ImageID'].isin(ID_list)] for chunk in iter_csv])

In [54]:
df_image.shape

(25478, 12)

In [55]:
df_image.columns

Index(['ImageID', 'Subset', 'OriginalURL', 'OriginalLandingURL', 'License',
       'AuthorProfileURL', 'Author', 'Title', 'OriginalSize', 'OriginalMD5',
       'Thumbnail300KURL', 'Rotation'],
      dtype='object')

In [56]:
df_image.head()

Unnamed: 0,ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
74,25e3acb1a25707dd,train,https://c6.staticflickr.com/8/7659/17891339551...,https://www.flickr.com/photos/archiefalkmaar/1...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/archiefalkmaar/,Regionaal Archief Alkmaar,Vrijwilligersuitje 2015,2677536,FvIDdDhff45GCENbR2CXuQ==,https://c5.staticflickr.com/8/7659/17891339551...,0.0
83,9aaa4a4bce049a85,train,https://farm6.staticflickr.com/8164/7388947910...,https://www.flickr.com/photos/45131642@N00/738...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/45131642@N00/,Phil Parker,Decks and Boats in,28624,8Jqvyk2WuArDznavCtpBnA==,https://c7.staticflickr.com/9/8164/7388947910_...,0.0
265,6db5e012cf2936fa,train,https://c3.staticflickr.com/3/2874/9062318686_...,https://www.flickr.com/photos/atgeist/9062318686,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/atgeist/,TownePost Network,Cocktail Cove cuties get the party started,5036881,6Rn5G58CFybS6b1wUeTApg==,https://c3.staticflickr.com/3/2874/9062318686_...,0.0
326,ad3d362f5abb39e5,train,https://farm6.staticflickr.com/5720/2116326194...,https://flickr.com/27366393@N00/21163261942,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/alecperkins/,Alec Perkins,Jersey City Triathlon,2128344,WkWUDm1bSDlnv7mWUb7PUA==,https://c7.staticflickr.com/6/5720/21163261942...,0.0
421,9172175c072967ae,train,https://c7.staticflickr.com/1/64/214333364_5dd...,https://www.flickr.com/photos/simiant/214333364,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/simiant/,nathan williams,Mega-Yacht,835503,b2ThuubQXK4EWh9NqaR7Ng==,https://c2.staticflickr.com/1/64/214333364_5dd...,0.0


In [57]:
df_all_data = df_image.merge(df_boxes, left_on = 'ImageID', right_on = 'ImageID',how = 'left')

In [72]:
df_all_data = df_all_data.merge(df_class, left_on = 'LabelName', right_on = 'LabelName', how = 'left')

In [73]:
df_all_data.shape

(79113, 33)

In [74]:
df_all_data.head()

Unnamed: 0,ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,...,IsInside,XClick1X,XClick2X,XClick3X,XClick4X,XClick1Y,XClick2Y,XClick3Y,XClick4Y,Label
0,25e3acb1a25707dd,train,https://c6.staticflickr.com/8/7659/17891339551...,https://www.flickr.com/photos/archiefalkmaar/1...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/archiefalkmaar/,Regionaal Archief Alkmaar,Vrijwilligersuitje 2015,2677536,FvIDdDhff45GCENbR2CXuQ==,...,0,0.715,0.766875,0.999375,0.390625,0.438149,0.999056,0.885741,0.915958,Boat
1,9aaa4a4bce049a85,train,https://farm6.staticflickr.com/8164/7388947910...,https://www.flickr.com/photos/45131642@N00/738...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/45131642@N00/,Phil Parker,Decks and Boats in,28624,8Jqvyk2WuArDznavCtpBnA==,...,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,Boat
2,6db5e012cf2936fa,train,https://c3.staticflickr.com/3/2874/9062318686_...,https://www.flickr.com/photos/atgeist/9062318686,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/atgeist/,TownePost Network,Cocktail Cove cuties get the party started,5036881,6Rn5G58CFybS6b1wUeTApg==,...,0,0.975,0.999375,0.780625,0.0375,0.381341,0.456522,0.999094,0.999094,Boat
3,6db5e012cf2936fa,train,https://c3.staticflickr.com/3/2874/9062318686_...,https://www.flickr.com/photos/atgeist/9062318686,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/atgeist/,TownePost Network,Cocktail Cove cuties get the party started,5036881,6Rn5G58CFybS6b1wUeTApg==,...,0,0.69,0.23125,0.185,0.7,0.241848,0.395833,0.494565,0.259058,Boat
4,6db5e012cf2936fa,train,https://c3.staticflickr.com/3/2874/9062318686_...,https://www.flickr.com/photos/atgeist/9062318686,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/atgeist/,TownePost Network,Cocktail Cove cuties get the party started,5036881,6Rn5G58CFybS6b1wUeTApg==,...,0,0.766875,0.265625,0.3375,0.778125,0.210145,0.455616,0.526268,0.368659,Boat


In [75]:
df_all_data.shape

(79113, 33)

In [76]:
df_all_data.columns

Index(['ImageID', 'Subset', 'OriginalURL', 'OriginalLandingURL', 'License',
       'AuthorProfileURL', 'Author', 'Title', 'OriginalSize', 'OriginalMD5',
       'Thumbnail300KURL', 'Rotation', 'Source', 'LabelName', 'Confidence',
       'XMin', 'XMax', 'YMin', 'YMax', 'IsOccluded', 'IsTruncated',
       'IsGroupOf', 'IsDepiction', 'IsInside', 'XClick1X', 'XClick2X',
       'XClick3X', 'XClick4X', 'XClick1Y', 'XClick2Y', 'XClick3Y', 'XClick4Y',
       'Label'],
      dtype='object')

In [77]:
df_all_data = df_all_data[['ImageID','OriginalURL','ImageID','XMin', 'XMax', 'YMin', 'YMax','Label','Confidence']]

In [78]:
df_all_data.to_csv(f'{DATA_PATH}/df_all_data.csv',index = False)

In [63]:
list_urls = df_all_data['OriginalURL'].unique()

In [None]:
#crear el .tsv para poder descargar las imagenes

In [79]:
#BytesIO(storage.blob.Blob(f'data/open-images-dataset-train{i[0]}.tsv',bucket).download_as_string())

df_tsv = pd.concat([pd.read_csv(BytesIO(storage.blob.Blob(f'data/open-images-dataset-train{i}.tsv',bucket).download_as_string()), sep='\t').reset_index() 
                    for i in range(0,10)])

In [80]:
df_tsv.head()

Unnamed: 0,level_0,level_1,TsvHttpData-1.0
0,https://c7.staticflickr.com/6/5499/10245691204...,141489,XKWUMkcNib71Ge6Yq1/0/g==
1,https://farm1.staticflickr.com/5615/1533586145...,7054258,CyvuYIfw90WAr8y0a1PaOw==
2,https://c7.staticflickr.com/8/7590/17048042861...,812483,cFlLPftwE0e7PznqGLwxKA==
3,https://farm5.staticflickr.com/5582/1823300949...,1282717,bIoO16vdUB8MpG+Wbe1bxw==
4,https://farm6.staticflickr.com/4126/5145819744...,3129329,nauUgLObUcXqCUxuVabxbw==


In [69]:
df_class_tsv = df_tsv[df_tsv['level_0'].isin(list_urls)]

In [70]:
df_class_tsv.shape

(25478, 3)

.tsv file to upload to google cloud bucket

In [82]:
df_class_tsv.to_csv(f'{DATA_PATH}/images_{classes}_tsv_file.tsv',sep='\t',index = False)