Notebook to download a sample of the [Hotels-50K dataset](https://github.com/GWUvision/Hotels-50K), based on the [download_train.py](https://github.com/GWUvision/Hotels-50K/blob/master/download_train.py) script.

In [None]:
# Mount Google Drive at /gdrive (only works inside Google Colab).
from google.colab import drive
drive.mount('/gdrive')
# IPython magic: make the mounted drive the current working directory.
%cd /gdrive

In [None]:
import pandas as pd
import PIL as pil_image
import tqdm

In [None]:
# Project folder on the mounted Google Drive.
# Both paths must keep their trailing "/" because the other cells build
# file paths from them by plain string concatenation.
PROJECT_FOLDER = "/gdrive/MyDrive/Projects/Hotel-ID/"
# Local working folder where the dataset is unpacked and images are saved.
DATA_FOLDER = "/home/data/"

In [None]:
!mkdir {DATA_FOLDER}
!mkdir {DATA_FOLDER}images/
!tar -xvzf {PROJECT_FOLDER}data/dataset.tar.gz -C {DATA_FOLDER}

In [None]:
from __future__ import print_function
import csv, multiprocessing, cv2, os
import numpy as np
import urllib
import urllib.request

import ssl

# SSL context used for every image download.
# NOTE(review): certificate and hostname verification are both switched off,
# so the servers' identity is not checked. Acceptable only for fetching
# public images; do not reuse this context for anything sensitive.
ctx = ssl.create_default_context()
# check_hostname has to be disabled before verify_mode is set to CERT_NONE;
# the ssl module rejects CERT_NONE while hostname checking is still enabled.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def url_to_image(url):
    """Download ``url`` and decode the payload into an OpenCV image.

    Args:
        url: Address of the image to fetch. The request uses the
            module-level SSL context ``ctx``.

    Returns:
        The array produced by ``cv2.imdecode`` with
        ``cv2.IMREAD_UNCHANGED``, or ``None`` when OpenCV cannot decode
        the downloaded bytes as an image.

    Raises:
        urllib.error.URLError: If the request itself fails.
    """
    # The with-block closes the HTTP response (and its socket) as soon as
    # the body has been read instead of leaving it to the garbage collector.
    with urllib.request.urlopen(url, context=ctx) as resp:
        payload = resp.read()
    # frombuffer views the downloaded bytes directly, avoiding the extra
    # bytearray copy.
    encoded = np.frombuffer(payload, dtype=np.uint8)
    return cv2.imdecode(encoded, cv2.IMREAD_UNCHANGED)

# chain,hotel,im_source,im_id,im_url
def download_and_resize(imList):
    """Download the images in ``imList`` and save them resized to 512 px.

    Each entry is a ``(chain, hotel, im_source, im_id, im_url)`` tuple.
    The image is written to
    ``DATA_FOLDER/images/train/<chain>/<hotel>/<im_source>/<im_id>.<ext>``,
    where ``<ext>`` is the text after the last ``.`` of the URL. The image
    is scaled (up or down) so that its longer side is 512 pixels while the
    aspect ratio is kept.

    The function is best-effort: files that already exist are skipped, and
    any failure for one image is reported and does not stop the loop.

    Args:
        imList: Iterable of image description tuples as described above.
    """
    for im in imList:
        # Bound before the try block so the error report below never hits
        # an unbound name or the stale path of a previous iteration.
        savePath = None
        try:
            saveDir = os.path.join(DATA_FOLDER, 'images/train/', im[0], im[1], im[2])
            # exist_ok avoids the check-then-create race between worker
            # processes that handle images of the same hotel.
            os.makedirs(saveDir, exist_ok=True)

            savePath = os.path.join(saveDir, str(im[3]) + '.' + im[4].split('.')[-1])

            if os.path.isfile(savePath):
                print('Already saved: ' + savePath)
                continue

            img = url_to_image(im[4])
            if img is None:
                raise ValueError('could not decode image from ' + str(im[4]))

            src_height, src_width = img.shape[:2]
            if src_width > src_height:
                width = 512
                height = round((512 * src_height) / src_width)
            else:
                height = 512
                width = round((512 * src_width) / src_height)
            img = cv2.resize(img, (width, height))

            # imwrite signals some failures only through its return value.
            if not cv2.imwrite(savePath, img):
                raise IOError('cv2.imwrite could not write the file')
            # print('Good: ' + savePath)
        except Exception as e:
            # Deliberately broad: one broken image must not abort the batch.
            failed = savePath if savePath is not None else im
            print('Bad: ' + str(failed) + ' (' + repr(e) + ')')

In [None]:
# Build the hotel -> chain lookup from hotel_info.csv: column 0 is used as
# the key and column 2 as the value (presumably hotel id and chain id).
# The with-block closes the file once all rows have been consumed.
with open(f'{DATA_FOLDER}dataset/hotel_info.csv', 'r', newline='') as hotel_f:
    hotel_reader = csv.reader(hotel_f)
    # First row is the header; keep it for reference and skip it as data.
    hotel_headers = next(hotel_reader, None)
    hotel_to_chain = {row[0]: row[2] for row in hotel_reader}

# train_set.csv is read without a header row, so columns are addressed by
# position. Columns 0 and 1 are forced to str so the hotel values match the
# string keys of hotel_to_chain.
train_df = pd.read_csv(f'{DATA_FOLDER}dataset/train_set.csv', header=None, dtype={0:str, 1:str})

In [None]:
# Keep only the rows whose image source (column 3) is "travel_website".
subsample_df = train_df[train_df[3] == "travel_website"]
# Number of remaining images per hotel (column 1).
hotel_data = subsample_df[1].value_counts()
# Hotels with more than 10 and fewer than 30 images.
selected_hotels = hotel_data[(hotel_data > 10) & (hotel_data < 30)]

print(f"Hotels valid: {len(selected_hotels)}")
# Series.sample raises ValueError when asked for more items than exist, so
# the request is capped at the number of valid hotels.
# NOTE(review): no random_state is set, so every run picks different hotels.
selected_hotels = selected_hotels.sample(min(5000, len(selected_hotels)))
print(f"Hotels selected: {len(selected_hotels)}")
subsample_df = subsample_df[subsample_df[1].isin(selected_hotels.index.values)]
print(f"Total images {len(subsample_df)}")

In [None]:
# One (chain, hotel, im_source, im_id, im_url) tuple per selected image, in
# the order expected by download_and_resize. Columns 0-3 of the frame hold
# im_id, hotel, im_url and im_source respectively.
# itertuples yields plain tuples, avoiding the per-row Series objects that
# iterrows would build.
images = [
    (hotel_to_chain[hotel], hotel, im_source, im_id, im_url)
    for im_id, hotel, im_url, im_source
    in subsample_df[[0, 1, 2, 3]].itertuples(index=False, name=None)
]

In [None]:
%%time
pool = multiprocessing.Pool()
NUM_THREADS = multiprocessing.cpu_count()
for cpu in range(NUM_THREADS):
    pool.apply_async(download_and_resize,[images[cpu::NUM_THREADS]])

pool.close()
pool.join()

In [None]:
# Sanity check after the download: number of files saved ...
!find {DATA_FOLDER}images -type f | wc -l
# ... and the total disk space they use.
!du -sh {DATA_FOLDER}images

In [None]:
# Pack the downloaded images into a single archive (recursive, quiet).
# NOTE(review): the images folder is passed as an absolute path, so the
# archive entries presumably carry the home/data/images/ prefix; confirm
# that this layout is what the consumers of the zip expect.
!zip -r -qq {DATA_FOLDER}hotels-50K-sample.zip {DATA_FOLDER}images
# Copy the archive to the project's data folder on Google Drive.
!cp {DATA_FOLDER}hotels-50K-sample.zip {PROJECT_FOLDER}data/

In [None]:
# im = subsample_df.iloc[500]
# im_id = im[0]
# im_url = im[2]
# im_source = im[3]
# hotel = im[1]
# chain = hotel_to_chain[hotel]

# I = pil_image.Image.open(f"/home/data/images/train/{chain}/{hotel}/{im_source}/{im_id}.jpg")
# print(im)
# print(np.shape(I))
# I