In [1]:
import os
import urllib.error
import urllib.request
from time import sleep
from uuid import uuid1

import pandas as pd

In [2]:
def download_beer_image(data, retries=3, output_dir='../data/img/', delay=5):
    '''
    Download a beer image using a URL

    The image is written to ``<output_dir>/<id>.jpg``. A failed download is
    retried up to ``retries`` times, waiting ``delay`` seconds between two
    consecutive attempts.

    Args.:
        data (Pandas Series): one row of the dataset (as passed by
            ``DataFrame.apply(..., axis=1)``) holding the entries
            image_url, id and brand
        retries (int): maximum number of download attempts
        output_dir (str): directory in which the image is stored; it is
            created when it does not exist yet
        delay (float): seconds to wait before trying again

    Returns:
        bool: True when the image was downloaded, False when every attempt
        failed
    '''
    # urlretrieve does not create missing directories, so make sure the
    # destination exists before the first attempt.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, '{}.jpg'.format(data.loc['id']))

    for attempt in range(1, retries + 1):
        try:
            # Downloading image
            urllib.request.urlretrieve(data.loc['image_url'], destination)
            return True
        except urllib.error.URLError:
            # URLError is the base class of HTTPError, so this also covers
            # unreachable hosts and missing local files, not only HTTP
            # status errors.
            if attempt == retries:
                # No attempt left: do not announce a retry or wait for nothing.
                break
            print('error while downloading {}. Trying again ({}/{})'.format(data.loc['brand'], attempt, retries))
            sleep(delay)

    print('error while downloading {}. Giving up after {} attempt(s)'.format(data.loc['brand'], retries))
    return False

# Reading data

In [3]:
# Load the raw beer dataset from disk and preview its first rows
raw_dataset_path = '../data/csv/raw_dataset.csv'
df = pd.read_csv(raw_dataset_path)
df.head()

Unnamed: 0,id,brand,image_url
0,,Chopp Brahma,https://i2marabraz-a.akamaihd.net/1800x1800/59...
1,,Bavária,https://static.carrefour.com.br/medias/sys_mas...
2,,Bohemia,https://static.carrefour.com.br/medias/sys_mas...
3,,Crystal,https://static.carrefour.com.br/medias/sys_mas...
4,,Kaiser,https://static.carrefour.com.br/medias/sys_mas...


## Stripping whitespace from brand names

In [4]:
# Remove leading and trailing whitespace from every brand name
stripped_brand = df['brand'].str.strip()
df['brand'] = stripped_brand

## Creating the id column

In [5]:
# Give every row its own identifier: the hex form of a freshly generated UUID1
df['id'] = [uuid1().hex for _ in range(len(df))]

# Downloading beer images

In [7]:
# Call the downloader once per row, then preview the last rows of the dataset
df.apply(download_beer_image, axis='columns')
df.tail()

Unnamed: 0,id,brand,image_url
34,b95c5423045011eb926cc86000ef9ed1,Wäls,https://cdn.awsli.com.br/600x450/1343/1343273/...
35,b95c7b14045011eba142c86000ef9ed1,Amazon Beer,https://shopfacil.vteximg.com.br/arquivos/ids/...
36,b95c7b15045011ebbd85c86000ef9ed1,Dogma,https://d2o1s8t60x9u9v.cloudfront.net/Custom/C...
37,b95c7b16045011eba615c86000ef9ed1,Heilige,https://statics.angeloni.com.br/super/files/pr...
38,b95c7b17045011ebb41ec86000ef9ed1,Colombina,https://www.mywinery.com.br/image/cache/data/P...
