# Preuzimanje slika

---

## 1. Uvoz knjižnica i postavke

In [1]:
# Knjižnice
import os
import pandas as pd
import requests
from tqdm import tqdm
import time
from PIL import Image

In [2]:
# Notebook settings: remove pandas' display limits so frames are shown untruncated
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

---

## 2. Preuzimanje slika i ažuriranje dataseta

In [3]:
# Funkcija za preuzimanje slika, promjenu veličine slika na 224x224 i spremanje izvornih dimenzija slika (širina x visina)
def download_images(dataset_path, images_dir):
    """Download, resize and store the image of every row in a dataset CSV.

    The CSV at ``dataset_path`` is read, the URL in the ``image_large`` column
    of each row is fetched, a resized RGB copy is saved in ``images_dir`` as
    ``product_<id>.jpg`` and the CSV is written back in place with three
    columns filled in: ``image_path``, ``image_original_width`` and
    ``image_original_height``. Rows without a URL, or whose download or
    decoding fails, keep empty values in those columns.

    Args:
        dataset_path: Path of the CSV file; it is overwritten with the result.
        images_dir: Directory that receives the resized images. It must
            already exist; this function does not create it.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    data = pd.read_csv(dataset_path)
    data["image_path"] = None
    data["image_original_width"] = None
    data["image_original_height"] = None

    # A CSV already processed by this function carries a "product_id" column.
    # Reuse it so file names stay stable and the column is not written twice.
    has_product_id = "product_id" in data.columns

    def download_and_resize_image(url, output_path, target_size=(224, 224)):
        """Fetch ``url`` and save a resized RGB copy at ``output_path``.

        Returns ``(output_path, original_width, original_height)`` on success
        and ``(None, None, None)`` on any failure (non-200 status, network
        error, undecodable image, write error).
        """
        try:
            response = requests.get(url, timeout=10)

            if response.status_code != 200:
                print(f"Preuzimanje slike neuspjelo: {url}")
                return None, None, None

            raw_image_path = output_path + "_raw"

            try:
                with open(raw_image_path, "wb") as f:
                    f.write(response.content)

                with Image.open(raw_image_path) as img:
                    original_width, original_height = img.width, img.height
                    resized = img.convert("RGB").resize(target_size, Image.Resampling.LANCZOS)
                    resized.save(output_path)
            finally:
                # Always drop the temporary file, also when decoding or saving fails
                if os.path.exists(raw_image_path):
                    os.remove(raw_image_path)

            return output_path, original_width, original_height

        except Exception as e:
            print(f"Greška prilikom preuzimanja i obrade: {url} - {e}")
            return None, None, None

    for index, row in tqdm(data.iterrows(), total=len(data), desc=f"Preuzimanje i obrada slika za {os.path.basename(dataset_path)}"):
        image_url = row.get("image_large")

        if pd.notnull(image_url):
            # Read the id from the frame (not from `row`) so its dtype is not upcast
            product_id = data.at[index, "product_id"] if has_product_id else index
            filename = f"product_{product_id}.jpg"
            output_path = os.path.join(images_dir, filename)
            local_path, width, height = download_and_resize_image(image_url, output_path)

            if local_path:
                data.at[index, "image_path"] = local_path
                data.at[index, "image_original_width"] = width
                data.at[index, "image_original_height"] = height

    if has_product_id:
        # The id is already a regular column; writing the index too would duplicate it
        data.to_csv(dataset_path, index=False)
    else:
        data.to_csv(dataset_path, index=True, index_label="product_id")
    print(f"Ažuriran skup podataka je spremljen: {dataset_path}")

In [4]:
# Download the images and update both datasets
clean_dataset_path = "./datasets/dataset_clean/dataset_clean.csv"
clean_images_dir = "./datasets/dataset_clean/images/"

remaining_dataset_path = "./datasets/dataset_remaining/dataset_remaining.csv"
remaining_images_dir = "./datasets/dataset_remaining/images/"

# Create both image folders up front, before any download starts
for target_dir in (clean_images_dir, remaining_images_dir):
    os.makedirs(target_dir, exist_ok=True)

# Process the clean dataset first, then the remaining one
for csv_path, target_dir in (
    (clean_dataset_path, clean_images_dir),
    (remaining_dataset_path, remaining_images_dir),
):
    download_images(csv_path, target_dir)

Preuzimanje i obrada slika za dataset_clean.csv:  52%|█████▏    | 25825/50000 [1:26:43<21:39:55,  3.23s/it]

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/41jhzevB2IL._AC_.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_clean.csv:  54%|█████▍    | 27194/50000 [1:30:37<19:54:44,  3.14s/it]

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/517efqOqHeL.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_clean.csv:  63%|██████▎   | 31626/50000 [1:45:33<15:58:06,  3.13s/it]

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/519K+7nUQHL._AC_.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_clean.csv:  73%|███████▎  | 36256/50000 [2:00:16<8:38:32,  2.26s/it] 

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/31w0Hy0bizL._AC_.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_clean.csv:  99%|█████████▉| 49556/50000 [2:45:47<23:17,  3.15s/it]  

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/512uzwUUCcL._AC_.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_clean.csv: 100%|██████████| 50000/50000 [2:47:12<00:00,  4.98it/s]


Ažuriran skup podataka je spremljen: ./datasets/dataset_clean/dataset_clean.csv


Preuzimanje i obrada slika za dataset_remaining.csv:  15%|█▍        | 5455/37105 [18:06<20:48:08,  2.37s/it]

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/41UyLBj8oJL._AC_.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_remaining.csv:  55%|█████▍    | 20246/37105 [1:09:17<10:33:53,  2.26s/it]

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/41y7MSqK35L._AC_.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_remaining.csv:  58%|█████▊    | 21497/37105 [1:13:18<9:47:35,  2.26s/it] 

Greška prilikom preuzimanja i obrade: https://m.media-amazon.com/images/I/31MUN6FUFkL._AC_.jpg - HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


Preuzimanje i obrada slika za dataset_remaining.csv: 100%|██████████| 37105/37105 [2:04:45<00:00,  4.96it/s]  


Ažuriran skup podataka je spremljen: ./datasets/dataset_remaining/dataset_remaining.csv


In [8]:
def retry_missing_images(dataset_path, images_dir):
    """Retry the image download for every row that still has no ``image_path``.

    The CSV at ``dataset_path`` is read, and for each row whose ``image_path``
    is empty the URL in ``image_large`` is fetched again. A resized RGB copy
    is saved in ``images_dir`` as ``product_<product_id>.jpg`` and the row's
    ``image_path``, ``image_original_width`` and ``image_original_height``
    are filled in. The CSV is then written back in place without the index.

    Args:
        dataset_path: Path of the CSV file; it must contain the columns
            ``product_id``, ``image_large`` and ``image_path``. The file is
            overwritten with the result.
        images_dir: Directory that receives the resized images. It must
            already exist; this function does not create it.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    data = pd.read_csv(dataset_path)

    # A column with no values at all is parsed as float64; cast it to object
    # so that path strings can be stored in it without a dtype conflict.
    data["image_path"] = data["image_path"].astype(object)

    missing_images = data[data["image_path"].isnull()]

    def download_and_resize_image(url, output_path, target_size=(224, 224)):
        """Fetch ``url`` and save a resized RGB copy at ``output_path``.

        Returns ``(output_path, original_width, original_height)`` on success
        and ``(None, None, None)`` on any failure (non-200 status, network
        error, undecodable image, write error).
        """
        try:
            response = requests.get(url, timeout=10)

            if response.status_code != 200:
                print(f"Preuzimanje slike neuspjelo: {url}")
                return None, None, None

            raw_image_path = output_path + "_raw"

            try:
                with open(raw_image_path, "wb") as f:
                    f.write(response.content)

                with Image.open(raw_image_path) as img:
                    original_width, original_height = img.width, img.height
                    resized = img.convert("RGB").resize(target_size, Image.Resampling.LANCZOS)
                    resized.save(output_path)
            finally:
                # Always drop the temporary file, also when decoding or saving fails
                if os.path.exists(raw_image_path):
                    os.remove(raw_image_path)

            return output_path, original_width, original_height

        except Exception as e:
            print(f"Greška prilikom preuzimanja i obrade: {url} - {e}")
            return None, None, None

    for index, row in tqdm(missing_images.iterrows(), total=len(missing_images), desc="Ponovno preuzimanje slika..."):
        image_url = row["image_large"]
        product_id = row["product_id"]

        if pd.notnull(image_url):
            filename = f"product_{product_id}.jpg"
            output_path = os.path.join(images_dir, filename)
            local_path, width, height = download_and_resize_image(image_url, output_path)

            if local_path:
                # `index` is the label in `data`, so the full frame is updated, not the filtered copy
                data.at[index, "image_path"] = local_path
                data.at[index, "image_original_width"] = width
                data.at[index, "image_original_height"] = height

    data.to_csv(dataset_path, index=False)
    print(f"Ažuriran skup podataka je spremljen: {dataset_path}")

In [9]:
# Retry the failed downloads and update both datasets
clean_dataset_path = "./datasets/dataset_clean/dataset_clean.csv"
clean_images_dir = "./datasets/dataset_clean/images/"

remaining_dataset_path = "./datasets/dataset_remaining/dataset_remaining.csv"
remaining_images_dir = "./datasets/dataset_remaining/images/"

# Process the clean dataset first, then the remaining one
for csv_path, target_dir in (
    (clean_dataset_path, clean_images_dir),
    (remaining_dataset_path, remaining_images_dir),
):
    retry_missing_images(csv_path, target_dir)

Ponovno preuzimanje slika...: 100%|██████████| 5/5 [00:01<00:00,  4.54it/s]


Ažuriran skup podataka je spremljen: ./datasets/dataset_clean/dataset_clean.csv


Ponovno preuzimanje slika...: 100%|██████████| 3/3 [00:00<00:00,  4.09it/s]


Ažuriran skup podataka je spremljen: ./datasets/dataset_remaining/dataset_remaining.csv
