## Image Extraction: Amazon Link Page 2

In [1]:
# import libraries for system configuration
import os
import sys
import warnings

# import dependencies for data processing and visualisation
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

### Approach: Image Extraction

+ Data Loading
+ System Configuration: WebDriver for Edge Browser
+ Collecting Images from Amazon Web page: Downloading Process
+ Renaming image items with new filenames
+ Store image path into DataFrame

## Data Loading

In [2]:
# Load amazon link dataset
amazon_ds = pd.read_csv("links_page1.csv")
amazon_ds.head()

Unnamed: 0,links
0,https://www.amazon.com.be/-/en/s?i=electronics...
1,https://www.amazon.com.be/-/en/s?i=electronics...
2,https://www.amazon.com.be/-/en/s?i=electronics...
3,https://www.amazon.com.be/-/en/s?i=electronics...
4,https://www.amazon.com.be/-/en/s?i=electronics...


In [3]:
print(f"Number of links: {amazon_ds.shape[0]}")


Number of links: 52


## System Configuration: WebDriver for Edge Browser

In [4]:
# Import dependencies for extracting images
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By 

In [5]:
# Setup environment for WebDriver on Edge Browser
options = Options()
options.add_argument("--headless")

# Setup WebDriver service 
edgedriver_path = "D:\\Data_Engineering\\data_extraction\\msedgedriver.exe"
service = Service(executable_path=edgedriver_path)

driver = webdriver.Edge(options=options, service=service)

## Data Collection: Downloading images from Amazon

The Data Collection process involves several steps of gathering image content for training neural network models. To implement this stage, 2 cases need to be considered. Test Case and Full Case. 

The test case assumes 3 URLS extracted from Amazon. This is to get more control on how the images are extracted during the processes. The sequences of images provides more insight about the distribution of the images for every URLs from which the images are downloaded. The full case requires all requested URLs to make a complete image extraction for this stage. This allows more managability of manipulating and storing these images. 

This stage entails 2 processes to complete the collection process of images:
+ Extraction Step
+ Downloading Step

### Test Case: Extractiing images from 10 URLs

#### Step 1: Extraction Step

In [6]:
# Extract images from each amazon links
def extract_images(webdriver, link, class_attr = "s-image") -> list:
    # Retrieve URLS    
    webdriver.get(link)
    img_elements = webdriver.find_elements(By.CLASS_NAME, class_attr)
    image_urls = [web_elem.get_attribute("src") for web_elem in img_elements]
    return image_urls

# Iterate over a list of links --> extract images 
item_img_lst = []
items_tot = 0
for i, link in enumerate(amazon_ds["links"].values[0:3]):
    # Extract images from each amazon weblink + determine the number of images per url
    img_seq = extract_images(webdriver=driver, link=link)
    seq_size = len(img_seq)

    # Store image items into a list + determine the total number of images for the entire list
    item_img_lst.append(img_seq)
    items_tot += seq_size
    print(f"Amazon URL {i + 1}: {seq_size} items (successfully extracted)")
print(f"Image Extraction is completed. Number of Image URLs: {items_tot} ")


Amazon URL 1: 24 items (successfully extracted)
Amazon URL 2: 24 items (successfully extracted)
Amazon URL 3: 24 items (successfully extracted)
Image Extraction is completed. Number of Image URLs: 72 


In [11]:
# Flatten all img sequences into full 1D list of images
img_list = [item for item_seq in item_img_lst for item in item_seq]
img_list[:2]

['https://m.media-amazon.com/images/I/61w6v4ldYXL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/51jL1Ow6tXL._AC_UL320_.jpg']

### Step 2: Downloading Step

In [8]:
# Import dependencies for downloading images
import requests 

In [10]:
# List of image URLs
image_urls = img_list[:2]

# Folder to save images
path = "D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1"
save_folder = "images"
save_path = os.path.join(path, save_folder)

# Create the folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# Download and save each image
for i, url in enumerate(image_urls, start=1):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Check if the request was successful

        # Save the image to the folder
        file_extension = url.split(".")[-1]  # Get the file extension
        file_name = f"image_{i}.{file_extension}"
        file_path = os.path.join(save_path, file_name)

        with open(file_path, "wb") as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        
        print(f"Downloaded: {file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")

  path = "D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1"


Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_1.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_2.jpg


## Full Case: Complete Image Extraction

In [12]:
# Define functions for: iteration process of image extraction && downloading process
# Function 1: Iteration Image Process
def iteration_image_process(dataset):
    # Define constants
    links = dataset["links"].values
    item_img_lst = []
    items_tot = 0

    # Iterate over a list of links --> extract images 
    for i, link in enumerate(links):
        # Extract images from each amazon weblink + determine the number of images per url
        img_seq = extract_images(webdriver=driver, link=link)
        seq_size = len(img_seq)

        # Store image items into a list + determine the total number of images for the entire list
        item_img_lst.append(img_seq)
        items_tot += seq_size
        print(f"Amazon URL {i + 1}: {seq_size} items (successfully extracted)")
    print(f"Image Extraction is completed. Number of Image URLs: {items_tot} ")
    img_list = [item for item_seq in item_img_lst for item in item_seq]
    return img_list

# Function 2: Downloading process
def downloading_process(image_urls, path):
    # Folder to save images
    save_folder = "images"
    save_path = os.path.join(path, save_folder)

    # Create the folder if it doesn't exist
    os.makedirs(save_folder, exist_ok=True)

    # Download and save each image
    for i, url in enumerate(image_urls, start=1):
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()  # Check if the request was successful

            # Save the image to the folder
            file_extension = url.split(".")[-1]  # Get the file extension
            file_name = f"image_{i}.{file_extension}"
            file_path = os.path.join(save_path, file_name)

            with open(file_path, "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            
            print(f"Downloaded image {i}: {file_name}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")

In [13]:
# Extract all the images from URLs
img_urls = iteration_image_process(amazon_ds)
img_urls[:3]

Amazon URL 1: 24 items (successfully extracted)
Amazon URL 2: 24 items (successfully extracted)
Amazon URL 3: 24 items (successfully extracted)
Amazon URL 4: 24 items (successfully extracted)
Amazon URL 5: 24 items (successfully extracted)
Amazon URL 6: 3 items (successfully extracted)
Amazon URL 7: 3 items (successfully extracted)
Amazon URL 8: 2 items (successfully extracted)
Amazon URL 9: 12 items (successfully extracted)
Amazon URL 10: 3 items (successfully extracted)
Amazon URL 11: 1 items (successfully extracted)
Amazon URL 12: 9 items (successfully extracted)
Amazon URL 13: 4 items (successfully extracted)
Amazon URL 14: 3 items (successfully extracted)
Amazon URL 15: 9 items (successfully extracted)
Amazon URL 16: 18 items (successfully extracted)
Amazon URL 17: 8 items (successfully extracted)
Amazon URL 18: 24 items (successfully extracted)
Amazon URL 19: 24 items (successfully extracted)
Amazon URL 20: 1 items (successfully extracted)
Amazon URL 21: 5 items (successfully ext

['https://m.media-amazon.com/images/I/51jL1Ow6tXL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/61w6v4ldYXL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71XWNkGrxSS._AC_UL320_.jpg']

In [14]:
len(img_urls)

824

In [15]:
# Download all the images from image URLS in folder called "image"
path = "D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1"
downloading_process(image_urls=img_urls, path=path)

  path = "D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1"


Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_1.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_2.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_3.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_4.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_5.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_6.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images\image_7.jpg
Downloaded: D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection

## Renaming Images

In [None]:
# Reload the image items from folder images
path = "D:\\Projectwork Platform\\AIP-Computer-Vision\\Mobile_Phone_Recognition\\image_collection\\collection1"
img_folder = "val_images"
img_path = os.path.join(path, img_folder)

image_files = os.listdir(img_path)
sorted_files = sorted(image_files, key=lambda x: int(x.split('_')[1].split('.')[0]))

In [None]:
# Rename image items
def format_item_names(img_path): 
    # code outside for loop block 
    n = 1
    item_list = []
    # code inside for loop block
    for i, item in enumerate(os.listdir(img_path)): 
        # Replace the item number by index number
        seperator = lambda sep: item[item.find(sep):]
        extension = seperator(".")
        formatted_name = f"image_{str(i).zfill(3)}" + extension
        item_list.append(formatted_name)
    
    return item_list

new_item_names = format_item_names(img_path=img_path)
print(f"New item names: {new_item_names[:5]}")
print(f"Number of items: {len(new_item_names)}")

In [None]:
# Renaming process: modify item names by new formatted names
old_names = sorted(os.listdir(path=img_path), key=lambda x: int(x.split('_')[1].split('.')[0])) # 823
new_names = new_item_names[:]

source_dir = "D:\\Projectwork Platform\\AIP-Computer-Vision\\Mobile_Phone_Recognition\\image_collection\\collection1\\val_images"

img_items = []
for old_name, new_name in zip(old_names, new_names):
    old_file = os.path.join(source_dir, old_name)
    new_file = os.path.join(source_dir, new_name)
    img_items.append(new_file)
    # os.rename(old_file, new_file)
    # print(f"Renamed old name {old_name} --> new name {new_name}")
    print(f"Item {new_name} stored successfully")

## Image Storage

In [17]:
# Save image file names into dataframe
source_path = "D:\\Projectwork Platform\\AIP-Computer-Vision\\Mobile_Phone_Recognition\\image_collection\\collection1\\val_images"

def save_image_into_df(file_num):
    # Define a dataframe
    img_df = pd.DataFrame()
    img_df["Image_URL"] = [os.path.join(path, img) for img in new_item_names]
    img_df["Image_file"] = new_item_names
    img_df.to_csv(f"image{file_num}.csv")

  path = "D:\Projectwork Platform\AIP-Computer-Vision\Mobile_Phone_Recognition\image_collection\collection1\images"


In [None]:
# Save image dataset
save_image_into_df(file_num=1)
