## Label Extraction: Amazon Webpage

In [1]:
# import dependencies for system configuration
import os
import sys
import warnings
import time

# import dependencies for collecting images
import pandas as pd 
import numpy as np 

Approach: Image Extraction

+ Loading Dataset of amazon urls
+ Extracting images per url 
+ Downloading mobile phone images with created folder
+ Storing images

## Data Loading

In [2]:
# Load the amazon weblink dataset
weblink_file = "links_page1.csv"
amazon_ds = pd.read_csv(weblink_file)
amazon_ds.head()

Unnamed: 0,links
0,https://www.amazon.com.be/-/en/s?i=electronics...
1,https://www.amazon.com.be/-/en/s?i=electronics...
2,https://www.amazon.com.be/-/en/s?i=electronics...
3,https://www.amazon.com.be/-/en/s?i=electronics...
4,https://www.amazon.com.be/-/en/s?i=electronics...


## Data Extracting from URL (P1): Testcase

+ Setup WebDriver
+ Extracting labels from amazon webpage
+ Storing labels in DataFrame

In [3]:
# Import dependencies for extracting image content from amazon webpage
from selenium import webdriver 
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By 

In [4]:
# Set up Webdriver for Edge Browser
options = Options()
options.add_argument("--headless")

# Set up Browser service 
edgedriver_path = "D:\\Data_Engineering\\data_extraction\\msedgedriver.exe"
service = Service(executable_path=edgedriver_path)
driver = webdriver.Edge(service=service, options=options)

Test Case 1: Amazon with 10 labels

In [5]:
# Test case 1: 1 amazon link 
amazon_link = amazon_ds["links"].values[0]

def extract_labels(webdriver, link) -> list:
    webdriver.get(link)
    time.sleep(5)

    product_titles = driver.execute_script(""" 
        var h2_selector = "h2.a-size-base-plus.a-spacing-none.a-color-base.a-text-normal";
        var titles = [];
        var elements = document.querySelectorAll(h2_selector);
        elements.forEach(function(element){
            titles.push(element.innerText);                                       
        });
        return titles
    """)

    return product_titles

# Test case 2: 3 amazon links 
amazon_links = amazon_ds["links"].values[0:10]
item_title_lst = []
item_tot = 0
for i, link in enumerate(amazon_links): 
    # Extract the product titles from each link + take the size of each sequence
    text_sequence = extract_labels(webdriver=driver, link=link)
    seq_len = len(text_sequence)

    # Store text sequence in list of all item titles
    item_title_lst.append(text_sequence)
    print(f"Amazon URL {i + 1}: {seq_len} items (extracted successfully) ")
    item_tot += seq_len
print(f"Extraction Successful. Number of Items: {item_tot}")


Amazon URL 1: 24 items (extracted successfully) 
Amazon URL 2: 24 items (extracted successfully) 
Amazon URL 3: 24 items (extracted successfully) 
Amazon URL 4: 24 items (extracted successfully) 
Amazon URL 5: 24 items (extracted successfully) 
Amazon URL 6: 3 items (extracted successfully) 
Amazon URL 7: 3 items (extracted successfully) 
Amazon URL 8: 2 items (extracted successfully) 
Amazon URL 9: 12 items (extracted successfully) 
Amazon URL 10: 3 items (extracted successfully) 
Extraction Successful. Number of Items: 143


In [6]:
item_titles = np.array([item for title_seq in item_title_lst for item in title_seq])
item_titles[:5]

array(['Samsung Galaxy SM-G780GZWDEUB 6.5 Inch Dual SIM Hybrid 4G USB Type-C 6GB 128GB 4500mAh Mint',
       'Samsung Galaxy S20 FE 5G Unlocked Blue',
       'SAMSUNG A135F/DSN Galaxy A13 Dual SIM (6.6 inches - 4/128GB) Black',
       'Samsung SM-A156B Galaxy A15 Dual SIM 5G 4GB RAM 128GB Blue Black EU',
       'Samsung Galaxy S20 FE 5G 6GB/128GB Purple (Lavender) Dual SIM G781B'],
      dtype='<U192')

## Data Extraction P2: Full Page (Final)

In [7]:
# Iterated Extraction loop 
def iterative_extraction(amazon_links):
    # Define constants
    item_title_lst = []
    item_tot = 0

    # Extraction process
    for i, link in enumerate(amazon_links): 
        # Extract the product titles from each link + take the size of each sequence
        text_sequence = extract_labels(webdriver=driver, link=link)
        seq_len = len(text_sequence)

        # Store text sequence in list of all item titles
        item_title_lst.append(text_sequence)
        print(f"Amazon URL {i}: {seq_len} items (extracted successfully) ")
        item_tot += seq_len
    print(f"Extraction Successful. Number of Items: {item_tot}")
    
    # Flatten all item titles
    item_titles = np.array([item for title_seq in item_title_lst for item in title_seq])

    return item_titles

# Extract with full page
full_page = amazon_ds["links"].values
label_list = iterative_extraction(amazon_links=full_page)

Amazon URL 0: 24 items (extracted successfully) 
Amazon URL 1: 24 items (extracted successfully) 
Amazon URL 2: 24 items (extracted successfully) 
Amazon URL 3: 24 items (extracted successfully) 
Amazon URL 4: 24 items (extracted successfully) 
Amazon URL 5: 3 items (extracted successfully) 
Amazon URL 6: 3 items (extracted successfully) 
Amazon URL 7: 2 items (extracted successfully) 
Amazon URL 8: 12 items (extracted successfully) 
Amazon URL 9: 3 items (extracted successfully) 
Amazon URL 10: 1 items (extracted successfully) 
Amazon URL 11: 9 items (extracted successfully) 
Amazon URL 12: 4 items (extracted successfully) 
Amazon URL 13: 3 items (extracted successfully) 
Amazon URL 14: 9 items (extracted successfully) 
Amazon URL 15: 18 items (extracted successfully) 
Amazon URL 16: 8 items (extracted successfully) 
Amazon URL 17: 24 items (extracted successfully) 
Amazon URL 18: 24 items (extracted successfully) 
Amazon URL 19: 1 items (extracted successfully) 
Amazon URL 20: 5 item

## Label Storage

In [8]:
# Store label list
label_ds = pd.DataFrame()
label_ds["Labels"] = label_list
label_ds.to_csv("labels1.csv")