In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import numpy as np

In [240]:
def _text_or_nan(node):
    """Return ``node.text`` for a found element, or NaN when the lookup returned None."""
    return np.nan if node is None else node.text


def Get_items(product_cards, Items_):
    """Append the fields of every product card to the column lists in ``Items_``.

    Parameters
    ----------
    product_cards : iterable
        Parsed product-card elements; each must offer a BeautifulSoup-style
        ``find(name, class_=...)`` lookup.
    Items_ : dict
        Maps the keys 'Image Link', 'Product Name', 'Price',
        'Product Category' and 'Color' to lists. The lists are extended in place.

    Returns
    -------
    dict
        The same ``Items_`` object that was passed in.

    Notes
    -----
    Exactly one value is appended to every column for each card, so the lists
    stay the same length. A field that cannot be found is recorded as ``np.nan``.
    """
    for product_card in product_cards:

        # Extract image link.
        try:
            media = product_card.find('div', class_='gl-product-card__media gl-product-card__media--hover')
            img_link = media.find('img')['src']
        except (AttributeError, KeyError, TypeError):
            # Missing media container, missing <img>, or <img> without a src attribute.
            img_link = np.nan
        Items_['Image Link'].append(img_link)

        # Extract Product Name.
        product_name = product_card.find('span', class_='gl-label gl-label--m gl-label--condensed gl-product-card__name')
        Items_['Product Name'].append(_text_or_nan(product_name))

        # Extract Price.
        price = product_card.find('div', class_='gl-price-item gl-price-item--small notranslate gl-product-card__price')
        if price is None:
            # No regular price element on this card: fall back to the sale-price
            # element. The lookup result is tested *before* reading ``.text`` so
            # the fallback is actually reachable.
            price = product_card.find('div', class_='gl-price-item gl-price-item--sale gl-price-item--small notranslate')
        Items_['Price'].append(_text_or_nan(price))

        # Extract Product Category.
        product_cat = product_card.find('div', class_='gl-product-card__category')
        Items_['Product Category'].append(_text_or_nan(product_cat))

        # Extract Color.
        color = product_card.find('div', class_='gl-product-card__color')
        Items_['Color'].append(_text_or_nan(color))

    return Items_

def DataFrame(driver, link, pages, wait_seconds=10):
    """Scrape ``pages`` listing pages under ``link`` and return the items as a DataFrame.

    Parameters
    ----------
    driver : object
        Browser driver exposing ``get(url)`` and ``page_source``.
    link : str
        Base URL of the listing; ``?page=<n>`` is appended for each page.
    pages : int
        Number of pages to visit, starting at page 1 (inclusive).
    wait_seconds : float, default 10
        Pause after each page load before the page source is read.

    Returns
    -------
    pandas.DataFrame
        One row per product card found, with the columns 'Image Link',
        'Product Name', 'Price', 'Product Category' and 'Color'.
    """
    Items_dict = {"Image Link": [],
                  "Product Name": [],
                  "Price": [],
                  "Product Category": [],
                  "Color": []
                  }

    for page in range(1, pages + 1):

        driver.get(link + f'?page={page}')
        # Fixed pause before reading the DOM; presumably gives the product grid
        # time to finish rendering.
        time.sleep(wait_seconds)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        product_cards = soup.find_all('li', class_='ProductCard')
        # Get_items extends the lists inside Items_dict in place, so its return
        # value does not need to be kept.
        Get_items(product_cards, Items_dict)

        # Progress marker every tenth page.
        if page % 10 == 0:
            print(page)

    return pd.DataFrame(Items_dict)

In [223]:
# Start a Chrome session (driver binary resolved by webdriver_manager) and open the home page.
url = "https://www.adidas.co.id/"
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)
website_contents = driver.page_source
# Name the parser explicitly, as DataFrame() does, so the parse does not depend
# on which optional parser libraries happen to be installed.
soup = BeautifulSoup(website_contents, 'html.parser')
# Anchors of the header menu selected by the XPath; keep only their href targets.
element_1 = driver.find_elements(By.XPATH, '//*[@id="root"]/header/div/div[2]/div/div[1]/div/ul/li/a')
links = [anchor.get_attribute('href') for anchor in element_1]

In [161]:
# Show the category URLs collected from the header menu.
print(links)

['https://www.adidas.co.id/pria.html', 'https://www.adidas.co.id/wanita.html', 'https://www.adidas.co.id/anak.html', 'https://www.adidas.co.id/sport.html', 'https://www.adidas.co.id/brands.html']


In [243]:
# Each entry: (heading printed before the run, category listing URL, pages to visit).
category_runs = (
    ('Pria :', 'https://www.adidas.co.id/pria.html', 107),
    ('Wanita :', 'https://www.adidas.co.id/wanita.html', 97),
    ('Anak :', 'https://www.adidas.co.id/anak.html', 24),
    ('Sport :', 'https://www.adidas.co.id/sport.html', 91),
    ('Brand :', 'https://www.adidas.co.id/brands.html', 98),
)

# Scrape the categories one after another, in the order listed above.
category_frames = []
for heading, category_url, page_count in category_runs:
    print(heading)
    category_frames.append(DataFrame(driver, category_url, page_count))

# Bind the results to the per-category names used by the later cells.
df_pria, df_wanita, df_anak, df_sport, df_brands = category_frames

Pria :
10
20
30
40
50
60
70
80
90
100
Wanita :
10
20
30
40
50
60
70
80
90
Anak :
10
20
Sport :
10
20
30
40
50
60
70
80
90
Brand :
10
20
30
40
50
60
70
80
90


In [245]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emits an extra "None" after every summary.
for category_df in (df_pria, df_wanita, df_anak, df_sport, df_brands):
    category_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2558 entries, 0 to 2557
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Image Link        2558 non-null   object
 1   Product Name      2558 non-null   object
 2   Price             1633 non-null   object
 3   Product Category  2558 non-null   object
 4   Color             1545 non-null   object
dtypes: object(5)
memory usage: 100.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2327 entries, 0 to 2326
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Image Link        2327 non-null   object
 1   Product Name      2327 non-null   object
 2   Price             1380 non-null   object
 3   Product Category  2327 non-null   object
 4   Color             1222 non-null   object
dtypes: object(5)
memory usage: 91.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5

In [246]:
# Scraping is finished: end the WebDriver session.
driver.quit()

In [309]:
# Stack the five per-category frames row-wise; ignore_index=True gives the
# combined table a fresh 0..n-1 index.
df_full = pd.concat([df_pria, df_wanita, df_anak, df_sport, df_brands], axis=0, ignore_index=True)
# info() prints the summary and returns None, so it is not wrapped in print().
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9949 entries, 0 to 9948
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Image Link        9941 non-null   object
 1   Product Name      9949 non-null   object
 2   Price             5891 non-null   object
 3   Product Category  9949 non-null   object
 4   Color             5487 non-null   object
dtypes: object(5)
memory usage: 388.8+ KB
None


In [310]:
# Preview the first ten rows of the combined table.
display(df_full.head(10))

Unnamed: 0,Image Link,Product Name,Price,Product Category,Color
0,https://www.adidas.co.id/media/catalog/product...,Sepatu Pureboost Go,Rp. 2.000.000,Pria Running,
1,https://www.adidas.co.id/media/catalog/product...,Sepatu Dropset Trainer,Rp. 2.200.000,Pria Training,4 warna
2,https://www.adidas.co.id/media/catalog/product...,Sepatu Dropset Trainer,Rp. 2.200.000,Pria Training,4 warna
3,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,Rp. 2.600.000,Pria Lifestyle,4 warna
4,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,Rp. 2.600.000,Pria Lifestyle,4 warna
5,https://www.adidas.co.id/media/catalog/product...,Sepatu Harden Stepback 3,Rp. 1.400.000,Uniseks Basket,3 warna
6,https://www.adidas.co.id/media/catalog/product...,Sepatu Advantage Court Lifestyle,Rp. 850.000,Pria Skateboarding,
7,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,Rp. 2.600.000,Pria Lifestyle,4 warna
8,https://www.adidas.co.id/media/catalog/product...,Sepatu Ultraboost 5 DNA Running Lifestyle,Rp. 3.300.000,Pria Skateboarding,4 warna
9,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,Rp. 2.600.000,Pria Lifestyle,4 warna


In [311]:
# Normalise the price text in one pass: drop the literal 'Rp.' prefix, then the
# remaining '.' separators, then surrounding whitespace. No numeric conversion
# is done here; the column still holds strings.
df_full['Price'] = (
    df_full['Price']
    .str.replace('Rp.', '', regex=False)
    .str.replace('.', '', regex=False)
    .str.strip()
)
display(df_full.head(10))

Unnamed: 0,Image Link,Product Name,Price,Product Category,Color
0,https://www.adidas.co.id/media/catalog/product...,Sepatu Pureboost Go,2000000,Pria Running,
1,https://www.adidas.co.id/media/catalog/product...,Sepatu Dropset Trainer,2200000,Pria Training,4 warna
2,https://www.adidas.co.id/media/catalog/product...,Sepatu Dropset Trainer,2200000,Pria Training,4 warna
3,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,2600000,Pria Lifestyle,4 warna
4,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,2600000,Pria Lifestyle,4 warna
5,https://www.adidas.co.id/media/catalog/product...,Sepatu Harden Stepback 3,1400000,Uniseks Basket,3 warna
6,https://www.adidas.co.id/media/catalog/product...,Sepatu Advantage Court Lifestyle,850000,Pria Skateboarding,
7,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,2600000,Pria Lifestyle,4 warna
8,https://www.adidas.co.id/media/catalog/product...,Sepatu Ultraboost 5 DNA Running Lifestyle,3300000,Pria Skateboarding,4 warna
9,https://www.adidas.co.id/media/catalog/product...,Sepatu NMD_V3,2600000,Pria Lifestyle,4 warna


In [312]:
# info() prints the summary and returns None; calling it directly avoids the
# stray "None" that print() would add.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9949 entries, 0 to 9948
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Image Link        9941 non-null   object
 1   Product Name      9949 non-null   object
 2   Price             5891 non-null   object
 3   Product Category  9949 non-null   object
 4   Color             5487 non-null   object
dtypes: object(5)
memory usage: 388.8+ KB
None


In [318]:
# Keep every row whose Price is not the empty string. Missing (NaN) prices
# compare unequal to '' and are therefore kept as well.
non_blank_price = df_full['Price'].ne('')
df_full = df_full.loc[non_blank_price]

In [322]:
# info() prints the summary and returns None; calling it directly avoids the
# stray "None" that print() would add.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9929 entries, 0 to 9947
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Image Link        9929 non-null   object
 1   Product Name      9929 non-null   object
 2   Price             5871 non-null   object
 3   Product Category  9929 non-null   object
 4   Color             5473 non-null   object
dtypes: object(5)
memory usage: 465.4+ KB
None


In [323]:
# Write the cleaned table as CSV to a file named 'Adidas Data' (no extension).
# NOTE(review): `index` is not passed, so the row index is presumably written as
# an unnamed first column (pandas default) — confirm this is intended.
df_full.to_csv('Adidas Data')