# Data Collection Case from E-Commerce Website
Website: a seller's product page on Tokopedia

Tools and frameworks: Selenium, BeautifulSoup, pandas

by *Machffud Tra H. V*

### Helper functions

In [1]:
def get_multiple_element(xpath, timeout=60):
    '''
    Wait for, and return, every element matching an XPath expression.

    The lookup runs on the module-level Selenium ``driver``.

    Parameters
    ----------
    xpath : str
        XPath expression identifying the elements.
    timeout : int or float, optional
        Maximum number of seconds to wait for a match. Defaults to 60,
        the value that was previously hard-coded.

    Returns
    -------
    list
        The elements handed back by the wait condition.

    Raises
    ------
    TimeoutException
        Propagated from ``WebDriverWait.until`` when nothing matches
        within ``timeout`` seconds.
    '''
    # ``presence_of_all_elements_located`` already yields the matching
    # elements, so return them directly instead of querying the DOM a second
    # time through the deprecated ``find_elements_by_xpath`` helper.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, xpath)))


def get_text(element):
    '''
    Return the ``innerText`` of a Selenium element.

    The value is read by running a small JavaScript snippet on the
    module-level ``driver``, with ``element`` passed as its only argument.
    '''
    script = "return arguments[0].innerText;"
    text = driver.execute_script(script, element)
    return text

### Import libraries

In [2]:
import time
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Ignore all warnings

In [3]:
# Install a global "ignore" filter: every warning raised after this cell runs
# is suppressed, including deprecation notices from the libraries in use.
warnings.filterwarnings('ignore')

### Initialize the driver and the URL to crawl

In [4]:
# Seller page whose product listing is crawled.
url = 'https://www.tokopedia.com/ibingotech'
driver_path = './chromedriver' # path to ChromeDriver
# NOTE(review): passing the executable path positionally is the Selenium 3
# calling style; newer Selenium releases expect a Service object instead --
# confirm against the installed Selenium version.
driver = webdriver.Chrome(driver_path)
# Open the seller page in the browser window controlled by `driver`.
driver.get(url)


### Ensure the page is fully loaded

In [5]:
SCROLL_PAUSE_TIME = 3  # seconds to pause after each scroll

# Document height before any scrolling has happened.
last_height = driver.execute_script("return document.body.scrollHeight")

# Keep jumping to the bottom of the page until a scroll no longer makes the
# document taller, i.e. no further content was appended.
height_unchanged = False
while not height_unchanged:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)

    # Measure again and remember the result for the next comparison.
    new_height = driver.execute_script("return document.body.scrollHeight")
    height_unchanged = new_height == last_height
    last_height = new_height

### Retrieve the link, name, and price of each product on the seller's page

In [6]:
# Each child of the first inner container of the shop's product grid is
# treated as one product card.
xpath = '//div[@data-ssr="shopAllProductSSR"]/div/div[1]/*'
arr_menu = get_multiple_element(xpath)

# The leading "." makes the lookup relative to the card it is run on. An
# absolute "//" expression searches the whole document, which forced prices to
# be paired with cards by position and could misalign or raise IndexError.
xpath1 = './/div[@data-testid="linkProductPrice"]'

# Collect plain dicts and build the frame once: growing a DataFrame with
# ``append`` inside a loop is quadratic, and ``DataFrame.append`` no longer
# exists in pandas 2.x.
product_records = []
for am in arr_menu:
    # Query the card's anchors once and reuse the result.
    # NOTE(review): the second <a> is the one carrying href/title in the
    # original code; presumably the first wraps the thumbnail -- verify.
    anchors = am.find_elements(By.TAG_NAME, 'a')
    product_anchor = anchors[1]
    price_nodes = am.find_elements(By.XPATH, xpath1)
    product_records.append({
        'link': product_anchor.get_attribute('href'),
        'name': product_anchor.get_attribute('title'),
        # A card without a price element yields None instead of shifting
        # every following price by one row.
        'price': get_text(price_nodes[0]) if price_nodes else None,
    })

df_product_dyn = pd.DataFrame(product_records, columns=['link', 'name', 'price'])

# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() only closes the current window.
driver.quit()
df_product_dyn.info()
df_product_dyn.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   link    19 non-null     object
 1   name    19 non-null     object
 2   price   19 non-null     object
dtypes: object(3)
memory usage: 584.0+ bytes


Unnamed: 0,link,name,price
0,https://www.tokopedia.com/ibingotech/apple-pen...,Apple pencil 1 & 2 Case Cover Silicone Case Cover,Rp 209.000
1,https://www.tokopedia.com/ibingotech/fine-poin...,Fine Point Stainless Metal Apple Pencil Replac...,Rp 149.000
2,https://www.tokopedia.com/ibingotech/apple-pen...,Apple Pencil Stylus Pen Palm Rejection | iPad ...,Rp 529.000
3,https://www.tokopedia.com/ibingotech/apple-pen...,Apple pencil 1 & 2 Case Cover Gardient Color S...,Rp 149.000
4,https://www.tokopedia.com/ibingotech/ibingo-pa...,iBingo Palm Rejection Stylus Pen Apple Pencil ...,Rp 529.000
5,https://www.tokopedia.com/ibingotech/aluminium...,Aluminium Laptop Stand Holder Foldable Adjusta...,Rp 290.000
6,https://www.tokopedia.com/ibingotech/detachabl...,DETACHABLE Acrylic Case iPad 7/8 10.2 Air3 10....,Rp 80.000
7,https://www.tokopedia.com/ibingotech/phone-ipa...,PHONE iPad STAND HOLDER HP iPad TATAKAN DI MEJA,Rp 109.000
8,https://www.tokopedia.com/ibingotech/apple-pen...,"Apple Pencil Gen1 Cap, Magnetic Replacement Ca...",Rp 79.000
9,https://www.tokopedia.com/ibingotech/original-...,Original Apple Pencil 1 charger cable adapter,Rp 79.000


### Retrieve the category, sold amount, and rating of each product

In [7]:
def _extract_sold_amount(text):
    """Return ``text`` from its first digit onward, or None if it has no digit."""
    if not text:
        return None
    for position, char in enumerate(text):
        if char.isdigit():
            return text[position:]
    return None


# One entry per product link, in the same order as df_product_dyn['link'].
arr_categories = []
arr_sold_counter = []
arr_rating = []

driver_path = './chromedriver' # path to ChromeDriver
driver = webdriver.Chrome(driver_path)

delay = 3 # seconds

total_link = len(df_product_dyn['link'])
print(f"Crawling {total_link} product")
for count, product_url in enumerate(df_product_dyn['link'], start=1):
    print(f"{count} of {total_link}, {product_url}")
    driver.get(product_url)
    try:
        # Wait for the element with this id as a "page is ready" signal; on
        # timeout the page is scraped anyway on a best-effort basis.
        WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.ID, 'pdp_comp-ticker_info')))
    except TimeoutException:
        print("Loading took too much time!")

    # Defaults recorded when a field cannot be found on the page.
    kategori = None
    sold_counter = 0
    rating = None

    # Category: scan the product-info list for the row whose label mentions
    # "category"/"kategori" and read the bold value from that row.
    arr_li = driver.find_elements(By.XPATH, '//ul[@data-testid="lblPDPInfoProduk"]/*')
    for li in arr_li:
        try:
            key = get_text(li.find_element(By.TAG_NAME, 'span')).lower()
            if 'category' in key or 'kategori' in key:
                kategori = get_text(li.find_element(By.TAG_NAME, 'b'))
        except (WebDriverException, AttributeError):
            # Row without the expected <span>/<b> (or without text): skip it.
            # Unlike a bare ``except``, this still lets Ctrl-C stop the crawl.
            continue

    # Sold amount: keep the label text from its first digit onward. If the
    # label is missing or contains no digit, the default 0 is kept (the old
    # loop returned the last character of the label in that case).
    try:
        sold_drive = driver.find_element(
            By.XPATH, '//div[@data-testid="lblPDPDetailProductSoldCounter"]')
        sold_amount = _extract_sold_amount(get_text(sold_drive))
        if sold_amount is not None:
            sold_counter = sold_amount
    except WebDriverException:
        pass  # no sold counter on this page; keep the default

    # Rating: stays None when the page shows no rating element.
    try:
        rate_drive = driver.find_element(
            By.XPATH, '//span[@data-testid="lblPDPDetailProductRatingNumber"]')
        rating = get_text(rate_drive)
    except WebDriverException:
        pass  # no rating on this page; keep the default

    arr_sold_counter.append(sold_counter)
    arr_rating.append(rating)
    arr_categories.append(kategori)

# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() only closes the current window.
driver.quit()
df_product_dyn['categories'] = arr_categories
df_product_dyn['sold amount'] = arr_sold_counter
df_product_dyn['rating'] = arr_rating
df_product_dyn.info()

Crawling 19 product
1 of 19, https://www.tokopedia.com/ibingotech/apple-pencil-1-2-case-cover-silicone-case-cover-hitam-apple-pencil-1
2 of 19, https://www.tokopedia.com/ibingotech/fine-point-stainless-metal-apple-pencil-replacement-tip-untuk-gen-1-2-1-pc-clear-tip
3 of 19, https://www.tokopedia.com/ibingotech/apple-pencil-stylus-pen-palm-rejection-ipad-2018-dan-above-stylus-pen
4 of 19, https://www.tokopedia.com/ibingotech/apple-pencil-1-2-case-cover-gardient-color-silicone-case-cover-apple-pencil-1-hijau
5 of 19, https://www.tokopedia.com/ibingotech/ibingo-palm-rejection-stylus-pen-apple-pencil-2-untuk-ipad-2018-above-ibingo-pen
6 of 19, https://www.tokopedia.com/ibingotech/aluminium-laptop-stand-holder-foldable-adjustable-meja-laptop-laptop-stand
7 of 19, https://www.tokopedia.com/ibingotech/detachable-acrylic-case-ipad-7-8-10-2-air3-10-5-air4-10-9-pro-11-20-21-tempered-glass-ipad7-8-10-2
8 of 19, https://www.tokopedia.com/ibingotech/phone-ipad-stand-holder-hp-ipad-tatakan-di-meja-o

In [8]:
# Display the product table, now including the categories, sold amount and
# rating columns added by the previous cell.
df_product_dyn

Unnamed: 0,link,name,price,categories,sold amount,rating
0,https://www.tokopedia.com/ibingotech/apple-pen...,Apple pencil 1 & 2 Case Cover Silicone Case Cover,Rp 209.000,Stylus Tablet,0,
1,https://www.tokopedia.com/ibingotech/fine-poin...,Fine Point Stainless Metal Apple Pencil Replac...,Rp 149.000,Stylus Tablet,100+,4.8
2,https://www.tokopedia.com/ibingotech/apple-pen...,Apple Pencil Stylus Pen Palm Rejection | iPad ...,Rp 529.000,Stylus Tablet,30+,5.0
3,https://www.tokopedia.com/ibingotech/apple-pen...,Apple pencil 1 & 2 Case Cover Gardient Color S...,Rp 149.000,Tablet Sleeve,70+,5.0
4,https://www.tokopedia.com/ibingotech/ibingo-pa...,iBingo Palm Rejection Stylus Pen Apple Pencil ...,Rp 529.000,Stylus Tablet,100+,5.0
5,https://www.tokopedia.com/ibingotech/aluminium...,Aluminium Laptop Stand Holder Foldable Adjusta...,Rp 290.000,Meja Laptop,100+,5.0
6,https://www.tokopedia.com/ibingotech/detachabl...,DETACHABLE Acrylic Case iPad 7/8 10.2 Air3 10....,Rp 80.000,Casing & Cover Tablet,0,
7,https://www.tokopedia.com/ibingotech/phone-ipa...,PHONE iPad STAND HOLDER HP iPad TATAKAN DI MEJA,Rp 109.000,Holder Handphone,5,5.0
8,https://www.tokopedia.com/ibingotech/apple-pen...,"Apple Pencil Gen1 Cap, Magnetic Replacement Ca...",Rp 79.000,Stylus Tablet,18,5.0
9,https://www.tokopedia.com/ibingotech/original-...,Original Apple Pencil 1 charger cable adapter,Rp 79.000,Stylus Tablet,18,4.9


In [11]:
# Export the table as JSON into the working directory. A plain relative file
# name is used because the Windows-style '.\result.json' is not a path on
# Linux/macOS: there it creates a file literally named '.\result.json'.
df_product_dyn.to_json('result.json')

In [12]:
# Export the same table as CSV into the working directory. With the default
# to_csv settings the row index is written as an unnamed first column.
df_product_dyn.to_csv('result.csv')

In this task, besides retrieving the mandatory columns for each product (name, price, and category), I also retrieve the URL link, sold amount, and rating of each product on the seller's page. The **link** is kept so that the product information can be validated; the **sold amount** shows how popular a product is, so that in the future we can supply more of the popular products; and the **rating** reflects the quality of the product and the buyers' response to it, so that we can improve the product.