# Tokopedia order history scraper

## Dependencies

- [Selenium](https://selenium-python.readthedocs.io/) for browser interaction and the scraping process
- [Plotly](https://plotly.com/python/) for analytics visualisation
    - `notebook`, `ipywidgets`, and `pandas` are dependencies for `plotly`

In [None]:
%pip install selenium plotly notebook ipywidgets pandas

## Core logic

Change the following parameters to suit your environment:
- `DRIVER_PATH`: path to the ChromeDriver executable
- `CHROME_USER_PATH`: path to the base Chrome user data directory
- `INTERVAL`: how long (in seconds) the scraper waits after clicking the next-page button
- `FILTER_YEAR`: only orders dated in this year are collected; scraping stops at the first order from any other year
- `OUTPUT`: path of the file in which the scraped output is saved

In [None]:
# Selenium
# Path to the ChromeDriver executable; replace the bracketed placeholder.
DRIVER_PATH = r'[PATH-TO-CHROMEDRIVER]\chromedriver.exe'
# Chrome "User Data" directory, handed to Chrome through the `user-data-dir`
# argument; replace the bracketed placeholder.
CHROME_USER_PATH = r'[PATH-TO-CHROME-DATA]\User Data'

# Scraping
# Pause after each click on the next-page button, in seconds.
INTERVAL = 1  # in seconds.
# Kept as a string: it is compared against the end of each order's date text.
FILTER_YEAR = '2022'
# File that receives the scraped invoices as JSON; replace the placeholder.
OUTPUT = r'[PATH-TO-OUTPUT]\toped.json'

Initialize Selenium using the [Chrome](https://www.google.com/chrome/) browser and the matching [Chromium ChromeDriver](https://chromedriver.chromium.org/downloads). Selenium will try to open the browser at the profile selector. On the first run, select the designated profile and uncheck the option to show the selector on every start-up. If the URL does not open automatically, you can navigate to it manually inside the browser.

In [None]:
# Selenium init

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Imported next to its only use so this statement block is self-contained.
from selenium.webdriver.chrome.service import Service

# Order-history page that the scraper walks through.
URL = 'https://www.tokopedia.com/order-list'

options = Options()
# Start Chrome on an existing user-data directory (presumably so an already
# logged-in profile can be selected -- see the notes above this cell).
options.add_argument(f'user-data-dir={CHROME_USER_PATH}')
# Selenium 4 deprecated the `executable_path` keyword and later 4.x releases
# removed it; the driver location is supplied through a Service object.
driver = webdriver.Chrome(service=Service(DRIVER_PATH), options=options)

driver.get(URL)

Make sure the order list page is opened before continuing. Remove all filters and navigate to the first page; then you can proceed to scrape.

In [None]:
# Scraping process

from time import sleep
import json

# Absolute XPath matching every <div class="icon-category"> on the page; the
# scrape loop treats each match as the anchor of one order entry.
XPATH_CARD_ICONS = '//div[@class="icon-category"]'
# Relative to an icon: its following sibling <div>s. The scrape loop unpacks
# exactly four of them as category, date, (unused), invoice number.
XPATH_ICON_DIVS = 'following-sibling::div'
# The three expressions below are relative to an icon as well: they climb
# three ancestor levels and then search that subtree.
# Second <p> inside the "sum-price" block; its text is used as the price.
XPATH_PRICE = '../../..//div[@class="sum-price"]/div/p[2]'
# <p> whose data-testid starts with "shopName"; used when the category text
# is 'Belanja'.
XPATH_SHOP_NAME = '../../..//p[starts-with(@data-testid, "shopName")]'
# <h6> inside the product details; used in place of the shop name for every
# other category.
XPATH_PRODUCT_DETAILS = '../../..//div[@class="item-content"]//div[@class="product-details"]//h6'
# Absolute XPath of the pagination button labelled "Laman berikutnya".
XPATH_NEXT_BUTTON = '//button[@aria-label="Laman berikutnya"]'

def trim_price(price):
    """Return *price* without 'Rp' markers and '.' characters, whitespace-stripped."""
    without_currency = price.replace('Rp', '')
    # Splitting on '.' and re-joining drops every dot in the text.
    without_dots = ''.join(without_currency.split('.'))
    return without_dots.strip()

# Walk the order list page by page, collecting one dict per order entry, then
# write everything that was collected to OUTPUT as JSON.
invoices = []
done = False
while not done:
    icons = driver.find_elements(By.XPATH, XPATH_CARD_ICONS)
    for icon in icons:
        # The four sibling <div>s of the icon: category, date, an unused
        # one, and the invoice number.
        divs = icon.find_elements(By.XPATH, XPATH_ICON_DIVS)
        cat, dat, _, inv = divs

        # Stop at the first order whose date text does not end with the
        # requested year.
        if not str(dat.text).endswith(FILTER_YEAR):
            done = True
            break

        price = icon.find_element(By.XPATH, XPATH_PRICE)
        # 'Belanja' entries carry a shop name; for any other category the
        # product-details heading is stored in the 'shop' field instead.
        shop_xpath = XPATH_SHOP_NAME if str(cat.text) == 'Belanja' else XPATH_PRODUCT_DETAILS
        shop = icon.find_element(By.XPATH, shop_xpath)

        invoice = {
            'invoice': inv.text,
            'category': cat.text,
            'shop': shop.text,
            'price': trim_price(price.text)
        }
        print(invoice)
        invoices.append(invoice)

    if not done:
        # find_elements returns an empty list instead of raising, so a missing
        # next button ends the scrape cleanly and the results still get saved.
        next_buttons = driver.find_elements(By.XPATH, XPATH_NEXT_BUTTON)
        if next_buttons and next_buttons[0].is_enabled():
            next_buttons[0].click()
            sleep(INTERVAL)
        else:
            # No further page to visit.
            # NOTE(review): relies on the site disabling the button on the
            # last page -- confirm against the live page.
            done = True

# The context manager closes (and therefore flushes) the output file.
with open(OUTPUT, 'w', encoding='utf-8') as output_file:
    json.dump(invoices, output_file)

## Post-scrape: analytics and visualisations

### Basic aggregation logic

In [189]:
# Plotly init

import pandas
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Make "browser" the default Plotly renderer used by the fig.show() calls below.
pio.renderers.default = "browser"

def sum_per_key(dictionary):
    """Yield one ``{'key': ..., 'value': ...}`` record per key of *dictionary*.

    Every value of *dictionary* is an iterable of dicts; ``value`` is the sum
    of their ``'price'`` entries after conversion with ``int``.
    """
    for key, entries in dictionary.items():
        total = 0
        for entry in entries:
            total += int(entry['price'])
        yield {'key': key, 'value': total}

def figure_per_key(dictionary):
    """Show a bar chart with one bar per key of *dictionary*.

    Bar heights are the per-key price totals produced by ``sum_per_key``.
    """
    totals = sum_per_key(dictionary)
    chart = px.bar(totals, x='key', y='value', text_auto='.2s')
    chart.show()

def figure_per_group_key(dictionary):
    """Show a grouped bar chart with two bars per key of *dictionary*.

    Each value of *dictionary* is an iterable of invoice dicts. For every key
    the prices are summed separately for invoices whose number starts with
    'INV' (plotted as 'Belanja') and with 'IVR' (plotted as 'Tagihan').
    """
    def _sum(prefix):
        # One total per key, counting only invoices that start with *prefix*.
        for entries in dictionary.values():
            yield sum(int(x['price']) for x in entries if x['invoice'].startswith(prefix))

    # Build the frame in one step from plain lists; list(dictionary) gives the
    # keys in the same order that dictionary.values() is iterated above.
    df = pandas.DataFrame({
        'key': list(dictionary),
        'Belanja': list(_sum('INV')),
        'Tagihan': list(_sum('IVR')),
    })

    fig = px.bar(df, x='key', y=['Belanja', 'Tagihan'], text_auto='.2s', barmode='group')
    fig.show()

### Example of the analytics processes

In [None]:
# Bucket the scraped invoices by their category text, then chart the totals.
grouped_by_category = {}

for inv in invoices:
    category = inv['category']
    # setdefault creates the bucket the first time a category is seen.
    grouped_by_category.setdefault(category, []).append(inv)

figure_per_key(grouped_by_category)

In [190]:
# English month names, indexed by (month number - 1). These strings are the
# labels shown on the chart's x axis.
MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
grouped_by_month = {}

for inv in invoices:
    # The second '/'-separated field of the invoice number is sliced as if it
    # were a YYYYMMDD date (characters 4-5 = month) -- TODO confirm format.
    date = inv['invoice'].split('/')[1]
    month = MONTHS[int(date[4:6]) - 1]
    grouped_by_month.setdefault(month, []).append(inv)

figure_per_group_key(grouped_by_month)

In [None]:
# Bucket the scraped invoices by "<category> <shop>", then chart the totals.
grouped_by_shop = {}

for inv in invoices:
    shop = f"{inv['category']} {inv['shop']}"
    # setdefault creates the bucket the first time a label is seen.
    grouped_by_shop.setdefault(shop, []).append(inv)

figure_per_key(grouped_by_shop)