# Watsons Scraper

## Dependencies

In [None]:
import requests
import json
import time
import random

import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime, timedelta

import sys
from pathlib import Path

# Make the repository root importable (presumably so the `config` package
# imported right after this resolves -- confirm against the repo layout).
# The root is taken to be the parent of the current working directory, which
# is only correct when the notebook runs from its own 'notebooks/' folder.
repo_root = Path().resolve().parent
# Re-running this cell must not keep stacking duplicate entries on sys.path.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from config.config import get_environment

from config.config import data_import_json, data_export_json, data_import_pandas, data_export_pandas

## ENV

In [None]:
# Run configuration: get_environment is pointed at ../environments/env.json
# and the mapping it returns is indexed below.
ENV = get_environment(env_path="../environments", env_name="env.json")

# Sub-sections that are read more than once.
_target_cfg = ENV['TARGET']["2"]
_scraper_cfg = ENV['SCRAPER']

# Run identity, used to name the exported scrape / parse artefacts.
# Alternative kept for reference: derive the date at run time with
#   datetime.now().date() + timedelta(days=0)
content_date = ENV['CONTENT_DATE']
version = ENV['VERSION']

# Target site: entry "2" of the TARGET section.
website = _target_cfg['NAME']
url_scrape = _target_cfg['URL']

# Scraper mode flags taken from the SCRAPER section.
reparse_only = _scraper_cfg['REPARSE_ONLY']
continue_scraper = _scraper_cfg['CONTINUE_SCRAPER']

## Mining

### Get Initial Categories

In [None]:
# headers_cat = {
#     'Upgrade-Insecure-Requests': '1',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
#     'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
#     'sec-ch-ua-mobile': '?0',
#     'sec-ch-ua-platform': '"Windows"',
# }

# url_scrape_cat = 'https://www.watsons.co.id/id/lc/030000'

# print(f"Get Item Categories | URL: {url_scrape_cat}")

# response_cat = requests.get(
#     url_scrape_cat,
#     headers=headers_cat
# )

# print(f"Response Item Categories | URL: {url_scrape_cat} | STATUS CODE: {response_cat.status_code}")

# try:
#     soup_cat = BeautifulSoup(response_cat.content, 'html.parser')
#     json_cat = json.loads(soup_cat.select_one('script#wtcid-state').text)
#     # print(f"Total Categories: {len(item_cat['children'])}")
# except Exception as e:
#     raise e

# json_cat['cx-state']['e2-category']['category']['entities'].keys()

### Get All Items per Category

In [None]:
# Category slugs to crawl. The last path segment of each slug is the category
# id that goes into the search query.
list_cat_id = ['facial/c/030100', 'tangan-kaki-pembersih-bulu/c/030200', 'derma-skin-care/c/030300']
range_limit = 100  # sent as `pageSize` on every search request

# Loop-invariant, so defined once. Note that this rebinds the module-level
# `url_scrape` that was read from ENV earlier.
url_scrape = 'https://api.watsons.co.id/api/v2/wtcid/products/search'

# Upper bound in seconds for each HTTP call. Without it `requests.get` can
# block forever on a stalled connection. A timeout or connection error still
# propagates and stops the cell, exactly as connection errors did before.
request_timeout = 30

for index, cat_slug in enumerate(list_cat_id):
    cat_id = cat_slug.split('/')[-1]
    print(f"Category {cat_slug} {index+1}/{len(list_cat_id)} | Range Limit: {range_limit}")

    # Browser-like request headers; only `queue-target` varies per category.
    # NOTE: no `authorization` or `cookie` header is sent. Captured session
    # tokens/cookies must never be pasted into the source, not even as a
    # comment: they are credentials, and a wrapped comment line previously
    # leaked into the dict literal as live (invalid) code.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-US,en;q=0.9,id;q=0.8',
        'cache-control': 'no-cache, no-store, must-revalidate, post-check=0, pre-check=0',
        'expires': '0',
        'if-modified-since': 'Wed, 26 Nov 2025 11:36:20 GMT',
        'origin': 'https://www.watsons.co.id',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'queue-target': f'https://www.watsons.co.id/id/perawatan-kulit/{cat_slug}',
        'referer': 'https://www.watsons.co.id/',
        'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'vary': '*',
    }

    # `total_pages` is unknown until the first response arrives, so the loop
    # always runs once and then continues while pages remain.
    total_pages = 0
    loop_count = 0
    while loop_count < total_pages or loop_count == 0:
        # NOTE(review): pages are requested starting at 1. Search endpoints of
        # this shape are often 0-based for `currentPage` -- confirm that page 1
        # really is the first page. Left unchanged because the exported file
        # names (and the parser cell that reads them) depend on this numbering.
        page = loop_count+1
        params = {
            'fields': 'FULL',
            'query': f':bestSeller:category:{cat_id}',
            'pageSize': range_limit,
            'currentPage': page,
            'sort': 'bestSeller',
            'brandRedirect': 'true',
            'ignoreSort': 'false',
            'skipRedirect': 'false',
            'lang': 'id',
            'curr': 'IDR',
        }

        print(f"Get Category Items {cat_slug} {index+1}/{len(list_cat_id)} | Loop Count: {page}/{total_pages} | Range Items: {range_limit} | URL: {url_scrape}")

        response = requests.get(
            url_scrape,
            params=params,
            headers=headers,
            timeout=request_timeout
        )

        print(f"Response Category Items {cat_slug} {index+1}/{len(list_cat_id)} | Loop Count: {page}/{total_pages} | Range Items: {range_limit} | URL: {url_scrape} | STATUS CODE: {response.status_code}")

        try:
            # Decode the body once and reuse it for pagination, logging and export.
            payload = response.json()
            total_pages = payload['pagination']['totalPages']

            print(f"Identify Category Items {cat_slug} {index+1}/{len(list_cat_id)} | Loop Count: {page}/{total_pages} | Range Items: {range_limit} | URL: {url_scrape} | CATEGORY NAME: {payload['breadcrumbs'][0]['facetValueName']}")

            data_export_json(
                data=payload,
                website=website,
                folder_name=f'scraper/{website}',
                version=version,
                content_date=content_date, # "0000-00-00"
                additional_info=f"scrape-cat_{cat_id}-page{page}",
                metadata={
                    "status_code": response.status_code,
                    "page": page,
                    "range_limit": range_limit,
                    "url_scrape": url_scrape
                }
            )

        except Exception as e:
            # Best effort: keep the raw body of a page that could not be
            # decoded or exported so it can be inspected later.
            print(f"Identify Category Items {cat_slug} {index+1}/{len(list_cat_id)} | Loop Count: {page}/{total_pages} | Range Items: {range_limit} | URL: {url_scrape} | ERROR: {e}")

            data_export_json(
                data=response.text,
                website=website,
                folder_name=f'scraper/{website}',
                version=version,
                content_date=content_date, # "0000-00-00"
                additional_info=f"scrape-cat_{cat_id}-page{page}",
                metadata={
                    "status_code": response.status_code,
                    # An exception instance is not serializable by the standard
                    # json encoder, so store a readable string instead.
                    "error": f"{type(e).__name__}: {e}",
                    "page": page,
                    "range_limit": range_limit,
                    "url_scrape": url_scrape
                }
            )

        loop_count += 1
        # Still zero means the page count was never learned (first page failed
        # or the category is empty): nothing more to request for this category.
        if total_pages == 0:
            break

        # Small pause between page requests.
        time.sleep(0.5)

## Parser

In [None]:
def parser_item(
        response_json: dict,
        page: int,
        scrape_date: datetime,
        category_slug: "str | None" = None
    ):
    """Flatten one stored search-result page into a list of row dicts.

    Args:
        response_json: Decoded search response. Must provide ``products``
            (list of product dicts) and ``breadcrumbs[0]['facetValueName']``
            (used as the category name).
        page: Page number; only used in the error log line.
        scrape_date: Value stored, via ``str()``, in each row's
            ``scrape_date`` field.
        category_slug: Value for each row's ``category_slug`` field. When
            omitted, the module-level ``cat_slug`` is used if it exists
            (the legacy behaviour existing callers rely on), else ``None``.

    Returns:
        One dict per product, in input order. A product that fails to parse
        is logged and contributes an empty dict, so the output length always
        equals the number of products.

    Raises:
        KeyError, IndexError, TypeError: if ``products`` or the first
            breadcrumb is missing from ``response_json``.
    """
    if category_slug is None:
        # Legacy fallback: callers historically left the slug in a global.
        category_slug = globals().get('cat_slug')

    products = response_json['products']
    len_items = len(products)
    cat_name = response_json['breadcrumbs'][0]['facetValueName']

    parsed_list = []
    for index, item in enumerate(products):
        # Reset per item so the error log never hits an unbound name (first
        # item failing early) or reports the previous item's URL.
        item_url = None
        try:
            item_slug = item['url']
            item_url = f"https://www.watsons.co.id/id{item_slug}"
            images = item['images']
            # The mark-down block is optional; look it up once.
            markdown = item.get('elabMarkDownPrice')
            has_markdown = markdown is not None

            data_dict = {
                'scrape_date': str(scrape_date),
                'category_slug': category_slug,
                'category': cat_name,
                'slug': item_slug,
                'id': item['code'],
                'brand': item['masterBrand']['name'],
                'name': item['name'],
                'url': item_url,
                'url_image': images[0]['url'] if images else None,
                'review_total': item['productNumberOfReview'],
                'review_rating': item['reviewAvgRating'],
                'review_recommended': None,
                'wishlist': None,
                'variant_id': None,
                'variant_name': ' , '.join(item['elabVariantProductContentSizeUnits']) if 'elabVariantProductContentSizeUnits' in item else None,
                'stock': item['stock']['stockLevelStatus'],
                'ean': None,
                'currency': 'IDR',
                # Falsy values (e.g. an empty string) are normalised to None.
                'price_range': item['priceRange'] or None,
                'price': item['elabPrice']['value'],
                'price_after_disc': markdown['value'] if has_markdown else None,
                'price_disc_type': None,
                'price_disc': markdown['discountAmount'] if has_markdown else None,
                'price_disc_perc': markdown['discountRate'] if has_markdown else None,
                'is_package': None,
            }

        except Exception as e:
            # Best effort: one malformed product must not abort the page.
            data_dict = {}
            print(f"ERROR Parser Category: {cat_name} | Page: {page} | Item Index: {index} | Item Count: {index+1}/{len_items} | URL: {item_url} | {e}")

        parsed_list.append(data_dict)

    return parsed_list

In [None]:
# Re-read every stored scrape page of every category, flatten the products
# with parser_item, and export the combined table once at the end.
list_cat_id = ['facial/c/030100', 'tangan-kaki-pembersih-bulu/c/030200', 'derma-skin-care/c/030300']
range_limit = 100

url_scrape = 'https://api.watsons.co.id/api/v2/wtcid/products/search'
parsed_list = list()
try:
    # `cat_slug` must stay a module-level loop variable: parser_item picks the
    # category slug up from it.
    for index, cat_slug in enumerate(list_cat_id):
        cat_id = cat_slug.split('/')[-1]

        # The page count is only known after the first stored page is read, so
        # the loop always runs once and then continues while pages remain.
        total_pages = 0
        loop_count = 0
        while loop_count < total_pages or loop_count == 0:
            page = loop_count+1

            print(f"Parsing Category Items {cat_slug} {index+1}/{len(list_cat_id)} | Loop Count: {page}/{total_pages} | Range Items: {range_limit} | URL: {url_scrape}")

            try:
                response_json = data_import_json(
                    website=website,
                    folder_name=f'scraper/{website}',
                    version=version,
                    content_date=content_date, # "0000-00-00"
                    additional_info=f"scrape-cat_{cat_id}-page{page}"
                )
                # The scraped payload sits under the 'data' key of the stored file.
                response_json = response_json['data']

                parsed_list_temp = parser_item(
                    response_json=response_json,
                    page=page,
                    scrape_date=content_date
                )

                total_pages = response_json['pagination']['totalPages']

            except Exception as e:
                # An unreadable or malformed page ends this category; the
                # categories that follow are still processed.
                parsed_list_temp = list()
                print(e)
                break

            # In-place extend avoids recopying the accumulated rows per page.
            parsed_list.extend(parsed_list_temp)

            loop_count += 1
            if total_pages == 0:
                break

except Exception as e:
    # Anything unexpected in the loop is reported, and the rows gathered so
    # far are still exported below.
    print(e)

# Single export path for both the success and the partial-failure case
# (previously duplicated in the try and except branches).
df_parse = pd.DataFrame(parsed_list)

data_export_pandas(
    df_output=df_parse,
    website=website,
    folder_name=f'parser/{website}',
    version=version,
    content_date=content_date, # "0000-00-00"
    additional_info="parsed",
    incl_excel=True
)