Environment: VS Code

version 1

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time, random
import pandas as pd
import os

# Tokopedia category scraper.
#
# Walks the listing pages of one category, opens every product in a new tab,
# reads the product/store details from the rendered HTML and writes all rows
# to a CSV file. The helpers below hold the pure parsing logic; the browser
# session itself runs in the ``__main__`` section at the bottom (``__name__``
# is "__main__" both for a script and for a notebook cell).

# Listing page the scraper starts from.
url = "https://www.tokopedia.com/p/makanan-minuman/makanan-ringan/camilan-instant"
# Upper bound on the number of listing-page iterations.
max_halaman = 50


def get_text(selector):
    """Return the stripped text of a parsed element, or '' if it is missing.

    ``selector`` is the result of a BeautifulSoup ``find`` call, which is
    ``None`` when nothing matched.
    """
    try:
        return selector.text.strip()
    except AttributeError:
        return ''


def build_product_url(href):
    """Turn a listing ``href`` into an absolute product URL.

    Returns ``None`` for links that must be skipped: anything pointing at
    ``ta.tokopedia.com`` or containing ``/product/``. Relative links are
    prefixed with the Tokopedia host.
    """
    if 'ta.tokopedia.com' in href or '/product/' in href:
        return None
    if href.startswith('https://'):
        return href
    return f"https://www.tokopedia.com{href}"


def split_store_rating(text):
    """Split a store summary such as ``"4.9 (1,2rb)"`` into (rating, reviews).

    Returns ``('', '')`` when the text does not contain both parts.
    """
    parts = text.split(" ")
    if len(parts) >= 2:
        return parts[0], parts[1].strip("()")
    return '', ''


def rating_count_columns(counts):
    """Map up to five rating counts onto the 'Rating 1..5 Count' columns.

    Missing positions are filled with ''; extra entries are ignored.
    """
    padded = list(counts[:5]) + [''] * 5
    return {
        f'Rating {position} Count': value
        for position, value in enumerate(padded[:5], start=1)
    }


def collect_product_links(page_source):
    """Return the set of product URLs found in a listing page's HTML."""
    soup = BeautifulSoup(page_source, "html.parser")
    links = set()
    for item in soup.find_all('div', class_='css-bk6tzz e1nlzfl2'):
        link_tag = item.find('a', href=True)
        if link_tag is None:
            continue
        product_url = build_product_url(link_tag['href'])
        if product_url is not None:
            links.add(product_url)
    return links


def scrape_product_detail(driver, product_link):
    """Read one product page (already loaded in ``driver``) into a row dict.

    Every field falls back to '' when its element is not found, so a changed
    page layout yields empty columns rather than an exception.
    """
    detail_soup = BeautifulSoup(driver.page_source, "html.parser")

    product_name = get_text(detail_soup.find('h1', class_='css-j63za0'))
    price = get_text(detail_soup.find('div', class_='price', attrs={'data-testid': 'lblPDPDetailProductPrice'}))
    rating_product = get_text(detail_soup.find('span', {'data-testid': 'lblPDPDetailProductRatingNumber'}))
    review_counts = get_text(detail_soup.find('span', {'data-testid': 'lblPDPDetailProductRatingCounter'}))
    sold_count = get_text(detail_soup.find('p', {'data-testid': 'lblPDPDetailProductSoldCounter'})).replace("Terjual ", "")

    # Category: the 4th breadcrumb entry (index 3), when the trail is that deep.
    breadcrumb_items = driver.find_elements(By.CSS_SELECTOR, 'ol[data-testid="lnkPDPDetailBreadcrumb"] li')
    category = breadcrumb_items[3].text.strip() if len(breadcrumb_items) >= 4 else ''

    # Store information.
    store_name = get_text(detail_soup.find('h2', class_='css-nc7wd7-unf-heading'))
    store_loc = get_text(detail_soup.find('h2', class_='css-g78l6p-unf-heading')).replace("Dikirim dari ", "")

    store_box = detail_soup.find('div', class_='css-e39d2g')
    store_summary = store_box.find('p') if store_box is not None else None
    rating_store, review_count_store = split_store_rating(get_text(store_summary))

    # Per-star rating counts, numbered in the order the spans appear.
    # NOTE(review): if the page lists the 5-star row first, "Rating 1 Count"
    # actually holds the 5-star count - confirm against a live page.
    rating_counts = [
        span.text.strip().replace('(', '').replace(')', '')
        for span in detail_soup.find_all('span', class_='css-myjxhx')
    ]

    return {
        'Product Name': product_name,
        'Price': price,
        'Product Rating': rating_product,
        **rating_count_columns(rating_counts),
        'Product Reviews Count': review_counts,
        'Sold Count': sold_count,
        'Category': category,
        'Store Name': store_name,
        'Store Location': store_loc,
        'Store Rating': rating_store,
        'Store Reviews Count': review_count_store,
        'URL': product_link
    }


def close_extra_windows(driver, main_window):
    """Close every window except ``main_window`` and focus ``main_window``.

    Closing by handle (instead of closing "the current window") guarantees
    the listing window is never closed by mistake after a failed tab switch.
    """
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)


def go_to_next_page(driver):
    """Click the 'next page' pagination button.

    Returns True when the click was issued, False when the button is missing,
    disabled or could not be clicked (the caller then stops paginating).
    """
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.css-dzvl4q-unf-pagination-item[aria-label='Laman berikutnya']"))
        )
        if not next_button.is_enabled():
            print("⚠️ Tombol 'Laman berikutnya' tidak aktif. Berhenti.")
            return False

        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)
        # JavaScript click, so an overlapping element cannot intercept it.
        driver.execute_script("arguments[0].click();", next_button)
        print("➡️ Klik laman berikutnya berhasil.")
        return True
    except Exception as e:
        print(f"⚠️ Tidak bisa klik laman berikutnya: {e}")
        return False


if __name__ == "__main__":
    # Setup WebDriver
    driver = webdriver.Chrome()
    data = []
    try:
        driver.get(url)
        main_window = driver.current_window_handle

        for halaman in range(max_halaman):
            print(f"\n🔄 Memproses halaman ke-{halaman + 1}")
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#zeus-root")))
                time.sleep(random.uniform(2, 4))

                # Scroll down in small steps; presumably this lets lazily
                # loaded product cards render before the HTML is read.
                for _ in range(17):
                    driver.execute_script("window.scrollBy(0, 250)")
                    time.sleep(0.7)

                driver.execute_script("window.scrollBy(50, 0)")
                time.sleep(1)

                product_links = collect_product_links(driver.page_source)
                print(f"🛒 Ditemukan {len(product_links)} produk.")

                for product_link in product_links:
                    try:
                        driver.execute_script("window.open(arguments[0]);", product_link)
                        # Pick the new tab by exclusion; the order of
                        # window_handles is not guaranteed.
                        new_tabs = [h for h in driver.window_handles if h != main_window]
                        driver.switch_to.window(new_tabs[-1])
                        time.sleep(random.uniform(3, 5))

                        row = scrape_product_detail(driver, product_link)
                        data.append(row)
                        print(f"✅ {row['Product Name']}")
                    except Exception as e:
                        print(f"❌ Gagal ambil detail dari {product_link} karena {e}")
                    finally:
                        # Always return to the listing window, whether the
                        # product was scraped or not.
                        close_extra_windows(driver, main_window)

                # Move on to the next listing page; stop when that fails.
                if not go_to_next_page(driver):
                    break

            except Exception as e:
                # The page is not advanced here, so the next iteration
                # retries whatever page the browser currently shows.
                print(f"⚠️ Error pada halaman ke-{halaman+1}: {e}")
                continue
    finally:
        try:
            # Save to a local CSV. Runs even after an interruption so the
            # rows collected so far are not lost.
            df = pd.DataFrame(data)

            folder_path = r"D:\TA\Data Tokopedia"
            os.makedirs(folder_path, exist_ok=True)
            csv_path = os.path.join(folder_path, "Data_Tokopedia_Camilan-instant.csv")

            df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            print(f"\n✅ Data berhasil disimpan ke: {csv_path}")
        finally:
            # Release the browser even if saving failed.
            driver.quit()




Scraping page 1: https://www.tokopedia.com/p/makanan-minuman/makanan-ringan/kacang?page=1
Request failed: HTTPSConnectionPool(host='www.tokopedia.com', port=443): Read timed out. (read timeout=10)
Request failed: HTTPSConnectionPool(host='www.tokopedia.com', port=443): Read timed out. (read timeout=10)
Request failed: HTTPSConnectionPool(host='www.tokopedia.com', port=443): Read timed out. (read timeout=10)
Failed to fetch page 1.


version 2

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time, random
import pandas as pd
import os

# Tokopedia category scraper (version 2).
#
# Walks the listing pages of one category, opens every product in a new tab,
# reads the product/store details from the rendered HTML and writes all rows
# to a CSV file. The helpers below hold the pure parsing logic; the browser
# session itself runs in the ``__main__`` section at the bottom (``__name__``
# is "__main__" both for a script and for a notebook cell).

# Listing page the scraper starts from.
url = "https://www.tokopedia.com/p/makanan-minuman/makanan-ringan/camilan-instant"
# Upper bound on the number of listing-page iterations.
max_halaman = 50


def get_text(selector):
    """Return the stripped text of a parsed element, or '' if it is missing.

    ``selector`` is the result of a BeautifulSoup ``find`` call, which is
    ``None`` when nothing matched.
    """
    try:
        return selector.text.strip()
    except AttributeError:
        return ''


def build_product_url(href):
    """Turn a listing ``href`` into an absolute product URL.

    Returns ``None`` for links that must be skipped: anything pointing at
    ``ta.tokopedia.com`` or containing ``/product/``. Relative links are
    prefixed with the Tokopedia host.
    """
    if 'ta.tokopedia.com' in href or '/product/' in href:
        return None
    if href.startswith('https://'):
        return href
    return f"https://www.tokopedia.com{href}"


def split_store_rating(text):
    """Split a store summary such as ``"4.9 (1,2rb)"`` into (rating, reviews).

    Returns ``('', '')`` when the text does not contain both parts.
    """
    parts = text.split(" ")
    if len(parts) >= 2:
        return parts[0], parts[1].strip("()")
    return '', ''


def rating_count_columns(counts):
    """Map up to five rating counts onto the 'Rating 1..5 Count' columns.

    Missing positions are filled with ''; extra entries are ignored.
    """
    padded = list(counts[:5]) + [''] * 5
    return {
        f'Rating {position} Count': value
        for position, value in enumerate(padded[:5], start=1)
    }


def collect_product_links(page_source):
    """Return the set of product URLs found in a listing page's HTML."""
    soup = BeautifulSoup(page_source, "html.parser")
    links = set()
    for item in soup.find_all('div', class_='css-bk6tzz e1nlzfl2'):
        link_tag = item.find('a', href=True)
        if link_tag is None:
            continue
        product_url = build_product_url(link_tag['href'])
        if product_url is not None:
            links.add(product_url)
    return links


def scrape_product_detail(driver, product_link):
    """Read one product page (already loaded in ``driver``) into a row dict.

    Every field falls back to '' when its element is not found, so a changed
    page layout yields empty columns rather than an exception.
    """
    detail_soup = BeautifulSoup(driver.page_source, "html.parser")

    # Product name
    product_name = get_text(detail_soup.find('h1', class_='css-j63za0'))

    # Product price
    price = get_text(detail_soup.find('div', class_='price', attrs={'data-testid': 'lblPDPDetailProductPrice'}))

    # Product rating
    rating_product = get_text(detail_soup.find('span', {'data-testid': 'lblPDPDetailProductRatingNumber'}))

    # Number of product reviews
    review_counts = get_text(detail_soup.find('span', {'data-testid': 'lblPDPDetailProductRatingCounter'}))

    # Number of units sold
    sold_count = get_text(detail_soup.find('p', {'data-testid': 'lblPDPDetailProductSoldCounter'})).replace("Terjual ", "")

    # Product category: the 4th breadcrumb entry (index 3), when present.
    breadcrumb_items = driver.find_elements(By.CSS_SELECTOR, 'ol[data-testid="lnkPDPDetailBreadcrumb"] li')
    category = breadcrumb_items[3].text.strip() if len(breadcrumb_items) >= 4 else ''

    # Store name
    store_name = get_text(detail_soup.find('h2', class_='css-nc7wd7-unf-heading'))

    # Store location
    store_loc = get_text(detail_soup.find('h2', class_='css-g78l6p-unf-heading')).replace("Dikirim dari ", "")

    # Store rating and store review count
    store_box = detail_soup.find('div', class_='css-e39d2g')
    store_summary = store_box.find('p') if store_box is not None else None
    rating_store, review_count_store = split_store_rating(get_text(store_summary))

    # Distribution of product ratings, numbered in the order the spans appear.
    # NOTE(review): if the page lists the 5-star row first, "Rating 1 Count"
    # actually holds the 5-star count - confirm against a live page.
    rating_counts = [
        span.text.strip().replace('(', '').replace(')', '')
        for span in detail_soup.find_all('span', class_='css-myjxhx')
    ]

    return {
        'Product Name': product_name,
        'Price': price,
        'Product Rating': rating_product,
        **rating_count_columns(rating_counts),
        'Product Reviews Count': review_counts,
        'Sold Count': sold_count,
        'Category': category,
        'Store Name': store_name,
        'Store Location': store_loc,
        'Store Rating': rating_store,
        'Store Reviews Count': review_count_store,
        'URL': product_link
    }


def close_extra_windows(driver, main_window):
    """Close every window except ``main_window`` and focus ``main_window``.

    Closing by handle (instead of closing "the current window") guarantees
    the listing window is never closed by mistake after a failed tab switch.
    """
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)


def go_to_next_page(driver):
    """Click the 'next page' pagination button.

    Returns True when the click was issued, False when the button is missing,
    disabled or could not be clicked (the caller then stops paginating).
    """
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.css-dzvl4q-unf-pagination-item[aria-label='Laman berikutnya']"))
        )
        if not next_button.is_enabled():
            print("⚠️ Tombol 'Laman berikutnya' tidak aktif. Berhenti.")
            return False

        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)
        # JavaScript click, so an overlapping element cannot intercept it.
        driver.execute_script("arguments[0].click();", next_button)
        print("➡️ Klik laman berikutnya berhasil.")
        return True
    except Exception as e:
        print(f"⚠️ Tidak bisa klik laman berikutnya: {e}")
        return False


if __name__ == "__main__":
    # Setup WebDriver
    driver = webdriver.Chrome()
    data = []
    try:
        driver.get(url)
        main_window = driver.current_window_handle

        for halaman in range(max_halaman):
            print(f"\n🔄 Memproses halaman ke-{halaman + 1}")
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#zeus-root")))
                time.sleep(random.uniform(2, 4))

                # Scroll down in small steps; presumably this lets lazily
                # loaded product cards render before the HTML is read.
                for _ in range(17):
                    driver.execute_script("window.scrollBy(0, 250)")
                    time.sleep(0.7)

                driver.execute_script("window.scrollBy(50, 0)")
                time.sleep(1)

                product_links = collect_product_links(driver.page_source)
                print(f"🛒 Ditemukan {len(product_links)} produk.")

                for product_link in product_links:
                    try:
                        driver.execute_script("window.open(arguments[0]);", product_link)
                        # Pick the new tab by exclusion; the order of
                        # window_handles is not guaranteed.
                        new_tabs = [h for h in driver.window_handles if h != main_window]
                        driver.switch_to.window(new_tabs[-1])
                        time.sleep(random.uniform(3, 5))

                        row = scrape_product_detail(driver, product_link)
                        data.append(row)
                        print(f"✅ {row['Product Name']}")
                    except Exception as e:
                        print(f"❌ Gagal ambil detail dari {product_link} karena {e}")
                    finally:
                        # Always return to the listing window, whether the
                        # product was scraped or not.
                        close_extra_windows(driver, main_window)

                # Navigate to the next listing page; stop when that fails.
                if not go_to_next_page(driver):
                    break

            except Exception as e:
                # The page is not advanced here, so the next iteration
                # retries whatever page the browser currently shows.
                print(f"⚠️ Error pada halaman ke-{halaman+1}: {e}")
                continue
    finally:
        try:
            # Save to a local CSV file. Runs even after an interruption so
            # the rows collected so far are not lost.
            df = pd.DataFrame(data)

            folder_path = r"D:\TA\Data Tokopedia"
            os.makedirs(folder_path, exist_ok=True)
            csv_path = os.path.join(folder_path, "Data_Tokopedia_Camilan-instant.csv")

            df.to_csv(csv_path, index=False, encoding='utf-8-sig')
            print(f"\n✅ Data berhasil disimpan ke: {csv_path}")
        finally:
            # Release the browser even if saving failed.
            driver.quit()


