# 01 - Thu thập dữ liệu (Crawling)

Mục tiêu: Thu thập dữ liệu sản phẩm/giá/đánh giá từ các sàn TMĐT tại Việt Nam (Shopee, Lazada, Tiki, ...).

**Lưu ý:** Không sử dụng dataset có sẵn. Dữ liệu phải được tự thu thập qua Web Scraping hoặc API.

## Kế hoạch thu thập
- Nguồn dữ liệu: Shopee/Lazada/Tiki
- Trường dữ liệu: tên sản phẩm, giá, số lượng bán, đánh giá, cửa hàng, thời gian
- Định dạng lưu: CSV/JSON

In [1]:
# TODO: Cài đặt và kiểm tra thư viện
import requests
import pandas as pd

In [None]:
# Reuse the Tiki scraper helpers that live in src/.
from pathlib import Path
import sys
import importlib

# Locate the project root: the current working directory, or its parent when
# the notebook is started from inside notebooks/.
project_root = Path.cwd()
if not (project_root / "src").exists():
    project_root = project_root.parent

if not (project_root / "src").exists():
    raise FileNotFoundError("Khong tim thay thu muc src; hay mo notebook tu project root.")

# Re-running this cell must not keep appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Reload the module so the latest edits to the scraper are picked up.
import src.scrapers.tiki_scraper as tiki_scraper
importlib.reload(tiki_scraper)
from src.scrapers.tiki_scraper import fetch_tiki_products, save_with_timestamp

# Example: crawl 5 pages for the keyword "ao dai nu".
keyword = "ao dai nu"

# Anchor the output directory to the detected project root. A cwd-relative
# "../data/raw" only resolves inside the project when the kernel runs from
# notebooks/; from the project root it would point outside the repository.
raw_dir = project_root / "data" / "raw"
# NOTE(review): the scraper may already create this directory; creating it
# here is harmless either way.
raw_dir.mkdir(parents=True, exist_ok=True)

raw_df = fetch_tiki_products(keyword=keyword, pages=5, limit=40, sleep_seconds=2)
file_path = save_with_timestamp(raw_df, keyword=keyword, out_dir=str(raw_dir))

print(f"Saved to: {file_path}")
raw_df.head()


Saved to: ..\data\raw\shopee_ao_dai_nu_full.csv


In [12]:
# Run the Shopee crawling pipeline that lives in src/.
from pathlib import Path
import os
import sys
import importlib

# Locate the project root: the current working directory, or its parent when
# the notebook is started from inside notebooks/.
project_root = Path.cwd()
if not (project_root / "src").exists():
    project_root = project_root.parent

if not (project_root / "src").exists():
    raise FileNotFoundError("Khong tim thay thu muc src; hay mo notebook tu project root.")

# Re-running this cell must not keep appending the same entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Reload the module so the latest edits to the scraper are picked up.
import src.scrapers.shopee_scraper as shopee_scraper
importlib.reload(shopee_scraper)
from src.scrapers.shopee_scraper import fetch_shopee_products, save_full_dataset

# Search keyword.
keyword = "ao dai nu"

# SECURITY: a logged-in session cookie is a credential and must never be
# pasted into (or committed with) the notebook. Copy a valid cookie from your
# browser and expose it through the SHOPEE_COOKIE environment variable
# instead. Any cookie that was previously stored in this notebook should be
# treated as compromised: log that session out to invalidate it.
#
# Whitespace is collapsed because a cookie pasted across several lines carries
# line breaks, which are not valid in a single-line header value (presumably
# the scraper sends this as the HTTP Cookie header -- confirm in
# src/scrapers/shopee_scraper.py).
SHOPEE_COOKIE = " ".join(os.environ.get("SHOPEE_COOKIE", "").split())
if not SHOPEE_COOKIE:
    raise ValueError(
        "SHOPEE_COOKIE environment variable is not set. Copy a valid cookie "
        "from your browser and export it before running this cell."
    )

# Anchor the output directory to the detected project root. A cwd-relative
# "../data/raw" only resolves inside the project when the kernel runs from
# notebooks/; from the project root it would point outside the repository.
raw_dir = project_root / "data" / "raw"
# NOTE(review): the scraper may already create this directory; creating it
# here is harmless either way.
raw_dir.mkdir(parents=True, exist_ok=True)

raw_df = fetch_shopee_products(
    keyword=keyword,
    offsets=(0, 60, 120),
    limit=60,
    cookie=SHOPEE_COOKIE,
    checkpoint_every=10,
    out_dir=str(raw_dir),
)

file_path = save_full_dataset(raw_df, keyword=keyword, out_dir=str(raw_dir))

print(f"Saved to: {file_path}")
raw_df.head()


[WARN] Empty search results at offset 0: 90309999
[WARN] Empty search results at offset 60: 90309999
[WARN] Empty search results at offset 120: 90309999


ValueError: Khong tim thay itemid/shopid. Cookie co the da het han hoac keyword khong hop le.