In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import math

In [2]:
# Crawl product data page by page
def _clean_text(tag):
    """Return the stripped text of ``tag``, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _attr(tag, name):
    """Return attribute ``name`` of ``tag``, or None when the tag is missing."""
    return tag.get(name) if tag is not None else None


def _parse_int(text, default=0):
    """Convert ``text`` to int, returning ``default`` when it is not a valid integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


def _digits_to_int(text, default=0):
    """Parse only the digits of ``text`` (e.g. "80.000 ₫" -> 80000); ``default`` if none."""
    digits = re.sub(r"[^\d]", "", text or "")
    return int(digits) if digits else default


def _parse_prices(price_tag):
    """Return ``(actual_price, discount_price, discount_rate)`` for one product card.

    A card without a crossed-out old price has no discount: the discount price
    equals the actual price and the rate is 0.
    """
    if price_tag is None:
        return 0, 0, 0

    current_tag = price_tag.find("strong", class_="item_giamoi txt_16")
    old_tag = price_tag.find("span", class_="item_giacu txt_12 right")
    current_price = _digits_to_int(current_tag.text) if current_tag is not None else None

    # Case 1: only the regular price is shown.
    if old_tag is None:
        price = current_price or 0
        return price, price, 0

    # Case 2: both the old (actual) price and the discounted price are shown.
    actual_price = _digits_to_int(old_tag.text)
    discount_price = current_price if current_price is not None else actual_price

    rate_tag = price_tag.find("span", class_="discount_percent2_deal")
    if rate_tag is not None:
        discount_rate = _parse_int(rate_tag.text.strip().replace("%", ""))
    elif actual_price > 0:
        # No percentage badge on the card: derive the rate from the two prices.
        discount_rate = round((actual_price - discount_price) * 100 / actual_price)
    else:
        discount_rate = 0
    return actual_price, discount_price, discount_rate


def _parse_rating_and_sold(item):
    """Return ``(rating_count, sold_count)`` for one product card (0 when absent)."""
    count_tag = item.find("div", class_="block_count_by")
    parts = count_tag.get_text(strip=True).split("|") if count_tag is not None else []

    # The rating count "(N)" is only present when the block reads "<rating>|<sold>".
    rating = 0
    if len(parts) >= 2:
        rating = _parse_int(parts[0].strip().replace("(", "").replace(")", ""))

    sold_tag = item.find("span", class_="item_count_by")
    # "." is the thousands separator in the sold counter.
    sold = _parse_int(sold_tag.text.strip().replace(".", "")) if sold_tag is not None else 0
    return rating, sold


def _parse_star(star_tag):
    """Convert the star bar's CSS ``width: NN%`` into a 0-5 score (0 when absent)."""
    style = star_tag.get("style") if star_tag is not None else None
    if not style or "width:" not in style:
        return 0
    width_value = style.split("width:")[1].split(";")[0].strip().replace("%", "")
    try:
        return float(width_value) * 5 / 100
    except ValueError:
        return 0


def _parse_product(item):
    """Build the record dict for a single product card element."""
    a_tag = item.find("a", class_="block_info_item_sp")

    brand_tag = item.find("div", class_="width_common txt_color_1 space_bottom_3")
    brand_name_tag = brand_tag.find("strong") if brand_tag is not None else None

    actual_price, discount_price, discount_rate = _parse_prices(
        item.find("div", class_="width_common block_price space_bottom_3")
    )
    rating, sold = _parse_rating_and_sold(item)

    return {
        "Product ID": _parse_int(_attr(a_tag, "data-product"), default=None),
        "VN Name": _clean_text(item.find("div", class_="vn_names")),
        "EN Name": _clean_text(item.find("div", class_="en_names")),
        "Brand": _clean_text(brand_name_tag),
        "Category": _attr(a_tag, "data-category-name"),
        "Actual Price": actual_price,
        "Discount Price": discount_price,
        "Discount Rate": discount_rate,
        "Variant": _attr(a_tag, "data-variant"),
        "Star": _parse_star(item.find("div", class_="number_start")),
        "Rating": rating,
        "Sold": sold,
        "Link": _attr(a_tag, "href"),
    }


def product_crawling(root_URL, headers, timeout=30, max_failures=3):
    """Crawl every listing page under ``root_URL`` and return the products found.

    Pages are requested as ``root_URL + "?p=<n>"`` starting at 1. Crawling stops
    when a page contains the ``no_products_filter`` marker, when a page contains
    no product cards at all, or after ``max_failures`` consecutive failed requests.

    Args:
        root_URL: Category listing URL without the page query string.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Per-request timeout in seconds.
        max_failures: Number of consecutive failed pages (non-200 status or a
            ``requests`` exception) tolerated before giving up. A failed page is
            skipped, so earlier results are kept.

    Returns:
        A list of dicts, one per product, with the keys "Product ID", "VN Name",
        "EN Name", "Brand", "Category", "Actual Price", "Discount Price",
        "Discount Rate", "Variant", "Star", "Rating", "Sold" and "Link".
    """
    products = []
    page = 1
    failures = 0

    while True:
        URL = root_URL + f"?p={page}"
        print(URL)
        page += 1

        try:
            response = requests.get(URL, headers=headers, timeout=timeout)
            page_ok = response.status_code == 200
        except requests.RequestException:
            page_ok = False

        if not page_ok:
            # Skip the page, but never loop forever on a persistent error.
            failures += 1
            if failures >= max_failures:
                print(f"Stopping after {failures} consecutive failed requests (last: {URL})")
                break
            continue
        failures = 0

        soup = BeautifulSoup(response.text, "html.parser")
        if soup.find("div", class_="no_products_filter") is not None:
            break

        items = soup.find_all("div", class_="item_sp_hasaki width_common relative")
        if not items:
            # No end marker but no products either: stop instead of paging endlessly.
            break
        products.extend(_parse_product(item) for item in items)

    return products

In [3]:
# Repair mis-decoded (mojibake) text columns, then write the products to CSV
def save_products(df, file_name):
    """Fix mojibake in the text columns of ``df`` and save it under ``../Data/``.

    The "Category" and "VN Name" columns are repaired in place (the caller's
    ``df`` is modified), then the frame is written as UTF-8 CSV without the index.

    Args:
        df: Frame-like object holding at least the "Category" and "VN Name" columns.
        file_name: Name of the CSV file to create inside ``../Data/``.
    """
    def fix_encoding(text):
        """Undo UTF-8 text that was wrongly decoded as Latin-1; otherwise return it unchanged."""
        try:
            return text.encode('latin1').decode('utf-8')
        except (AttributeError, UnicodeError):
            # AttributeError: not a string (e.g. None / NaN).
            # UnicodeError: the text was not mojibake, so keep it as is.
            return text

    # Apply the repair to the columns known to carry mis-decoded text.
    for column in ("Category", "VN Name"):
        df[column] = df[column].apply(fix_encoding)

    df.to_csv(f"../Data/{file_name}", index=False, encoding="utf-8")
    print("Thông tin Products đã được lưu thành công")

In [None]:
# Category listing to crawl; the crawler appends "?p=<page>" to this URL.
root = "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html"

# Browser-like User-Agent sent with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}

# Crawl all pages, tabulate the records, then write them to ../Data/suc-khoe.csv.
products = product_crawling(root, headers)
df = pd.DataFrame(data=products)
save_products(df=df, file_name="suc-khoe.csv")


https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=1
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=2
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=3
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=4
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=5
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=6
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=7
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=8
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=9
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=10
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=11
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=12
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=13
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=14
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=15
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=16
https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html?p=17
https://hasaki.vn/danh-muc/suc-khoe-lam-

In [None]:
# TODO: crawl product comments (not implemented yet).
# Example endpoint:
# link = "https://hasaki.vn/ajax?api=product.getRatingMore&id=9740&offset=20&sort=date"
# Each page advances `offset` by 10.