In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import math
from datetime import datetime
import os
from tqdm import tqdm

Crawl Products

In [6]:
# Crawl product data page by page
def product_crawling(root_URL, headers):
    """Crawl every listing page under ``root_URL`` and collect one record per product.

    Pages are requested as ``root_URL?p=1``, ``?p=2``, ... The crawl stops when
    a page contains the ``no_products_filter`` marker, when a successfully
    fetched page contains no product cards, or when too many consecutive
    requests fail (so a blocked or offline site cannot cause an endless loop).

    Args:
        root_URL: Listing URL without the ``?p=`` page parameter.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        list[dict]: One dict per product card with the keys ``Product ID``,
        ``VN Name``, ``EN Name``, ``Brand``, ``Category``, ``Actual Price``,
        ``Discount Price``, ``Discount Rate``, ``Variant``, ``Star``,
        ``Rating``, ``Sold`` and ``Link``. Text fields are ``None`` when the
        corresponding tag or attribute is missing; numeric fields fall back
        to ``0`` (``Product ID`` falls back to ``None``).
    """
    max_failed_pages = 5   # consecutive failed pages tolerated before giving up
    request_timeout = 30   # seconds; without it a stalled connection blocks forever

    def stripped_text(tag):
        """Return the stripped text of ``tag``, or None when the tag is missing."""
        return tag.text.strip() if tag is not None else None

    def digits_to_int(text):
        """Keep only the digits of ``text`` and convert them; 0 when there are none."""
        digits = re.sub(r"[^\d]", "", text or "")
        return int(digits) if digits else 0

    def to_int(text, default):
        """Convert ``text`` to int, returning ``default`` when it is missing or malformed."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return default

    def parse_prices(item):
        """Return ``(actual_price, discount_price, discount_rate)`` for one product card."""
        price_tag = item.find("div", class_="width_common block_price space_bottom_3")
        if price_tag is None:
            return 0, 0, 0

        old_price_tag = price_tag.find("span", class_="item_giacu txt_12 right")
        current_price = digits_to_int(
            stripped_text(price_tag.find("strong", class_="item_giamoi txt_16"))
        )

        # Case 1: only the original price exists. The product is not
        # discounted, so the selling price equals the actual price.
        if old_price_tag is None:
            return current_price, current_price, 0

        # Case 2: both the original (struck-through) and discounted price exist.
        actual_price = digits_to_int(stripped_text(old_price_tag))
        rate_tag = price_tag.find("span", class_="discount_percent2_deal")
        rate_text = stripped_text(rate_tag)
        discount_rate = to_int(rate_text.replace("%", ""), 0) if rate_text else 0
        return actual_price, current_price, discount_rate

    def parse_rating_and_sold(item):
        """Return ``(rating_count, sold_count)``; each is 0 when absent or unparsable."""
        rating = 0
        rating_tag = item.find("div", class_="block_count_by")
        if rating_tag is not None:
            parts = rating_tag.get_text(strip=True).split('|')
            # Only the "(<ratings>) | <sold>" layout carries a rating count.
            if len(parts) == 2:
                rating = to_int(parts[0].strip().replace("(", "").replace(")", ""), 0)

        sold = 0
        sold_tag = item.find("span", class_="item_count_by")
        if sold_tag is not None:
            # '.' is stripped before conversion (e.g. "1.234" -> 1234).
            sold = to_int(sold_tag.text.strip().replace('.', ''), 0)
        return rating, sold

    def parse_star(item):
        """Derive the star score (0-5) from the width percentage of the star bar."""
        star_tag = item.find("div", class_="number_start")
        try:
            style_attr = star_tag.get("style")
            width_value = style_attr.split("width:")[1].split(";")[0].strip().replace("%", "")
            return float(width_value) * 5 / 100
        except (AttributeError, IndexError, ValueError):
            return 0

    print("Crawling...")
    page = 1
    failed_pages = 0
    products = []

    while True:
        URL = root_URL + f"?p={page}"
        try:
            response = requests.get(URL, headers=headers, timeout=request_timeout)
            page_ok = response.status_code == 200
            if not page_ok:
                print(f"Skipping {URL}: HTTP {response.status_code}")
        except requests.RequestException as error:
            page_ok = False
            print(f"Skipping {URL}: {error}")

        if not page_ok:
            failed_pages += 1
            if failed_pages >= max_failed_pages:
                print(f"Stopping after {failed_pages} consecutive failed pages")
                break
            page += 1
            continue
        failed_pages = 0

        # NOTE(review): save_products later repairs latin-1 mojibake, so
        # response.text is presumably decoded with the wrong charset here;
        # consider parsing response.content instead - confirm before changing.
        soup = BeautifulSoup(response.text, "html.parser")

        # The site renders this marker once the page number is past the last page.
        if soup.find("div", class_="no_products_filter"):
            break

        items = soup.find_all("div", class_="item_sp_hasaki width_common relative")
        if not items:
            # No end marker but no products either: stop instead of paging forever.
            break

        for item in items:
            # Vietnamese and English product names
            vn_name = stripped_text(item.find("div", class_="vn_names"))
            en_name = stripped_text(item.find("div", class_="en_names"))

            # Brand
            brand_tag = item.find("div", class_="width_common txt_color_1 space_bottom_3")
            brand = stripped_text(brand_tag.find("strong")) if brand_tag is not None else None

            # URL, product ID, variant (volume) and category come from the anchor
            a_tag = item.find("a", class_="block_info_item_sp")
            if a_tag is not None:
                link = a_tag.get("href")
                product_id = to_int(a_tag.get("data-product"), None)
                data_variant = a_tag.get("data-variant")
                category = a_tag.get("data-category-name")
            else:
                link = product_id = data_variant = category = None

            actual_price, discount_price, discount_rate = parse_prices(item)
            rating, sold = parse_rating_and_sold(item)
            star = parse_star(item)

            products.append({
                "Product ID": product_id,
                "VN Name": vn_name,
                "EN Name": en_name,
                "Brand": brand,
                "Category": category,
                "Actual Price": actual_price,
                "Discount Price": discount_price,
                "Discount Rate": discount_rate,
                "Variant": data_variant,
                "Star": star,
                "Rating": rating,
                "Sold": sold,
                "Link": link,
            })

        page += 1

    return products

In [3]:
# Repair mis-decoded text back to UTF-8, then save the products
def save_products(df, file_name):
    """Repair mojibake in the text columns of ``df`` and write it to ``../Data/<file_name>``.

    The ``Category`` and ``VN Name`` columns are repaired in place on ``df``
    (the caller's frame is modified, as before). The output directory is
    created when it does not exist yet.

    Args:
        df: DataFrame containing at least the ``Category`` and ``VN Name`` columns.
        file_name: Name of the CSV file to create inside ``../Data``.
    """
    def fix_encoding(text):
        """Undo a UTF-8-read-as-latin-1 decoding; return ``text`` unchanged otherwise."""
        try:
            return text.encode('latin1').decode('utf-8')
        except (UnicodeError, AttributeError):
            # UnicodeError: the text was not mojibake, keep it as is.
            # AttributeError: the value is not a string (None / NaN).
            return text

    # Apply to the columns known to contain broken text
    df["Category"] = df["Category"].apply(fix_encoding)
    df["VN Name"] = df["VN Name"].apply(fix_encoding)

    output_path = f"../Data/{file_name}"
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False, encoding="utf-8")
    print("Thông tin Products đã được lưu thành công")

In [6]:
# Category listing page used as the starting URL for the product crawl.
root = "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html"

# HTTP headers handed to requests.get by the crawlers (browser-like User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

# Uncomment to re-run the product crawl and save the result:
# products = product_crawling(root_URL=root, headers=headers)
# df = pd.DataFrame(products)
# save_products(df, "suc-khoe.csv")

Crawl Comments

In [17]:
def save_reviews(df, product_id):
    """Repair mojibake in the review text and write ``../Data/reviews/<product_id>.csv``.

    The ``Username`` and ``Review`` columns are repaired in place on ``df``
    (the caller's frame is modified, as before). The output directory is
    created when it does not exist yet.

    Args:
        df: DataFrame containing at least the ``Username`` and ``Review`` columns.
        product_id: Identifier used as the CSV file name.
    """
    def fix_encoding(text):
        """Undo a UTF-8-read-as-latin-1 decoding; return ``text`` unchanged otherwise."""
        try:
            return text.encode('latin1').decode('utf-8')
        except (UnicodeError, AttributeError):
            # UnicodeError: the text was not mojibake, keep it as is.
            # AttributeError: the value is not a string (None / NaN).
            return text

    # Apply to the columns known to contain broken text
    df["Username"] = df["Username"].apply(fix_encoding)
    df["Review"] = df["Review"].apply(fix_encoding)

    output_path = f"../Data/reviews/{str(product_id)}.csv"
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False, encoding="utf-8")

In [32]:
# Crawl reviews
def crawl_reviews(file_path, request_headers=None, timeout=30):
    """Crawl the reviews of every product listed in ``file_path`` and save them per product.

    For each ``Product ID`` in the CSV, reviews are fetched from the
    ``product.getRatingMore`` endpoint and written with ``save_reviews``.
    Products that already have ``../Data/reviews/<id>.csv`` are skipped, so an
    interrupted crawl can be resumed by calling the function again.

    Args:
        file_path: CSV file containing a ``Product ID`` column.
        request_headers: HTTP headers for ``requests.get``. Defaults to the
            notebook-level ``headers`` dict, which is what earlier versions
            of this function always used.
        timeout: Per-request timeout in seconds.

    Returns:
        None. Results are written to disk.
    """
    max_failed_requests = 5  # consecutive failures tolerated per product

    if request_headers is None:
        # Backward-compatible default: fall back to the notebook-level headers.
        request_headers = headers

    product_ids = pd.read_csv(file_path)["Product ID"].tolist()

    def parse_comment(comment, product_id):
        """Turn one ``item_comment`` element into a review record."""
        user_tag = comment.find('strong', class_='txt_color_1')
        user_name = user_tag.text.strip() if user_tag is not None else None

        content_tag = comment.find('div', class_='content_comment')
        raw_content = content_tag.text if content_tag is not None else None
        content = raw_content.strip().replace("\n", "").replace("\t", "") if raw_content else None

        # Parse the timestamp, e.g. "08:23 | 30/11/2024" -> "08:23|30/11/2024"
        try:
            time_text = comment.find('div', class_='timer_comment').text.strip()
            final_time = datetime.strptime(time_text.replace(" ", ""), "%H:%M|%d/%m/%Y")
        except (AttributeError, ValueError):
            final_time = None

        # The star score is encoded as the width percentage of the star bar
        try:
            width_style = comment.find('div', class_='number_start')['style']
            width_value = width_style.split(':')[1].replace('%', '').replace(';', '').strip()
            stars = int(width_value) / 100 * 5
        except (TypeError, KeyError, IndexError, ValueError):
            # Missing star bar, missing style attribute, or unexpected style format.
            stars = None

        return {
            'Product ID': product_id,
            'Username': user_name,
            'Time': final_time,
            'Stars': stars,
            'Review': content
        }

    def crawl(product_id, offset):
        """Fetch all review pages of one product and return them as a DataFrame."""
        reviews = []
        failed_requests = 0

        while True:
            URL = f"https://hasaki.vn/ajax?api=product.getRatingMore&id={product_id}&offset={offset}&sort=date"

            json_data = None
            try:
                response = requests.get(URL, headers=request_headers, timeout=timeout)
                if response.status_code == 200:
                    json_data = response.json()
            except (requests.RequestException, ValueError) as error:
                # ValueError covers a body that is not valid JSON.
                tqdm.write(f"Request failed for product {product_id} (offset {offset}): {error}")

            if json_data is None:
                failed_requests += 1
                if failed_requests >= max_failed_requests:
                    # Give up on this product instead of looping forever.
                    tqdm.write(f"Giving up on product {product_id} after {failed_requests} failed requests")
                    break
            else:
                failed_requests = 0
                data = json_data.get('data') if isinstance(json_data, dict) else None
                if not isinstance(data, dict):
                    # Unexpected payload shape: nothing more can be read.
                    break

                # Stop once the endpoint reports no items (past the last page)
                if data.get('total_item', 0) == 0:
                    break

                html_data = data.get('html')
                if html_data is not None:
                    soup = BeautifulSoup(html_data, 'html.parser')
                    for comment in soup.find_all('div', class_='item_comment'):
                        reviews.append(parse_comment(comment, product_id))

            # Offsets advance by 10, except that 10 itself is skipped (0 -> 20 -> 30 ...).
            # NOTE(review): presumably the first response already covers the
            # first 20 reviews - confirm against the endpoint.
            offset += 10
            if offset == 10:
                offset = 20

        return pd.DataFrame(reviews)

    print("Crawling...")

    # Crawl the reviews of each product; the context manager closes the bar
    # even when an exception escapes.
    with tqdm(total=len(product_ids), desc="Crawling Reviews...") as progress_bar:
        for product_id in product_ids:
            review_file_path = f"../Data/reviews/{str(product_id)}.csv"
            if not os.path.isfile(review_file_path):
                df = crawl(product_id, offset=0)
                if len(df) != 0:
                    save_reviews(df, product_id=product_id)
            progress_bar.update(1)

In [34]:
# Each page of reviews advances the offset by 10.
crawl_reviews("../Data/products.csv")

Crawling...



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A