In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import logging
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Route log records at INFO level and above to stdout (instead of the logging
# default, stderr), each prefixed with a timestamp and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

def setup_driver():
    """Start a headless Chrome WebDriver for scraping.

    Returns:
        A ``webdriver.Chrome`` instance launched with the command-line flags
        listed below and a 60-second page-load timeout.
    """
    # Chrome command-line switches, applied in this order.
    chrome_flags = (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "--blink-settings=imagesEnabled=false",
    )

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)

    browser = webdriver.Chrome(options=opts)
    browser.set_page_load_timeout(60)
    return browser

def _parse_rating_percent(style):
    """Return the ``width`` percentage from an inline style string.

    Args:
        style: Value of the element's ``style`` attribute; may be ``None``.

    Returns:
        The width as a float (e.g. ``88.0`` for ``"width: 88%;"``), or ``None``
        when the attribute is missing, has no ``width:`` declaration, or the
        value is not numeric.
    """
    if not style or "width:" not in style:
        return None
    # Keep only the width declaration so trailing declarations do not break float().
    raw = style.split("width:")[1].split(";")[0].replace("%", "").strip()
    try:
        return float(raw)
    except ValueError:
        return None


def extract_book_details(driver, retries=2):
    """Scrape the attributes of the product page currently loaded in ``driver``.

    Waits up to 20 seconds for the attribute table, reads its label/value rows,
    then reads the optional "sold" counter and rating widget. On any failure the
    page is refreshed after a 5-second pause and the extraction is retried.

    Args:
        driver: WebDriver that has already navigated to a product page.
        retries: Number of additional attempts after the first one fails.

    Returns:
        A 9-tuple of strings
        ``(book_code, author, translator, language, pub_year, form, sales,
        rating, num_pages)``. Any field that is not found is ``"N/A"``; if every
        attempt fails, all nine fields are ``"N/A"``.
    """
    table_xpath = "//table[@class='data-table table-additional']"

    for attempt in range(retries + 1):
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, f"{table_xpath}//th[contains(text(), 'Mã hàng')]"))
            )
            rows = driver.find_elements(By.XPATH, f"{table_xpath}//tr[th[@class='table-label']]")
            details = {}
            for row in rows:
                label = row.find_element(By.XPATH, ".//th").text.strip()
                # A row without the usual container div must not abort the whole
                # record: fall back to the cell text, or skip the row entirely.
                value_cells = (
                    row.find_elements(By.XPATH, ".//td//div[@class='attribute_link_container']")
                    or row.find_elements(By.XPATH, ".//td")
                )
                if not value_cells:
                    continue
                details[label] = value_cells[0].text.strip()

            book_code = details.get("Mã hàng", "N/A")
            author = details.get("Tác giả", "N/A")
            translator = details.get("Người Dịch", "N/A")
            language = details.get("Ngôn Ngữ", "N/A")
            pub_year = details.get("Năm XB", "N/A")
            form = details.get("Hình thức", "N/A")
            num_pages = details.get("Số trang", "N/A")

            # Optional widgets: one lookup each, using the first match if present.
            sales = "N/A"
            qty_elements = driver.find_elements(By.CSS_SELECTOR, ".product-view-qty-num")
            if qty_elements:
                sales = qty_elements[0].text.replace("Đã bán ", "").strip()

            rating = "N/A"
            rating_elements = driver.find_elements(By.CSS_SELECTOR, ".rating-box .rating")
            if rating_elements:
                rating_percent = _parse_rating_percent(rating_elements[0].get_attribute("style"))
                # An unparseable style leaves the rating as "N/A" instead of
                # throwing away the fields that were already extracted.
                if rating_percent is not None:
                    # The bar width is a percentage; dividing by 20 maps 0-100 to 0-5.
                    rating_score = f"{rating_percent / 20}/5"
                    total_elements = driver.find_elements(By.CSS_SELECTOR, ".rating-total")
                    review_count = total_elements[0].text.strip("()") if total_elements else "0 đánh giá"
                    rating = f"{rating_score} ({review_count})"

            return book_code, author, translator, language, pub_year, form, sales, rating, num_pages

        except Exception as e:
            if attempt < retries:
                logging.warning(f"Attempt {attempt + 1}/{retries + 1} failed for {driver.current_url}: {e}. Retrying...")
                time.sleep(5)
                driver.refresh()
            else:
                logging.error(f"Error extracting details from {driver.current_url} after {retries + 1} attempts: {e}")
                return ("N/A",) * 9

    # Only reached when retries < 0 (no attempt made); keep the return shape stable.
    return ("N/A",) * 9

def _load_listing_page(driver, target_url, page_number, genre_name, attempts=3):
    """Open a listing page and wait for product tiles, retrying on timeout.

    Returns:
        ``True`` once at least one product tile is present, ``False`` if every
        attempt timed out.
    """
    for attempt in range(1, attempts + 1):
        try:
            driver.get(target_url)
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "ul#products_grid li div.item-inner"))
            )
            return True
        except TimeoutException as e:
            if attempt < attempts:
                logging.warning(f"Attempt {attempt}/{attempts}: Timeout loading page {page_number} for {genre_name}. Retrying...")
                time.sleep(5)
            else:
                logging.error(f"Failed to load page {page_number} for {genre_name} after {attempts} attempts: {e}")
    return False


def _parse_listing_item(item):
    """Read title, product URL, listed old price and badge status from one product tile."""
    link = item.find_element(By.CSS_SELECTOR, "h2.p-name-list a")
    # NOTE(review): only the struck-through "old price" is read, so tiles without
    # one get "N/A" as their price — confirm this is the intended price column.
    old_price_elements = item.find_elements(By.CSS_SELECTOR, "p.old-price span.price")

    # "Trending" takes precedence over "Bestseller" when both badges are present.
    if item.find_elements(By.CSS_SELECTOR, "img.label-tagname[src*='ico_trending']"):
        trend_status = "Trending"
    elif item.find_elements(By.CSS_SELECTOR, "img.label-tagname[src*='ico_best_seller']"):
        trend_status = "Bestseller"
    else:
        trend_status = "No"

    return {
        "title": link.get_attribute("title"),
        "book_url": link.get_attribute("href"),
        "price": old_price_elements[0].text.strip() if old_price_elements else "N/A",
        "trend_status": trend_status,
    }


def crawl_genre(genre, max_books=100):
    """Scrape up to ``max_books`` books from one genre's best-selling listing.

    Pages through ``<genre url>?order=num_orders&limit=24&p=N`` with a dedicated
    browser, collects the product tiles of each page, then visits every product
    page to read its attributes via ``extract_book_details``.

    Args:
        genre: Mapping with keys ``"name"`` and ``"url"``.
        max_books: Stop once this many books have been collected.

    Returns:
        A list of dicts, one per book, keyed by the output column names. If the
        crawl stops early (page-load timeout, empty page, a page with no unseen
        books, or an unexpected error), the books gathered so far are returned.

    Raises:
        KeyError: If ``genre`` lacks ``"name"`` or ``"url"``.
    """
    # Read the keys before starting Chrome so a malformed genre cannot leak a browser.
    genre_name, base_url = genre["name"], genre["url"]
    logging.info(f"Starting crawl for {genre_name} at {base_url}")

    books_data = []
    seen_urls = set()  # product URLs already queued, so a repeated listing is not re-scraped
    page_number = 1
    driver = setup_driver()

    try:
        while len(books_data) < max_books:
            target_url = f"{base_url}?order=num_orders&limit=24&p={page_number}"
            logging.info(f"Loading page {page_number} for {genre_name}: {target_url}")
            if not _load_listing_page(driver, target_url, page_number, genre_name):
                break

            book_items = driver.find_elements(By.CSS_SELECTOR, "ul#products_grid li div.item-inner")
            total_items = len(book_items)
            logging.info(f"Found {total_items} items on page {page_number} for {genre_name}")

            if total_items == 0:
                logging.warning(f"No items found on page {page_number} for {genre_name}. Stopping.")
                break

            # Collect every tile's data before navigating away: the tile elements
            # belong to the listing page and cannot be read after driver.get().
            page_records = []
            for item in book_items:
                try:
                    record = _parse_listing_item(item)
                except Exception as e:
                    logging.error(f"Error parsing item on page {page_number} in {genre_name}: {e}")
                    continue
                book_url = record["book_url"]
                if not book_url or book_url in seen_urls:
                    continue
                seen_urls.add(book_url)
                page_records.append(record)

            if not page_records:
                # Nothing new on this page (e.g. the listing repeated itself):
                # stop rather than keep paging and collecting duplicates.
                logging.warning(f"No new books on page {page_number} for {genre_name}. Stopping.")
                break

            for idx, record in enumerate(page_records, start=1):
                if len(books_data) >= max_books:
                    break
                try:
                    driver.get(record["book_url"])
                    logging.info(f"Extracting detail for book {idx}/{len(page_records)} on page {page_number} in {genre_name}")
                    book_details = extract_book_details(driver)
                    books_data.append({
                        "Mã sách": book_details[0], "Tên sách": record["title"], "Lượt bán": book_details[6],
                        "Giá": record["price"], "Tác giả": book_details[1], "Người dịch": book_details[2],
                        "Ngôn ngữ": book_details[3], "Năm xuất bản": book_details[4], "Xu hướng": record["trend_status"],
                        "Hình thức": book_details[5], "Ratings": book_details[7], "Thể loại": genre_name,
                        "Link": record["book_url"], "Số trang": book_details[8]
                    })
                    logging.info(f"Collected data for: {record['title']}")
                    time.sleep(1)
                except Exception as e:
                    logging.error(f"Error extracting details from {record['book_url']}: {e}")
                    continue

            if len(books_data) >= max_books:
                logging.info(f"Reached target of {max_books} books for {genre_name}")
                break

            page_number += 1
            time.sleep(2)

    except Exception as e:
        # Keep what was already scraped instead of losing the whole genre to
        # one unexpected browser error.
        logging.error(f"Aborting crawl for {genre_name} on page {page_number}: {e}")
    finally:
        driver.quit()

    return books_data

def _collect_genres(driver, main_url):
    """Load the category page and return its sub-genre links.

    Tries to expand the collapsed category list first; failure to do so is
    logged and whatever links are present are still read.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts, one per distinct URL.
    """
    logging.info(f"Fetching genres from {main_url}")
    driver.get(main_url)

    try:
        show_more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#m-more-less-left_category .m-show-more-action"))
        )
        # Click through JavaScript rather than a native click on the element.
        driver.execute_script("arguments[0].click();", show_more_button)
        logging.info("Clicked 'Xem Thêm' button")
        WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#children-categories li a"))
        )
    except Exception as e:
        logging.error(f"Error clicking 'Xem Thêm' button: {e}")

    genres = []
    seen_urls = set()
    try:
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, "children-categories"))
        )
        for element in driver.find_elements(By.CSS_SELECTOR, "#children-categories li a"):
            genre_url = element.get_attribute("href")
            # A link without a target cannot be crawled; a repeated link would
            # only scrape the same genre twice.
            if not genre_url or genre_url in seen_urls:
                continue
            # .text is empty for elements that are not displayed (e.g. if the
            # list could not be expanded), so fall back to the raw text content.
            genre_name = element.text.strip() or (element.get_attribute("textContent") or "").strip()
            seen_urls.add(genre_url)
            genres.append({"name": genre_name, "url": genre_url})
            logging.info(f"Found genre: {genre_name} - {genre_url}")
    except Exception as e:
        logging.error(f"Error extracting genres: {e}")

    return genres


def main():
    """Discover the book genres, crawl each one in parallel, and save the results.

    Each genre is crawled by ``crawl_genre`` (up to 100 books) on a pool of three
    worker threads; the combined rows are written to ``fahasa_books_data.csv``.
    When nothing was collected the file is left untouched.
    """
    main_url = "https://www.fahasa.com/sach-trong-nuoc.html?order=num_orders&limit=24"

    driver = setup_driver()
    try:
        genres = _collect_genres(driver, main_url)
    finally:
        # Always release the browser, even if loading the category page fails.
        driver.quit()

    all_books_data = []
    max_workers = 3
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_genre = {executor.submit(crawl_genre, genre, 100): genre for genre in genres}
        for future in as_completed(future_to_genre):
            genre = future_to_genre[future]
            try:
                genre_data = future.result()
                all_books_data.extend(genre_data)
                logging.info(f"Completed crawling genre: {genre['name']} with {len(genre_data)} books")
            except Exception as e:
                logging.error(f"Error in parallel crawl for {genre['name']}: {e}")

    if not all_books_data:
        # Do not replace an existing dataset with an empty file after a failed run.
        logging.warning("No books were collected; 'fahasa_books_data.csv' was not written")
        return

    df = pd.DataFrame(all_books_data)
    # utf-8-sig writes a byte-order mark at the start of the file.
    df.to_csv("fahasa_books_data.csv", index=False, encoding="utf-8-sig")
    logging.info("Crawling completed. Data saved to 'fahasa_books_data.csv'")

# Start the crawl only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

2025-03-26 14:43:03,346 - INFO - Fetching genres from https://www.fahasa.com/sach-trong-nuoc.html?order=num_orders&limit=24
2025-03-26 14:43:07,259 - INFO - Clicked 'Xem Thêm' button
2025-03-26 14:43:07,651 - INFO - Found genre: Thiếu Nhi - https://www.fahasa.com/sach-trong-nuoc/thieu-nhi.html
2025-03-26 14:43:07,723 - INFO - Found genre: Giáo Khoa - Tham Khảo - https://www.fahasa.com/sach-trong-nuoc/giao-khoa-tham-khao.html
2025-03-26 14:43:07,744 - INFO - Found genre: Văn Học - https://www.fahasa.com/sach-trong-nuoc/van-hoc-trong-nuoc.html
2025-03-26 14:43:07,773 - INFO - Found genre: Tâm Lý - Kỹ Năng Sống - https://www.fahasa.com/sach-trong-nuoc/tam-ly-ky-nang-song.html
2025-03-26 14:43:07,802 - INFO - Found genre: Manga - Comic - https://www.fahasa.com/sach-trong-nuoc/manga-comic.html
2025-03-26 14:43:07,832 - INFO - Found genre: Sách Học Ngoại Ngữ - https://www.fahasa.com/sach-trong-nuoc/sach-hoc-ngoai-ngu.html
2025-03-26 14:43:07,862 - INFO - Found genre: Kinh Tế - https://www.fa

In [3]:
# Reload the scraped dataset from disk and display it.
# NOTE(review): the crawler writes the literal string "N/A" for fields it could
# not find; with read_csv's default NA handling that string is presumably parsed
# as NaN, which would explain the missing-value counts below — confirm this is intended.
df = pd.read_csv('fahasa_books_data.csv')
df

Unnamed: 0,Mã sách,Tên sách,Lượt bán,Giá,Tác giả,Người dịch,Ngôn ngữ,Năm xuất bản,Xu hướng,Hình thức,Ratings,Thể loại,Link,Số trang
0,8936214272648,Góc Nhỏ Có Nắng,10k+,68.000 đ,Little Rainbow,,Tiếng Việt,2024,Trending,Bìa Mềm,5.0/5 (0 đánh giá),Văn Học,https://www.fahasa.com/goc-nho-co-nang.html,64.0
1,8932000134749,Nếu Biết Trăm Năm Là Hữu Hạn - Ấn Bản Kỉ Niệm ...,1.9k,159.000 đ,Phạm Lữ Ân,,Tiếng Việt,2024,Trending,Bìa Mềm,0.0/5 (0 đánh giá),Văn Học,https://www.fahasa.com/neu-biet-tram-nam-la-hu...,263.0
2,8934974182375,Người Đàn Ông Mang Tên OVE (Tái Bản),2.5k,160.000 đ,Fredrik Backman,Hoàng Anh,Tiếng Việt,2022,Trending,Bìa Mềm,0.0/5 (0 đánh giá),Văn Học,https://www.fahasa.com/nguoi-dan-ong-mang-ten-...,452.0
3,8935235228351,Cây Cam Ngọt Của Tôi,2.9k,108.000 đ,José Mauro de Vasconcelos,"Nguyễn Bích Lan, Tô Yến Ly",,2020,Trending,Bìa Mềm,4.4/5 (0 đánh giá),Văn Học,https://www.fahasa.com/cay-cam-ngot-cua-toi.html,244.0
4,8936213491613,Hà Thanh Hải Yến - Ngang Qua Ngõ Nhỏ Bình An,190,196.000 đ,Quất Tử Bất Toan,Lục Bích,Tiếng Việt,2024,Trending,Bìa Mềm,0.0/5 (0 đánh giá),Văn Học,https://www.fahasa.com/ha-thanh-hai-yen-ngang-...,324.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,8936107810506,Graphics Issue #3-Define The Shapes (Tái Bản 2...,8,169.000 đ,Kee Agency,,,2018,No,Bìa Mềm,0.0/5 (0 đánh giá),Báo - Tạp Chí,https://www.fahasa.com/graphics-issue-3-define...,142.0
2003,8938507003892,Thiên Thần Nhỏ - Số 496,20,20.000 đ,Nhiều Tác Giả,,Tiếng Việt,2024,No,Bìa Mềm,0.0/5 (0 đánh giá),Báo - Tạp Chí,https://www.fahasa.com/thien-than-nho-so-496.html,50.0
2004,8938507003991,Bé Nấm Lùn - Tuyển Tập Đặc Biệt Vol.4,28,25.000 đ,Nhiều Tác Giả,,Tiếng Việt,2024,No,Bìa Mềm,0.0/5 (0 đánh giá),Báo - Tạp Chí,https://www.fahasa.com/be-nam-lun-tuyen-tap-da...,100.0
2005,8938507001775,Hoa Học Trò Số 1346 - Tặng Kèm Fanbook TWICE +...,94,30.000 đ,Nhiều Tác Giả,,Tiếng Việt,2020,No,Bìa Mềm,0.0/5 (0 đánh giá),Báo - Tạp Chí,https://www.fahasa.com/hoa-hoc-tro-so-1346-tan...,


In [4]:
# Count rows whose book code repeats the code of an earlier row.
# NOTE: duplicated() treats missing codes (NaN) as equal to one another, so rows
# without a code are included in this count.
df['Mã sách'].duplicated().sum()

np.int64(240)

In [5]:
# Number of missing (NaN) values in each column.
df.isna().sum()

Mã sách           52
Tên sách           0
Lượt bán          89
Giá               97
Tác giả           57
Người dịch      1102
Ngôn ngữ        1161
Năm xuất bản      72
Xu hướng           0
Hình thức         58
Ratings           52
Thể loại           0
Link               0
Số trang          82
dtype: int64