In [1]:
import csv

import time

import pandas as pd

from selenium import webdriver

from selenium.webdriver.chrome.service import Service

from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException



def setup_driver():

     """Setup and return the Chrome WebDriver"""

     chrome_options = Options()

     # Uncomment below line to run in headless mode if needed

     # chrome_options.add_argument("--headless")    

     # chrome_options.add_argument("--no-sandbox")

     # chrome_options.add_argument("--disable-dev-shm-usage")

    

     driver = webdriver.Chrome(options=chrome_options)

     return driver



def extract_reviews_from_page(driver):

     """Extract review titles and content from current page"""

     reviews = []

    

     # Wait for reviews to load

     try:

         WebDriverWait(driver, 10).until(

            EC.presence_of_element_located((By.CLASS_NAME, "review-comment__title"))

         )

    

         # Find all review elements

         review_titles = driver.find_elements(By.CLASS_NAME, "review-comment__title")

         review_contents = driver.find_elements(By.CLASS_NAME, "review-comment__content")

    

         # Extract data

         for i in range(len(review_titles)):

            if i < len(review_contents):

              title = review_titles[i].text.strip()

              content = review_contents[i].text.strip()

              reviews.append({

                   'title': title,

                   'content': content

              })

              

     except TimeoutException:

         print("No reviews found or page took too long to load")

    

     return reviews



def has_next_page(driver):

     """Check if there's a next page button that's clickable"""

     try:

         next_button = driver.find_element(By.CSS_SELECTOR, "a.btn.next")

         # Check if the button exists and is not disabled

         return next_button.is_displayed() and "disabled" not in next_button.get_attribute("class")

     except NoSuchElementException:

         return False



def click_next_page(driver):

     """Click the next page button"""

     try:

         next_button = WebDriverWait(driver, 10).until(

            EC.element_to_be_clickable((By.CSS_SELECTOR, "a.btn.next"))

         )

         driver.execute_script("arguments[0].click();", next_button)

         # Wait for the page to load

         time.sleep(2)

         return True

     except (NoSuchElementException, TimeoutException, StaleElementReferenceException):

         print("Next button not found or not clickable")

         return False



def crawl_product_reviews(driver, url):

     """Crawl all reviews for a given product URL"""

     all_reviews = []

     product_data = []

    

     try:

         driver.get(url)

         time.sleep(3)      # Allow page to load completely

    

         # Scroll to reviews section if available

         try:

            reviews_section = WebDriverWait(driver, 10).until(

              EC.presence_of_element_located((By.CLASS_NAME, "review-comment__title"))

            )

            driver.execute_script("arguments[0].scrollIntoView();", reviews_section)

         except TimeoutException:

            print(f"No reviews found for URL: {url}")

            return product_data

    

         page_num = 1

         while True:

            print(f"Processing page {page_num} for URL: {url}")

            reviews = extract_reviews_from_page(driver)

            all_reviews.extend(reviews)

    

            # Create product data entries

            for review in reviews:

              product_data.append({

                   'title': review['title'],

                   'content': review['content']

              })

    

            if has_next_page(driver):

              if not click_next_page(driver):

                   break

              page_num += 1

            else:

              break

              

     except Exception as e:

         print(f"Error processing URL {url}: {e}")

    

     return product_data



def main():

     # Read URLs from the csv file

     try:

         urls_df = pd.read_csv("url_final.csv")

# Assuming the column with URLs is named 'url'

# urls = urls_df['URL'].tolist()

         urls = ['https://tiki.vn/tui-vien-giat-xa-maxkleen-huong-nuoc-hoa-huyen-dieu-15g-x-34vien-p275707347.html?spid=275707349']

     except Exception as e:

         print(f"Error reading URL file: {e}")

         return

    

     driver = setup_driver()

    

     # Initialize CSV file

     with open("raw_data.csv", 'w', newline='', encoding='utf-8') as f:

         writer = csv.DictWriter(f, fieldnames=['title', 'content'])

         writer.writeheader()

    

     # Process each URL

     for url in urls:

         print(f"Processing: {url}")

         review_data = crawl_product_reviews(driver, url)

    

         # Append data to CSV

         with open("raw_data.csv", 'a', newline='', encoding='utf-8') as f:

            writer = csv.DictWriter(f, fieldnames=['title', 'content'])

            writer.writerows(review_data)

    

     driver.quit()

     print("All reviews have been collected and saved to raw_data.csv")



if __name__ == "__main__":

     main()

The chromedriver version (134.0.6998.165) detected in PATH at C:\Webdriver\chromedriver.exe might not be compatible with the detected chrome version (135.0.7049.84); currently, chromedriver 135.0.7049.84 is recommended for chrome 135.*, so it is advised to delete the driver in PATH and retry


Processing: https://tiki.vn/tui-vien-giat-xa-maxkleen-huong-nuoc-hoa-huyen-dieu-15g-x-34vien-p275707347.html?spid=275707349
No reviews found for URL: https://tiki.vn/tui-vien-giat-xa-maxkleen-huong-nuoc-hoa-huyen-dieu-15g-x-34vien-p275707347.html?spid=275707349
All reviews have been collected and saved to raw_data.csv


In [12]:
import pandas as pd
df = pd.read_csv('test.csv')

In [16]:
df

Unnamed: 0,name,price_old,price_new,warranty,CPU,Ram_size,Ram_type,Hard_Drive,screen,resolution,graphics,size_weight,material,os,year,battery
0,Laptop Acer Switch 5 SW512 52P 34RS i3 7130U/4...,7620000.0,22490000.0,1 tháng tại TGDĐ,Intel Core i3 Kabylake - 7130U,4 GB,DDR3L,128 GB SSD M2 PCIe,12 inch,2K (2160 x 1440),Card tích hợp - Intel UHD Graphics 620,Dài Dài 292.9 mm - Rộng Rộng 201.8 mm - Dày Dà...,Vỏ kim loại,Windows 10 Home SL,2019,2-cell Li-ion
1,Laptop Acer Swift 3 SF315 52 50T9 i5 8250U/8GB...,5760000.0,16990000.0,1 tháng tại TGDĐ,Intel Core i5 Coffee Lake - 8250U,8 GB,DDR4 (Onboard +1 khe),Hỗ trợ khe cắm HDD SATA\n256 GB SSD M.2 SATA 3,15.6 inch,Full HD (1920 x 1080),Card tích hợp - Intel UHD Graphics 620,Dài Dài 359 mm - Rộng Rộng 243 mm - Dày Dày 16...,Vỏ kim loại,Windows 10 Home SL,2018,"Li-Polymer, 80 Wh"
2,Laptop Acer Swift 7 SF714 52T 76C6 i7 8500Y/16...,16940000.0,49990000.0,1 tháng tại TGDĐ,Intel Core i7 Coffee Lake - 8500Y,16 GB,DDR3 (Onboard),512 GB SSD NVMe PCIe,14 inch,Full HD (1920 x 1080),Card tích hợp - Intel UHD Graphics 615,Dài Dài 317.9 mm - Rộng Rộng 191.5 - Dày Dày 9...,Nhôm - Magie,Windows 10 Home SL,2019,3-cell Li-ion
3,Laptop Acer Swift 5 SF514 53T 51EX i5 8265U/8G...,8130000.0,23990000.0,1 tháng tại TGDĐ,Intel Core i5 Coffee Lake - 8265U,8 GB,DDR4 (Onboard),256 GB SSD NVMe PCIe,14 inch,Full HD (1920 x 1080),Card tích hợp - Intel UHD Graphics 620,Dài 329 mm - Rộng 228 mm - Dày 14.9 mm - Nặng ...,Nhôm - Magie,Windows 10 Home SL,2018,2-cell Li-Polymer
4,Laptop CHUWI CoreBook X i5 8259U/16GB/512GB/Win10,5580000.0,15490000.0,1 tháng tại TGDĐ,Intel Core i5 Coffee Lake - 8259U,16 GB,DDR4 (Onboard),512 GB SSD\nHỗ trợ khe cắm SSD M.2 SATA3 (nâng...,14 inch,2.2K (2240x1400),Card tích hợp - Intel Iris Plus Graphics 655,Dài 310 mm - Rộng 229.5 mm - Dày 20.6 mm - Nặn...,Vỏ kim loại,Windows 10 Home SL,2021,46.2 Wh
5,Laptop Acer Swift 5 SF514 53T 740R i7 8565U/8G...,9480000.0,27990000.0,1 tháng tại TGDĐ,Intel Core i7 Coffee Lake - 8565U,8 GB,DDR4 (Onboard),256 GB SSD NVMe PCIe,14 inch,Full HD (1920 x 1080),Card tích hợp - Intel UHD Graphics 620,Dài 329 mm - Rộng 228 mm - Dày 14.9 mm - Nặng ...,Nhôm - Magie,Windows 10 Home SL,2018,2-cell Li-Polymer
6,Laptop CHUWI CoreBook X i5 8259U/16GB/512GB/Win10,5680000.0,15490000.0,2.5 tháng chính hãng,Intel Core i5 Coffee Lake - 8259U,16 GB,DDR4 (Onboard),512 GB SSD\nHỗ trợ khe cắm SSD M.2 SATA3 (nâng...,14 inch,2.2K (2240x1400),Card tích hợp - Intel Iris Plus Graphics 655,Dài 310 mm - Rộng 229.5 mm - Dày 20.6 mm - Nặn...,Vỏ kim loại,Windows 10 Home SL,2021,46.2 Wh
7,Laptop CHUWI CoreBook X i5 8259U/16GB/512GB/Win10,5710000.0,15490000.0,3 tháng chính hãng,Intel Core i5 Coffee Lake - 8259U,16 GB,DDR4 (Onboard),512 GB SSD\nHỗ trợ khe cắm SSD M.2 SATA3 (nâng...,14 inch,2.2K (2240x1400),Card tích hợp - Intel Iris Plus Graphics 655,Dài 310 mm - Rộng 229.5 mm - Dày 20.6 mm - Nặn...,Vỏ kim loại,Windows 10 Home SL,2021,46.2 Wh
8,Laptop CHUWI CoreBook X i5 8259U/16GB/512GB/Win10,5740000.0,15490000.0,3.5 tháng chính hãng,Intel Core i5 Coffee Lake - 8259U,16 GB,DDR4 (Onboard),512 GB SSD\nHỗ trợ khe cắm SSD M.2 SATA3 (nâng...,14 inch,2.2K (2240x1400),Card tích hợp - Intel Iris Plus Graphics 655,Dài 310 mm - Rộng 229.5 mm - Dày 20.6 mm - Nặn...,Vỏ kim loại,Windows 10 Home SL,2021,46.2 Wh
9,Laptop CHUWI CoreBook X i5 8259U/16GB/512GB/Win10,5800000.0,15490000.0,4.5 tháng chính hãng,Intel Core i5 Coffee Lake - 8259U,16 GB,DDR4 (Onboard),512 GB SSD\nHỗ trợ khe cắm SSD M.2 SATA3 (nâng...,14 inch,2.2K (2240x1400),Card tích hợp - Intel Iris Plus Graphics 655,Dài 310 mm - Rộng 229.5 mm - Dày 20.6 mm - Nặn...,Vỏ kim loại,Windows 10 Home SL,2021,46.2 Wh


In [15]:
def chuyen_gia_tri_so(chuoi):
    # Xóa các ký tự không mong muốn khỏi chuỗi
    chuoi = chuoi.replace('.', '').replace('₫', '').strip()

    # Chuyển chuỗi thành giá trị số
    gia_tri_so = float(chuoi)

    return gia_tri_so

df['price_old'] = df['price_old'].apply(chuyen_gia_tri_so)
df['price_new'] = df['price_new'].apply(chuyen_gia_tri_so)

AttributeError: 'float' object has no attribute 'replace'