In [1]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore")

In [2]:
link_set = set()
products = []

In [4]:
def getlink(soup):
    items = soup.find_all('div', class_='list-products__item')
    links = []
    for item in items:
        a_tag = item.find('a')
        if a_tag and a_tag.get('href'):
            link = 'https://clickbuy.com.vn' + a_tag['href']
            links.append(link)
    return links

In [5]:
semaphore = asyncio.Semaphore(10)
link_set = set()

def getlink(soup):
    items = soup.find_all('div', class_='list-products__item')
    links = []
    for item in items:
        a_tag = item.find('a')
        if a_tag and a_tag.get('href'):
            link = 'https://clickbuy.com.vn' + a_tag['href']
            links.append(link)
    return links

async def get_prd(session, i):
    url = f"https://clickbuy.com.vn/dien-thoai?page={i}"
    async with semaphore:
        try:
            await asyncio.sleep(0.5)
            async with session.get(url, timeout=15) as resp:
                if resp.status != 200:
                    print(f"❌ HTTP {resp.status} at page {i}")
                    return

                html = await resp.text()
                soup = BeautifulSoup(html, "html.parser")
                link_phones = getlink(soup)

                if not link_phones:
                    print(f"⚠️ No products found on page {i}")
                    return

                for link in link_phones:
                    if link not in link_set:
                        link_set.add(link)

                print(f"✅ Page {i} done. Added {len(link_phones)} products.")

        except Exception as e:
            print(f"⚠️ Error at page {i}: {e}")

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [get_prd(session, i) for i in range(1, 13)]
        await asyncio.gather(*tasks)

await main()
print(f"\n📦 Total product links collected: {len(link_set)}")

✅ Page 1 done. Added 20 products.
✅ Page 6 done. Added 20 products.
✅ Page 9 done. Added 20 products.
✅ Page 5 done. Added 20 products.
✅ Page 10 done. Added 20 products.
✅ Page 7 done. Added 20 products.
✅ Page 4 done. Added 20 products.
✅ Page 3 done. Added 20 products.
✅ Page 8 done. Added 20 products.
✅ Page 2 done. Added 20 products.
✅ Page 11 done. Added 20 products.
✅ Page 12 done. Added 20 products.

📦 Total product links collected: 239


In [7]:
def info(soup):
    product = {}

    name_tag = soup.find('h1', class_="product-name")
    product_name = name_tag.get_text(strip=True) if name_tag else None

    price_new_tag = soup.find("p", class_="price")
    price_new_text = price_new_tag.get_text(strip=True) if price_new_tag else None

    price_old_tag = soup.find("p", class_="price-old")
    price_old_text = price_old_tag.get_text(strip=True) if price_old_tag else None

    product = {
        'Product_Name': product_name,
        'New_Price': price_new_text,
        'Old_Price': price_old_text
    }

    spec_section = soup.find("div", class_="product-specification__content")
    if spec_section:
        rows = spec_section.find_all("tr")
        for row in rows:
            th = row.find("th")
            td = row.find("td")
            if th and td:
                key = th.get_text(strip=True)
                value = td.get_text(strip=True)
                product[key] = value

    return product

In [8]:
semaphore = asyncio.Semaphore(10)

async def getinfo(session, i):
    async with semaphore:
        try:
            await asyncio.sleep(0.5)
            async with session.get(i, timeout=15) as resp:
                if resp.status != 200:
                    print(f"❌ HTTP {resp.status} at page {i}")
                    return

                html = await resp.text()
                soup = BeautifulSoup(html, "html.parser")
                phone = info(soup)
                products.append(phone)
                print(f"✅ Done products: {i}")

        except Exception as e:
            print(f"⚠️ Error at page {i}: {e}")

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [getinfo(session, i) for i in link_set]
        await asyncio.gather(*tasks)

await main()
print(f"\n📦 Total product links collected: {len(products)}/{len(link_set)}")

✅ Done products: https://clickbuy.com.vn/mobell-m331.html
✅ Done products: https://clickbuy.com.vn/samsung-galaxy-a05.html
✅ Done products: https://clickbuy.com.vn/inoi-240-4g.html
✅ Done products: https://clickbuy.com.vn/vivo-iqoo-z10x-5g-box.html
✅ Done products: https://clickbuy.com.vn/iphone-16-pro-max-1tb.html
✅ Done products: https://clickbuy.com.vn/xiaomi-redmi-note-14-5g-chinh-hang.html
✅ Done products: https://clickbuy.com.vn/samsung-galaxy-z-flip-7.html
✅ Done products: https://clickbuy.com.vn/xiaomi-redmi-note-14-pro-4g-chinh-hang.html
✅ Done products: https://clickbuy.com.vn/oppo-reno-14f-5g-chinh-hang.html
✅ Done products: https://clickbuy.com.vn/xiaomi-redmi-note13-chinh-hang.html
✅ Done products: https://clickbuy.com.vn/iphone-14-256gb-chinh-hang-vna.html
✅ Done products: https://clickbuy.com.vn/apple-iphone-13-128gb-chinh-hang-vn-a.html
✅ Done products: https://clickbuy.com.vn/galaxy-s24-fe.html
✅ Done products: https://clickbuy.com.vn/zte-nubia-red-magic-8s-pro-plus-16

In [9]:
import pandas as pd
df = pd.DataFrame(products)

df.to_csv('../dataset/06.phone-new.csv', index=False, encoding='utf-8-sig')