# Web Scraping: Thailand Yellow Pages

## Objective
ดึงข้อมูลรายชื่อธุรกิจประเภท **คลินิก** จากเว็บไซต์ Thailand Yellow Pages (https://www.yellowpages.co.th/) และจัดเก็บใน Pandas DataFrame

## Data to Extract
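- ชื่อธุรกิจ (Business Name)
- หมวดหมู่ (Category)
- ที่อยู่/จังหวัด (Location/Province)
- เบอร์โทรศัพท์ (Phone Number) — ยังไม่ได้ดึงในโค้ดเวอร์ชันนี้ (not extracted by the current code)
- รายละเอียดบริการ (Description)
- เว็บไซต์ (Website - if available)
- ลิงก์หน้าโปรไฟล์ธุรกิจ (Profile URL)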

## Approach
1. ใช้ `requests` เพื่อดึง HTML จากเว็บไซต์
2. ใช้ `BeautifulSoup` เพื่อ parse HTML และดึงข้อมูล
3. จัดเก็บข้อมูลใน Pandas DataFrame
4. Export เป็น CSV file

## 1. Import Libraries

In [None]:
# Install required packages (uncomment if needed)
# !pip install requests beautifulsoup4 pandas lxml

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from typing import List, Dict, Optional
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 2. Configuration

กำหนดค่า URL และ Headers สำหรับการ request

In [None]:
# Site root, and the category page that lists clinics ("คลินิก")
BASE_URL = "https://www.yellowpages.co.th"
CATEGORY_URL = BASE_URL + "/heading/คลินิก"

# Request headers sent with every fetch so the request resembles a browser's
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
    'Accept': ','.join((
        'text/html',
        'application/xhtml+xml',
        'application/xml;q=0.9',
        'image/webp',
        '*/*;q=0.8',
    )),
    'Accept-Language': 'th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
}

# Upper bound on the number of listing pages to scrape (kept small for the demo)
MAX_PAGES = 5

print(f"Target URL: {CATEGORY_URL}")
print(f"Pages to scrape: {MAX_PAGES}")

## 3. Helper Functions

สร้างฟังก์ชันสำหรับการดึงและ parse ข้อมูล

In [None]:
def fetch_page(url: str, retries: int = 3) -> Optional[BeautifulSoup]:
    """
    Fetch HTML content from URL and return BeautifulSoup object.

    The request is sent with the module-level HEADERS and a 10 second
    timeout. Failures are printed, never raised. Transient failures
    (connection errors, timeouts, 5xx, 408, 429) are retried after a
    2 second pause; other 4xx responses abort immediately because
    repeating the same request cannot succeed.

    Args:
        url: URL to fetch
        retries: Maximum number of attempts (no request is made if <= 0)

    Returns:
        BeautifulSoup object or None if failed
    """
    # Client-error statuses that are still worth retrying.
    transient_client_errors = (408, 429)

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'lxml')
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")

            # Only HTTP errors carry a response. Read the status via getattr
            # with an explicit None check, because a Response object is
            # falsy whenever its status is an error.
            status = getattr(getattr(e, 'response', None), 'status_code', None)
            if (
                status is not None
                and 400 <= status < 500
                and status not in transient_client_errors
            ):
                # Permanent client error (e.g. 404 past the last page):
                # do not waste further requests or sleeps on it.
                break

            if attempt < retries - 1:
                time.sleep(2)
    return None


def extract_text(element, default: str = "") -> str:
    """
    Return the whitespace-stripped text of an element, or a fallback.

    Args:
        element: Object exposing ``get_text(strip=...)`` (e.g. a
            BeautifulSoup element), or a falsy value such as None
        default: Value returned when ``element`` is falsy

    Returns:
        ``element.get_text(strip=True)`` when ``element`` is truthy,
        otherwise ``default``
    """
    return element.get_text(strip=True) if element else default


# Progress marker: printed once the definitions above have been executed.
print("Helper functions defined!")

## 4. Analyze Page Structure

ดึงหน้าแรกมาวิเคราะห์โครงสร้าง HTML

In [None]:
# Download the first category page; `soup` is reused by the next cell
soup = fetch_page(CATEGORY_URL)

if not soup:
    print("Failed to fetch page")
else:
    print("Page fetched successfully!")
    # Fall back to 'N/A' when the document has no <title> tag
    print(f"Page title: {soup.title.string if soup.title else 'N/A'}")

In [None]:
# Inspect the fetched page: count the listing containers and preview the first
if not soup:
    print("Failed to fetch page - check connection")
else:
    # Each business listing is matched by this container selector
    listings = soup.select('div.yp-search-listing')
    print(f"Found {len(listings)} business listings on the page")

    if listings:
        first_listing = listings[0]
        print("\n--- First listing preview ---")

        # Look up the sample fields inside the first listing
        name = first_listing.select_one('.yp-listing-title h3 a')
        address = first_listing.select_one('p.yp-listing-address')
        desc = first_listing.select_one('p.yp-listing-desc')

        # Print only the fields that were actually found
        if name:
            print(f"Name: {name.get_text(strip=True)}")
        if address:
            print(f"Address: {address.get_text(strip=True)}")
        if desc:
            # Preview is capped at the first 100 characters
            print(f"Description: {desc.get_text(strip=True)[:100]}...")

## 5. Scraping Function

ฟังก์ชันหลักสำหรับดึงข้อมูลธุรกิจจากแต่ละหน้า

In [None]:
def parse_business_listing(card) -> Dict:
    """
    Build a business record from one listing card.

    Selectors queried on the card:
    - Name / profile link: .yp-listing-title h3 a
    - Address: p.yp-listing-address
    - Description: p.yp-listing-desc (truncated to 300 characters)
    - Website: a.yp-listing-website (its href)
    - Category: .listing-category-section a

    Fields whose element is not found stay as empty strings. Any
    exception raised while parsing is printed and the partially filled
    record is returned.

    Args:
        card: BeautifulSoup element containing business info

    Returns:
        Dictionary with keys name, address, description, website,
        category, profile_url
    """
    # Key order matters: it becomes the column order of the DataFrame.
    record = dict.fromkeys(
        ('name', 'address', 'description', 'website', 'category', 'profile_url'),
        '',
    )

    try:
        # The title link supplies both the display name and the profile URL
        title_link = card.select_one('.yp-listing-title h3 a')
        if title_link:
            record['name'] = extract_text(title_link)
            record['profile_url'] = title_link.get('href', '')

        address_node = card.select_one('p.yp-listing-address')
        if address_node:
            record['address'] = extract_text(address_node)

        desc_node = card.select_one('p.yp-listing-desc')
        if desc_node:
            # Slicing leaves strings shorter than 300 characters untouched
            record['description'] = extract_text(desc_node)[:300]

        site_link = card.select_one('a.yp-listing-website')
        if site_link:
            record['website'] = site_link.get('href', '')

        category_link = card.select_one('.listing-category-section a')
        if category_link:
            record['category'] = extract_text(category_link)

    except Exception as e:
        print(f"Error parsing listing: {e}")

    return record


def scrape_page(url: str) -> List[Dict]:
    """
    Collect the business records found on one listing page.

    Args:
        url: Page URL to scrape

    Returns:
        List of business dictionaries that have a non-empty name;
        an empty list when the page could not be fetched
    """
    page = fetch_page(url)
    if not page:
        return []

    # Every listing lives in its own div.yp-search-listing container
    cards = page.select('div.yp-search-listing')
    print(f"  Found {len(cards)} listing cards on page")

    parsed = [parse_business_listing(card) for card in cards]

    # Cards for which no name could be extracted are discarded
    return [entry for entry in parsed if entry['name']]


# Progress marker: printed once the definitions above have been executed.
print("Scraping functions defined!")

## 6. Execute Scraping

ดึงข้อมูลจากหลายหน้า

In [None]:
def scrape_all_pages(base_url: str, max_pages: int) -> List[Dict]:
    """
    Walk the category pages in order and gather every business found.

    Page 1 is ``base_url`` itself; page N (N > 1) is
    ``{base_url}/page/{N}``. A random 1-3 second pause is taken between
    consecutive pages, but not after the last one.

    Args:
        base_url: Base category URL
        max_pages: Maximum number of pages to scrape

    Returns:
        List of all business dictionaries
    """
    collected = []

    for page_number in range(1, max_pages + 1):
        page_url = base_url if page_number == 1 else f"{base_url}/page/{page_number}"
        print(f"Scraping page {page_number}/{max_pages}: {page_url}")

        found = scrape_page(page_url)
        collected.extend(found)
        print(f"  Found {len(found)} businesses (Total: {len(collected)})")

        if page_number >= max_pages:
            # Last page: nothing left to wait for
            continue

        # Polite, randomized delay before requesting the next page
        pause = random.uniform(1, 3)
        print(f"  Waiting {pause:.1f}s...")
        time.sleep(pause)

    return collected


# Execute scraping: crawl up to MAX_PAGES pages starting at CATEGORY_URL.
# This runs when the cell is executed and calls scrape_all_pages, which
# fetches pages over HTTP and sleeps between pages.
# The result is kept in `all_businesses` for the DataFrame cell below.
print("Starting scraping...\n")
all_businesses = scrape_all_pages(CATEGORY_URL, MAX_PAGES)
print(f"\nScraping complete! Total businesses collected: {len(all_businesses)}")

## 7. Create DataFrame

แปลงข้อมูลที่ดึงมาเป็น Pandas DataFrame

In [None]:
# Create DataFrame from scraped data.
# The columns are passed explicitly so the frame keeps its schema even when
# nothing was scraped (e.g. the site was unreachable). Without this, an empty
# list yields a DataFrame with no columns and later cells that access
# df['name'] raise KeyError. The order matches the record dictionaries.
df = pd.DataFrame(
    all_businesses,
    columns=['name', 'address', 'description', 'website', 'category', 'profile_url'],
)

# Display basic info
print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nData types:\n{df.dtypes}")

In [None]:
# Display the first 10 rows of the raw (uncleaned) DataFrame.
# As the last expression of the cell, the result is presumably rendered
# as the cell output by the notebook.
df.head(10)

In [None]:
# Report, per column, how many values are something other than ''
total_rows = len(df)
rule = "=" * 50

print(rule)
print("DATA SUMMARY")
print(rule)
print(f"\nTotal records: {total_rows}")
print("\nNon-empty value counts:")
for col in df.columns:
    non_empty = (df[col] != '').sum()
    # Guard against division by zero when nothing was scraped
    if total_rows > 0:
        pct = non_empty / total_rows * 100
    else:
        pct = 0
    print(f"  {col}: {non_empty} ({pct:.1f}%)")

## 8. Data Cleaning

ทำความสะอาดข้อมูลที่ดึงมา

In [None]:
# Clean the data
column_order = ['name', 'category', 'address', 'description', 'website', 'profile_url']

# Select and reorder the output columns on a fresh frame (df is untouched).
# reindex creates any missing column, so the steps below also work when the
# scrape returned nothing and df has no columns at all.
df_cleaned = df.reindex(columns=column_order)

# Remove rows with empty names first, so that repeated empty names are not
# reported as duplicates in the count below
df_cleaned = df_cleaned[df_cleaned['name'] != '']

# Remove duplicates based on name (the first occurrence is kept)
# NOTE(review): distinct businesses sharing a name (e.g. branches at different
# addresses) collapse into one row - confirm that this is intended.
initial_count = len(df_cleaned)
df_cleaned = df_cleaned.drop_duplicates(subset=['name'], keep='first')
print(f"Removed {initial_count - len(df_cleaned)} duplicate entries")
print(f"Records after cleaning: {len(df_cleaned)}")

# Reset index so rows are numbered 0..n-1 after the removals
df_cleaned = df_cleaned.reset_index(drop=True)

df_cleaned.head(10)

## 9. Export to CSV

บันทึกข้อมูลเป็นไฟล์ CSV

In [None]:
# Export the cleaned DataFrame to a CSV file in the current working directory.
output_file = 'clinic_listings_yellowpages.csv'
# index=False leaves the DataFrame index out of the file.
# 'utf-8-sig' writes UTF-8 with a leading byte-order mark, presumably so that
# spreadsheet tools detect the encoding of the Thai text - confirm.
df_cleaned.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Data exported to: {output_file}")
print(f"Total records: {len(df_cleaned)}")

## 10. Summary

สรุปผลการดึงข้อมูล

In [None]:
# Assemble the whole report first, then emit it in one write
banner = "=" * 60
summary_lines = [
    banner,
    "WEB SCRAPING SUMMARY",
    banner,
    "\nSource: Thailand Yellow Pages (yellowpages.co.th)",
    "Category: คลินิก (Clinic)",
    f"Pages scraped: {MAX_PAGES}",
    f"Total records collected: {len(df_cleaned)}",
    f"Output file: {output_file}",
    "\nColumns in dataset:",
]
# One bullet line per column of the cleaned dataset
summary_lines.extend(f"  - {col}" for col in df_cleaned.columns)
summary_lines.append("\n" + banner)

print("\n".join(summary_lines))

In [None]:
# Final display of the cleaned data.
# As the last expression of the cell, the full DataFrame is presumably
# rendered as the cell output by the notebook.
df_cleaned