In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException

class AdCrawler:
    def __init__(self, base_url, max_pages=2, retry_limit=3):
        self.base_url = base_url
        self.driver = None
        self.all_ads = []
        self.max_pages = max_pages
        self.retry_limit = retry_limit  # Stale element 발생 시 최대 재시도 횟수

    def start_driver(self):
        """브라우저 드라이버를 시작"""
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    def stop_driver(self):
        """브라우저 드라이버를 종료"""
        if self.driver:
            self.driver.quit()

    def load_page(self, page_number):
        """페이지 번호에 맞는 URL을 로드"""
        url = f"{self.base_url}?page={page_number}"
        self.driver.get(url)
        try:
            # 페이지가 완전히 로드될 때까지 대기
            WebDriverWait(self.driver, 150).until(
                EC.presence_of_element_located((By.TAG_NAME, 'article'))  # article 태그 내의 요소가 나타날 때까지 대기
            )
            print(f"--- Page {page_number} loaded ---")
        except Exception as e:
            print(f"Error loading page {page_number}: {e}")
        # except Exception as e:
        #     print(f"Page {page_number} does not exist or failed to load: {e}")
        #     return False

    def extract_ad_data(self, ad_xpath, retry_count=0):
        """단일 광고의 데이터를 추출"""
        try:
            ad = self.driver.find_element(By.XPATH, ad_xpath)  # StaleElement 발생 시 요소를 다시 찾음
            # 재귀적으로 모든 자식 요소에서 텍스트를 추출하는 함수
            def extract_text_from_element(element):
                text = element.text.strip() if element.text.strip() else ""
                children_text = []
                # 자식 요소들에서 텍스트 추출
                for child in element.find_elements(By.XPATH, './*'):
                    children_text.extend(extract_text_from_element(child))
                return [text] + children_text if text else children_text

            # article 내 모든 텍스트를 리스트로 추출
            all_texts = extract_text_from_element(ad)[0].split('\n')

            if len(all_texts) >= 5:
                title = all_texts[0]  # 첫 번째 텍스트는 제목
                link = ad.find_element(By.TAG_NAME, 'a').get_attribute('href')  # 링크는 직접 추출
                cost = all_texts[3].replace('~','')  # 가격
                rating = all_texts[1]  # 평점
                reviews = all_texts[2].replace('(', '').replace(')', '')  # 리뷰 수
                company_name = all_texts[4]  # 기업명
            
            elif len(all_texts) >= 3:
                # ['고객에게 각인되는 워드프레스 반응형 홈페이지 제작', '290,000원', '리플래닝']
                title = all_texts[0]  # 첫 번째 텍스트는 제목
                link = ad.find_element(By.TAG_NAME, 'a').get_attribute('href')  # 링크는 직접 추출
                cost = all_texts[1].replace('~','')  # 가격
                rating = 0
                reviews = 0
                company_name = all_texts[2]  # 기업명
            

            # 추출된 데이터를 사전에 저장
            ad_data = {
                'title': title,
                'link': link,
                'cost': cost,
                'rating': rating,
                'reviews': reviews,
                'company_name': company_name,
            }
            # print(ad_data)
            return ad_data

        except StaleElementReferenceException:
            if retry_count < self.retry_limit:
                print(f"Stale element detected, retrying... attempt {retry_count + 1}")
                time.sleep(1)  # 짧은 대기 후 재시도
                return self.extract_ad_data(ad_xpath, retry_count + 1)  # 요소를 다시 찾고 재시도
            else:
                print("Stale element could not be recovered after multiple retries.")
                return None
        except Exception as e:
            print(f"Error processing ad: {e}")
            return None

    def scrape_ads(self):
        """1페이지부터 max_pages 페이지까지 순차적으로 크롤링"""
        self.start_driver()

        try:
            for page in range(1, self.max_pages + 1):
                print(f"--- Scraping page {page} ---")
                
                # 해당 페이지 로드
                self.load_page(page_number=page)
                
                try:
                    # 페이지가 완전히 로드될 때까지 대기
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located((By.TAG_NAME, 'article'))
                    )
                    time.sleep(2)
                    
                except TimeoutException:
                    print(f"No articles found on page {page}. Ending scraping.")
                    break  # 더 이상 페이지를 로드하지 않고 루프 종료

                

                # 모든 article 태그를 리스트로 가져옴
                ads = self.driver.find_elements(By.XPATH, '//article')

                if not ads:
                    print(f"No articles found on page {page}.")
                    # continue
                    break

                for i, ad in enumerate(ads):
                    ad_xpath = f'(//article)[{i+1}]'  # 각 광고의 XPATH를 추출
                    ad_data = self.extract_ad_data(ad_xpath)  # XPATH를 기반으로 데이터 추출
                    if ad_data:
                        self.all_ads.append(ad_data)

        finally:
            self.stop_driver()

    def save_json(self, file_name='ads_data.json'):
        """추출된 데이터를 JSON 파일로 저장"""
        import json
        with open(file_name, 'w', encoding='utf-8') as json_file:
            json.dump(self.all_ads, json_file, ensure_ascii=False, indent=4)



In [2]:
import time
import json

import pandas as pd
df = pd.read_excel('크몽_카테고리별_링크.xlsx')
dataset = []
for index, row in df.iterrows():
    dataset.append({
        'major' : row['대분류'],
        'medium' : row['중분류'],
        'sub' : row['소분류'],
        'link' : row['링크'],
    })

In [3]:
for i, contents in enumerate(dataset):
    file_name = str(i) + '_' + contents['major'] + '_' + contents['medium'] + '_' + contents['sub'] + '.json'
    print(file_name)
    crawler = AdCrawler(contents['link'], max_pages=500)
    crawler.scrape_ads()
    crawler.save_json(file_name=file_name)

0_웹빌더_워드프레스_신규제작.json
--- Scraping page 1 ---
--- Page 1 loaded ---
--- Scraping page 2 ---
--- Page 2 loaded ---
--- Scraping page 3 ---
--- Page 3 loaded ---
--- Scraping page 4 ---
--- Page 4 loaded ---
--- Scraping page 5 ---
--- Page 5 loaded ---
--- Scraping page 6 ---
--- Page 6 loaded ---
--- Scraping page 7 ---
--- Page 7 loaded ---
--- Scraping page 8 ---
--- Page 8 loaded ---
--- Scraping page 9 ---
--- Page 9 loaded ---
--- Scraping page 10 ---
--- Page 10 loaded ---
--- Scraping page 11 ---
--- Page 11 loaded ---
--- Scraping page 12 ---
--- Page 12 loaded ---
--- Scraping page 13 ---
--- Page 13 loaded ---
--- Scraping page 14 ---
--- Page 14 loaded ---
--- Scraping page 15 ---
Error loading page 15: Message: 
Stacktrace:
	GetHandleVerifier [0x00836AB3+25587]
	(No symbol) [0x007C9C54]
	(No symbol) [0x006C2113]
	(No symbol) [0x00706F62]
	(No symbol) [0x007071AB]
	(No symbol) [0x00747852]
	(No symbol) [0x0072ABE4]
	(No symbol) [0x00745370]
	(No symbol) [0x0072A936]
	(No sym

KeyboardInterrupt: 