In [1]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re

In [2]:
class HotelCrawler:
    def __init__(self):
        self.url = "https://www.booking.com/hotel/kr/hotel-prince-seoul.ko.html?aid=304142&label=gen173nr-1FCAsofUIUc2t5cGFyay1teWVvbmdkb25nLTNIF1gEaH2IAQGYARe4AQfIAQzYAQHoAQH4AQOIAgGoAgO4Arazg7IGwAIB0gIkMzNkM2MyZGQtYzZkMy00MmMyLWJkMmYtNDZlMmY3YzY5OGJh2AIF4AIB&sid=f43cc81a1aa33436cef9262f4ee7b75a&all_sr_blocks=28644504_0_2_0_0;checkin=2024-05-14;checkout=2024-05-15;dist=0;group_adults=2;group_children=0;hapos=1;highlighted_blocks=28644504_0_2_0_0;hpos=1;matching_block_id=28644504_0_2_0_0;no_rooms=1;req_adults=2;req_children=0;room1=A%2CA;sb_price_type=total;sr_order=distance_from_search;sr_pri_blocks=28644504_0_2_0_0__13283871;srepoch=1715556186;srpvid=59cba46a22ac00e1;type=total;ucfs=1&#hotelTmpl"
        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36')

    def fetch_html(self):
        with webdriver.Chrome(options=self.options) as driver:
            driver.get(self.url)
            time.sleep(3)  # Adjust sleep time as necessary
            return driver.page_source

    def parse_html(self, html):
        return BeautifulSoup(html, "html.parser")

    def extract_hotel_name(self, soup):
        raise NotImplementedError("This method should be overridden by subclasses")

    def extract_hotel_data(self, soup):
        raise NotImplementedError("This method should be overridden by subclasses")

    def extract_one_night_rate(self, price):
        price = re.sub(r'[^\d.]', '', price)
        return int(price)

    def crawl(self):
        html = self.fetch_html()
        soup = self.parse_html(html)
        hotel_name = self.extract_hotel_name(soup)
        hotel_data = self.extract_hotel_data(soup)
        return hotel_name, hotel_data

In [1]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from io import StringIO
import re

# Define your hotel data DataFrame here
hotels_df = pd.read_csv("hotels_data.csv")

class HotelCrawler:
    def __init__(self, url):
        self.url = url
        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36')

    def fetch_html(self):
        with webdriver.Chrome(options=self.options) as driver:
            driver.get(self.url)
            time.sleep(3)  # Adjust sleep time as necessary
            return driver.page_source

    def parse_html(self, html):
        return BeautifulSoup(html, "html.parser")

    def extract_hotel_name(self, soup):
        raise NotImplementedError("This method should be overridden by subclasses")

    def extract_hotel_data(self, soup):
        raise NotImplementedError("This method should be overridden by subclasses")

    def extract_one_night_rate(self, price):
        price = re.sub(r'[^\d.]', '', price)
        return int(price)

    def crawl(self):
        html = self.fetch_html()
        soup = self.parse_html(html)
        hotel_name = self.extract_hotel_name(soup)
        hotel_data = self.extract_hotel_data(soup)
        return hotel_name, hotel_data

# Subclasses for different hotel sites
class AgodaHotelCrawler(HotelCrawler):
    def extract_hotel_name(self, soup):
        return soup.find("p", {"class": "HeaderCerebrum__Name"}).text

    def extract_one_night_rate(self, text):
        matches = re.findall(r'₩\s*(\d+(?:,\d+)*)\s*1박당\s*요금', text)
        return str(matches[0].replace(',', '')) if matches else None

    def extract_hotel_data(self, soup):
        print(self.url)
        room_table = soup.find_all("div", {"class": "MasterRoom"})
        df = pd.DataFrame(columns=["room_price", "room_name"])
        for room in room_table:
            room_name = room.find(class_="MasterRoom__HotelName").text
            room_price = room.find(class_="ChildRoom__PriceContainer").text
            room_price = self.extract_one_night_rate(room_price)
            df.loc[len(df)] = [room_price, room_name]
        return df

class BookingHotelCrawler(HotelCrawler):
    def fetch_html(self):
        with webdriver.Chrome(options=self.options) as driver:
            driver.get(self.url)
            time.sleep(3)  # Adjust sleep time as necessary
            return driver.page_source

    def extract_hotel_name(self, soup):
        return soup.find("h2", {"class": "pp-header__title"}).text
    
    def extract_one_night_rate(self, price_str):
        try:
            price = price_str.replace('₩', '').replace(',', '')
            return int(price)
        except Exception as e:
            print(f"Error extracting price from '{price_str}': {e}")
            return None

    def extract_price(self, price_str):
        try:
            if "현재 요금" in price_str:
                price_part = price_str.split("현재 요금 ")[1]
            else:
                price_part = price_str.split()[0]
            return self.extract_one_night_rate(price_part.split()[0])
        except IndexError as e:
            print(f"Error extracting room prices: {e}")
            return None

    def extract_hotel_data(self, soup):
        df_rooms_table = pd.read_html(str(soup))[-2]
        self.df_rooms_table = df_rooms_table
        df = pd.DataFrame(columns=["room_price", "room_name"])
        df["room_name"] = [desc[:desc.find("룸") + 1] for desc in df_rooms_table["객실 유형"]]
        df["room_price"] = [self.extract_price(price) for price in df_rooms_table["오늘 판매가"]]
        return df

class ExpediaHotelCrawler(HotelCrawler):
    def extract_hotel_name(self, soup):
        return soup.find("h1").text

    def extract_hotel_data(self, soup):
        time.sleep(2)
        room_grid = soup.find("div", {"id": "Offers"})
        room_table = room_grid.find_all("div", {"data-stid": re.compile("property-offer-.*")})
        df = pd.DataFrame(columns=["room_price", "room_name"])
        for room in room_table:
            room_name = room.find_all('h3')[-1].text
            price_summary = room.find("div", {"data-test-id": "price-summary"})
            if price_summary:
                room_price = self.extract_one_night_rate(price_summary.find("span").text)
            else:
                continue
            df.loc[len(df)] = [room_price, room_name]
        return df

class TripHotelCrawler(HotelCrawler):
    def extract_hotel_name(self, soup):
        return soup.find("h1").text

    def extract_hotel_data(self, soup):
        time.sleep(2)
        room_table = soup.find_all("div", {"class": "commonRoomCard_commonRoomCard___qMtD"})
        df = pd.DataFrame(columns=["room_price", "room_name"])
        for room in room_table:
            room_name = room.find("span", {"class": "commonRoomCard_commonRoomCard-title__YgDYt"}).text
            room_options = room.find_all("div", {"class": "saleRoom_saleRoomItemBox__IpWj4"})
            for room_option in room_options:
                room_price = self.extract_one_night_rate(room_option.find("div", {"class": "priceInfo_saleRoomItemBox-priceBox-displayPrice__niIBD"}).text)
                df.loc[len(df)] = [room_price, room_name]
        return df

class YanoljaHotelCrawler(HotelCrawler):
    def extract_hotel_name(self, soup):
        return soup.find("h1").text
    
    def pre_extract_one_night_rate(self, text):
        match = re.search(r'(\d{1,3}(?:,\d{3})+원)', text)
        price = match.group(1)
        return price

    def extract_hotel_data(self, soup):
        room_grids = soup.find("div", {"class":"css-1z06rwl"})
        df = pd.DataFrame(columns=["room_price", "room_name"])
        for grid in room_grids.find_all("div", {"class":"css-1nnj57j"}):
            room_name = grid.find('h3').text
            pre_room_price = self.pre_extract_one_night_rate(grid.find_all("div", {"class":"rate-plan-container"})[0].text)
            if grid.find("div", {"class":"soldOut"}):
                continue
            room_price = self.extract_one_night_rate(pre_room_price)
            df.loc[len(df)] = [room_price, room_name]
        return df

class YeogiHotelCrawler(HotelCrawler):
    def extract_hotel_name(self, soup):
        return soup.find("h1").text

    def extract_hotel_data(self, soup):
        room_grids = soup.find("div", {"id": "room"})
        df = pd.DataFrame(columns=["room_price", "room_name"])
        for room in room_grids.find_all("div", {"class": "css-gp2jfw"}):
            room_name = room.find("div", {"class": "css-rs79op"}).text
            room_price = self.extract_one_night_rate(room.find("div", {"class": "css-149gbl6"}).text)
            df.loc[len(df)] = [room_price, room_name]
        return df

# Unified manager class
class HotelCrawlerManager:
    def __init__(self, hotel_name, check_in, check_out, adults=2, children=0, rooms=1):
        self.hotel_name = hotel_name
        self.check_in = check_in
        self.check_out = check_out
        self.adults = adults
        self.children = children
        self.rooms = rooms
        self.hotel_id = hotels_df[hotels_df["Hotel Name"] == hotel_name]

    def get_crawler(self, site):
        url = self.generate_url(site)
        if site == "agoda":
            return AgodaHotelCrawler(url)
        elif site == "booking":
            return BookingHotelCrawler(url)
        elif site == "expedia":
            return ExpediaHotelCrawler(url)
        elif site == "trip":
            return TripHotelCrawler(url)
        elif site == "yanolja":
            return YanoljaHotelCrawler(url)
        elif site == "yeogi":
            return YeogiHotelCrawler(url)
        else:
            raise ValueError("Unsupported site")

    def generate_url(self, site):
        if site == "agoda":
            agoda_id = self.hotel_id["Agoda ID"].values[0]
            return f"https://www.agoda.com/ko-kr/{agoda_id}/hotel/seoul-kr.html?finalPriceView=1&checkIn={self.check_in}&los=1"
        elif site == "booking":
            booking_id = self.hotel_id["Booking.com ID"].values[0]
            return f"https://www.booking.com/hotel/kr/{booking_id}.ko.html?checkin={self.check_in}&checkout={self.check_out}&group_adults={self.adults}&group_children={self.children}&no_rooms={self.rooms}"
        elif site == "expedia":
            expedia_id = self.hotel_id["Expedia ID"].values[0]
            return f"https://www.expedia.co.kr/{expedia_id}.Hotel-Information?chkin={self.check_in}&chkout={self.check_out}"
        elif site == "trip":
            trip_id = self.hotel_id["Trip.com ID"].values[0]
            return f"https://kr.trip.com/hotels/detail/?hotelId={trip_id}&checkIn={self.check_in}&checkOut={self.check_out}&adult={self.adults}&children={self.children}"
        elif site == "yanolja":
            yanolja_id = self.hotel_id["Yanolja ID"].values[0]
            return f"https://place-site.yanolja.com/places/{yanolja_id}"
        elif site == "yeogi":
            yeogi_id = self.hotel_id["Yeogi ID"].values[0]
            return f"https://www.yeogi.com/domestic-accommodations/{yeogi_id}?checkIn={self.check_in}&checkOut={self.check_out}&personal={self.adults}"
        else:
            raise ValueError("Unsupported site")

    def crawl(self, site):
        crawler = self.get_crawler(site)
        hotel_name, hotel_data = crawler.crawl()
        return hotel_name, hotel_data

In [3]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from io import StringIO
import re

# Define your hotel data DataFrame here
hotels_df = pd.read_csv("hotels_data.csv")


In [4]:
hotels_df

Unnamed: 0,Hotel Name,Agoda ID,Booking.com ID,Expedia ID,Trip.com ID,Yanolja ID,Yeogi ID
0,호텔 스카이파크 명동 3호점,hotel-skypark-myeongdong-iii,skypark-myeongdong-3,Seoul-Hotels-HOTEL-SKYPARK-Myeongdong-III.h446...,988482,3001826,6551
1,밀리오레 호텔 서울 명동,loisir-hotel-seoul-myeongdong,loisir-seoul-myeongdong,Seoul-Hotels-Migliore-Hotel-Seoul-Myeongdong.h...,1687209,1000114102,67993
2,소테츠 프레사 인 서울 명동,sotetsu-fresa-inn-seoul-myeong-dong,sotetsu-fresa-inn-seoul-myeong-dong,Seoul-Hotels-Sotetsu-Fresa-Inn-Seoul-Myeong-Do...,41496077,1000108504,64580
3,나인트리 호텔 명동,nine-tree-hotel-myeong-dong,nine-tree,Seoul-Hotels-Nine-Tree-Hotel-Myeongdong.h6084370,988649,3000612,6492
4,프린스 호텔 명동,prince-hotel-myeongdong,hotel-prince-seoul,Seoul-Hotels-Hotel-Prince-Seoul.h2321943,988630,3000619,6283
5,스탠포드 호텔 명동,stanford-hotel-myeongdong,seutaenpodeuhotel-myeongdong-stanford-myeongdong,Seoul-Hotels-Stanford-Hotel-Myeongdong.h74101909,83924501,10042533,69970
6,호텔 토마스 명동,hotel-thomas-myeongdong_2,hotel-thomas-myeongdong,Seoul-Hotels-Hotel-Thomas-Myeongdong.h37207489,41496077,1000108504,64580


In [2]:
agoda = AgodaHotelCrawler()

TypeError: HotelCrawler.__init__() missing 1 required positional argument: 'url'

In [11]:
class BookingHotelCrawler(HotelCrawler):
    def fetch_html(self):
        with webdriver.Chrome(options=self.options) as driver:
            driver.get(self.url)
            time.sleep(3)  # Adjust sleep time as necessary
            return driver.page_source

    def extract_hotel_name(self, soup):
        return soup.find("h2", {"class": "pp-header__title"}).text
    
    def extract_one_night_rate(self, price_str):
        try:
            price = price_str.replace('₩', '').replace(',', '')
            return int(price)
        except Exception as e:
            print(f"Error extracting price from '{price_str}': {e}")
            return None
    
    def extract_price(self, price_str):
        try:
            if "현재 요금" in price_str:
                price_part = price_str.split("현재 요금 ")[1]
            else:
                price_part = price_str.split()[0]
            return self.extract_one_night_rate(price_part.split()[0])
        except IndexError as e:
            print(f"Error extracting room prices: {e}")
            return None

    def extract_hotel_data(self, soup):
        df_rooms_table = pd.read_html(str(soup))
        self.df_rooms_table = df_rooms_table
        print(df_rooms_table)
        df = pd.DataFrame(columns=["room_price", "room_name"])
        df["room_name"] = [desc[:desc.find("룸") + 1] for desc in df_rooms_table["객실 유형"]]
        df["room_price"] = [self.extract_price(price) for price in df_rooms_table["오늘 판매가"]]
        return df

In [12]:
booking = BookingHotelCrawler() 

In [13]:
booking.crawl()

[                                                   0  \
0  Genius 할인 혜택을 이용해보세요 로그인하시면 이 숙박옵션의 요금(세금 및 기타...   

                                                   1  \
0  Genius 할인 혜택을 이용해보세요 로그인하시면 이 숙박옵션의 요금(세금 및 기타...   

                                                   2  \
0  Genius 할인 혜택을 이용해보세요 로그인하시면 이 숙박옵션의 요금(세금 및 기타...   

                                                   3  
0  Genius 할인 혜택을 이용해보세요 로그인하시면 이 숙박옵션의 요금(세금 및 기타...  ,                                                 객실 유형              투숙객 수  \
0   더블룸 A 대형 더블침대 1개 15 제곱미터 에어컨 평면 TV 방음 시설 무료 Wi...        최대 투숙 인원: 2   
1   더블룸 A 우리 사이트에 남은 객실 단 2개 고층 객실 대형 더블침대 1개 15 제...        최대 투숙 인원: 2   
2   더블룸 A 우리 사이트에 남은 객실 단 2개 고층 객실 대형 더블침대 1개 15 제...        최대 투숙 인원: 2   
3   비즈니스 더블룸 우리 사이트에 남은 객실 단 2개 고층 객실 대형 더블침대 1개 객...        최대 투숙 인원: 2   
4   비즈니스 더블룸 우리 사이트에 남은 객실 단 2개 고층 객실 대형 더블침대 1개 객...        최대 투숙 인원: 2   
5   트윈룸 A 우리 사이트에 남은 객실 단 3개 고층 객실 싱글침대 2개 18 제곱미터...        최대 투숙 인원: 2   
6   트윈룸 A 우리 사이트에

  df_rooms_table = pd.read_html(str(soup))


TypeError: list indices must be integers or slices, not str

In [16]:
booking.df_rooms_table[-2]

Unnamed: 0,(월),(화),(수),(목),(금),(토),(일)
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,


In [58]:
pd.set_option('display.max_seq_items', None)

In [59]:
#reset index
df_rooms_table = booking.df_rooms_table
df_rooms_table.reset_index(drop=True, inplace=True)

In [60]:
df_rooms_table["오늘 판매가"]

0                    ₩181,588 요금 ₩181,588 세금 및 기타 요금 포함
1     ₩252,720 ₩206,978 기존 요금 ₩252,720 현재 요금 ₩206,97...
2     ₩252,720 ₩206,978 기존 요금 ₩252,720 현재 요금 ₩206,97...
3     ₩256,230 ₩209,852 기존 요금 ₩256,230 현재 요금 ₩209,85...
4     ₩282,438 ₩231,317 기존 요금 ₩282,438 현재 요금 ₩231,31...
5     ₩256,230 ₩209,852 기존 요금 ₩256,230 현재 요금 ₩209,85...
6     ₩278,460 ₩228,059 기존 요금 ₩278,460 현재 요금 ₩228,05...
7     ₩261,652 ₩214,293 기존 요금 ₩261,652 현재 요금 ₩214,29...
8     ₩261,652 ₩214,293 기존 요금 ₩261,652 현재 요금 ₩214,29...
9     ₩307,125 ₩251,535 기존 요금 ₩307,125 현재 요금 ₩251,53...
10    ₩376,447 ₩308,310 기존 요금 ₩376,447 현재 요금 ₩308,31...
11    ₩437,170 ₩358,043 기존 요금 ₩437,170 현재 요금 ₩358,04...
Name: 오늘 판매가, dtype: object

In [62]:
df = pd.DataFrame(columns=["room_price", "room_name"])

In [63]:
def extract_one_night_rate(price_str):
    try:
        price = price_str.replace('₩', '').replace(',', '')
        return int(price)
    except Exception as e:
        print(f"Error extracting price from '{price_str}': {e}")
        return None

def extract_price(price_str):
    try:
        if "현재 요금" in price_str:
            price_part = price_str.split("현재 요금 ")[1]
        else:
            price_part = price_str.split()[0]
        return extract_one_night_rate(price_part.split()[0])
    except IndexError as e:
        print(f"Error extracting room prices: {e}")
        return None

df["room_price"] = [extract_price(price) for price in df_rooms_table["오늘 판매가"]]


In [64]:
df

Unnamed: 0,room_price,room_name
0,181588,
1,206978,
2,206978,
3,209852,
4,231317,
5,209852,
6,228059,
7,214293,
8,214293,
9,251535,


In [57]:
from price import get_hotel_price
selected_hotels = ["호텔 스카이파크 명동 3호점"]
selected_start_date = "2024-05-14"
selected_end_date = "2024-05-15"
selected_sites = ["booking"]
price_data = get_hotel_price(selected_hotels, 
                                str(selected_start_date), 
                                str(selected_end_date), 
                                selected_sites)

                                                객실 유형        투숙객 수  \
0   더블룸 더블침대 1개 20 제곱미터 에어컨 전용 욕실 평면 TV 방음 시설 무료 W...  최대 투숙 인원: 2   
1                       스탠다드 더블룸 - 조식 더블침대 1개 무료 WiFi  최대 투숙 인원: 2   
2                       스탠다드 트윈룸 - 조식 싱글침대 2개 무료 WiFi  최대 투숙 인원: 2   
3   트윈룸 우리 사이트에 남은 객실 단 3개 싱글침대 2개 객실21 제곱미터전망에어컨전...  최대 투숙 인원: 2   
4   트윈룸 우리 사이트에 남은 객실 단 3개 싱글침대 2개 객실21 제곱미터전망에어컨전...  최대 투숙 인원: 2   
5   더블룸 우리 사이트에 남은 객실 단 2개 더블침대 1개 객실20 제곱미터에어컨전용 ...  최대 투숙 인원: 2   
6   더블룸 우리 사이트에 남은 객실 단 2개 더블침대 1개 객실20 제곱미터에어컨전용 ...  최대 투숙 인원: 2   
7       [K-라면 패키지] 스탠다드 더블룸 - K-라면 세트 더블침대 1개 무료 WiFi  최대 투숙 인원: 2   
8       [K-라면 패키지] 스탠다드 트윈룸 - K-라면 세트 싱글침대 2개 무료 WiFi  최대 투숙 인원: 2   
9   스위트룸 고층 객실 싱글침대 1개 및 대형 더블침대 1개 단독으로 사용하는 스위트룸...  최대 투숙 인원: 3   
10  쿼드러플룸 우리 사이트에 남은 객실 단 1개 싱글침대 4개 객실33 제곱미터에어컨전...  최대 투숙 인원: 4   
11  쿼드러플룸 우리 사이트에 남은 객실 단 1개 싱글침대 4개 객실33 제곱미터에어컨전...  최대 투숙 인원: 4   

                                               오늘 판매가  \
0                  ₩181,588 요금 ₩

  df_rooms_table = pd.read_html(str(soup))[0]


In [28]:
price_data

[]