In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib import font_manager, rc
%matplotlib inline


# 글꼴 경로 지정
font_path = "c:/Windows/Fonts/malgun.ttf"  # 윈도우에 설치된 맑은 고딕 폰트 경로

# 폰트 이름 얻어오기
font_name = font_manager.FontProperties(fname=font_path).get_name()

# matplotlib의 rc(run command) 기능을 이용하여 글꼴 설정
mpl.rc('font', family=font_name)

# 유니코드에서  음수 부호 설정
mpl.rc('axes', unicode_minus=False)

raw = pd.read_csv('../../../../datasets/paris_listings.csv')
df = raw.copy()

df = df[['id', 'host_id','host_is_superhost',
        'host_total_listings_count','neighbourhood_cleansed','room_type','accommodates','bathrooms','bedrooms',
        'beds','amenities','price','minimum_nights','maximum_nights','number_of_reviews', 'number_of_reviews_l30d', 'review_scores_rating', 'review_scores_accuracy', 
        'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 
        'review_scores_location', 'review_scores_value', 'reviews_per_month','listing_url','property_type','number_of_reviews_ltm','has_availability']]

df = df.rename(columns= {
    'id': '숙소_id',
    'host_id': '호스트_id',
    'host_is_superhost': '슈퍼호스트',
    'host_total_listings_count': '숙소_수',
    'neighbourhood_cleansed': '숙소_지역',
    'room_type': '숙소_유형',
    'accommodates': '수용_인원수',
    'bathrooms': '욕실수',
    'bedrooms': '침실수',
    'beds': '침대수',
    'amenities': '편의시설',
    'price': '숙소_가격',
    'minimum_nights': '최소_숙박일',
    'maximum_nights': '최대_숙박일',
    'number_of_reviews': '리뷰수',
    'number_of_reviews_l30d': '30일_리뷰수',
    'review_scores_rating': '리뷰점수',
    'review_scores_accuracy': '숙소_정확성_리뷰점수',
    'review_scores_cleanliness': '숙소_청결도_리뷰점수',
    'review_scores_checkin': '숙소_체크인_리뷰점수',
    'review_scores_communication': '숙소_소통_리뷰점수',
    'review_scores_location': '숙소_위치_리뷰점수',
    'review_scores_value': '숙소_가격_리뷰점수',
    'reviews_per_month': '평균_리뷰수',
    'listing_url':'url',
    'property_type':'숙소_특징',
    'number_of_reviews_ltm':'12개월_리뷰수',
    'has_availability':'예약가능여부'
})


# 슈퍼호스트, 리뷰수 결측치 제거
print(f"처음 df : {len(df)}")
df = df[~df['슈퍼호스트'].isnull()]
print(f"슈퍼호스트 제거 : {len(df)}")
df = df[df['리뷰수']>2]
print(f"리뷰수 제거 : {len(df)}")

# 숙소가격 null값 제거
df= df[~df['숙소_가격'].isnull()]

#가격 앞 통화기호 제거
df['숙소_가격'] = df['숙소_가격'].replace('[\$,]', '', regex=True).astype(float)

# 유형 제거 
df = df[(df['숙소_유형']=='Entire home/apt') | (df['숙소_유형']=='Private room')]
print(f"숙소_유형 제거 : {len(df)}")

# 12개월 리뷰수 0 개 제거ㅋ 
df = df[df['12개월_리뷰수']!=0]
print(f"12개월_리뷰수 0개 제거 : {len(df)}")

# 욕실수, 침실수, 침대수 null값 제거
df = df.dropna(subset=['욕실수', '침실수', '침대수'])
print(f"욕실수, 침실수, 침대수 제거 후 : {len(df)}")

# 리뷰  null값 제거
df = df.dropna(subset=['숙소_정확성_리뷰점수','숙소_청결도_리뷰점수','숙소_체크인_리뷰점수','숙소_소통_리뷰점수','숙소_위치_리뷰점수','숙소_가격_리뷰점수'])
print(f"리뷰 결측치 제거 후 : {len(df)}")


# 예약 가능여부 f 버리기
df = df.dropna(subset='예약가능여부')
print(f"예약가능여부 : {len(df)}")



from urllib.parse import urlparse, urlunparse, parse_qs, urlencode

# URL에 쿼리 파라미터를 추가하는 함수
def modify_url(url, params):
    url_parts = list(urlparse(url))
    query = dict(parse_qs(url_parts[4]))
    query.update(params)
    url_parts[4] = urlencode(query, doseq=True)
    return urlunparse(url_parts)

# 파라미터 추가
params = {'locale': 'en'}

# 각 URL을 순회하면서 수정
df['url'] = [modify_url(url, params) for url in df['url']]

처음 df : 84397
슈퍼호스트 제거 : 84306
리뷰수 제거 : 47841
숙소_유형 제거 : 34280
12개월_리뷰수 0개 제거 : 31052
욕실수, 침실수, 침대수 제거 후 : 30997
리뷰 결측치 제거 후 : 30997
예약가능여부 : 30990


In [13]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import random
import re
from datetime import datetime, timedelta

def convert_date(date_str):
    today = datetime.now()
    match_ago = re.search(r"(\d+) (week|day)s? ago", date_str)
    if match_ago:
        number, unit = match_ago.groups()
        number = int(number)
        if 'week' in unit:
            date = today - timedelta(weeks=number)
        elif 'day' in unit:
            date = today - timedelta(days=number)
        return date.strftime("%Y-%m-%d")
    match_month_year = re.search(r"(\w+) \d{4}", date_str)
    if match_month_year:
        return datetime.strptime(match_month_year.group(), "%B %Y").strftime("%Y-%m-%d")
    return "없음"

def airbnb_reviews(url):
    random_sec = random.uniform(1, 5)
    chrome_driver_path = 'C:/chromedriver-win64/chromedriver.exe'
    chrome_service = Service(chrome_driver_path)
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_experimental_option("detach", True)
    driver = webdriver.Chrome(service=chrome_service, options=options)
    driver.set_window_size(1920, 1080)
    driver.get(url)
    time.sleep(random_sec)

    # 번역 모달창 닫기
    try:
        translation_modal_close_button = WebDriverWait(driver, random_sec).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Close"]'))
        )
        translation_modal_close_button.click()
        time.sleep(1)
    except Exception as e:
        print("번역 모달 창 실패 :", e)


    # 리뷰버튼 클릭
    try:
        review_button = WebDriverWait(driver, random_sec).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Show all') and contains(text(), 'reviews')]"))
        )
        review_button.click()
    except Exception as e:
        print("리뷰 버튼 실패 :", e)
        driver.quit()
        return []
    
    #모달 열리기  
    try:
        scroll_panel = WebDriverWait(driver, random_sec).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-testid="pdp-reviews-modal-scrollable-panel"]'))
        )
    except Exception as e:
        print("리뷰 모달창 실패 :", e)
        driver.quit()
        return []
    
    # 처음에 강제로 스크롤 내리기
    driver.execute_script("""
        arguments[0].scrollTop = arguments[0].scrollHeight;
        arguments[0].dispatchEvent(new Event('scroll'));
    """, scroll_panel)
    time.sleep(1) 

    # 무한 스크롤을 통해 모든 리뷰를 로드
    last_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)

    for _ in range(2):
        # 스크롤을 내리는 동작 (패널의 마지막 부분으로 이동)
        driver.execute_script("arguments[0].scrollIntoView(false);", scroll_panel)
        time.sleep(1)  # 스크롤 후 로딩을 위해 잠시 대기

        # 새로운 스크롤 높이를 가져오기
        new_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)
        last_height = new_height  # 다음 반복을 위해 마지막 높이를 업데이트

    reviews = []
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    review_container = soup.find('div', {'data-testid': 'pdp-reviews-modal-scrollable-panel'})
    if review_container:
        review_tags = review_container.find_all('span', class_='lrl13de atm_kd_19r6f69_24z95b atm_kd_19r6f69_1xbvphn_1oszvuo dir dir-ltr')
        for i in range(1, len(review_tags) + 1):
            text = review_tags[i-1].text.strip()
            date_elements = driver.find_elements(By.XPATH, f"/html/body/div[9]/div/div/section/div/div/div[2]/div/div[3]/div/div/div/section/div/section/div/div[2]/div[2]/div/div[{i}]/div[1]/div")
            if date_elements:
                dates = [element.text for element in date_elements if element.text.strip() != '']
                dates = [convert_date(date) for date in dates]
                if dates:  
                    reviews.append({'리뷰': text, '리뷰날짜': dates[0]})
                else:
                    reviews.append({'리뷰': text, '리뷰날짜': '날짜 정보 없음'})
            else:
                reviews.append({'리뷰': text, '리뷰날짜': '날짜 정보 없음'})
    driver.quit()

    reviews_df = pd.DataFrame(reviews)
    reviews_df['리뷰날짜'] = pd.to_datetime(reviews_df['리뷰날짜'],errors='coerce', format="%Y-%m-%d")
    filtered_reviews_df = reviews_df[reviews_df['리뷰날짜'] >= "2024-04-01"]

    return filtered_reviews_df


df_subset = df[['숙소_id', 'url']]

# 새로운 DataFrame 생성
paris_reviews = pd.DataFrame(columns=['url', '숙소_id', '리뷰', '리뷰날짜'])

# 각 숙소에 대해 리뷰 정보 수집
for index, row in df_subset.iterrows():
    review_data = airbnb_reviews(row['url'])
    if index == 2:
        break
    review_data['url'] = row['url']
    review_data['숙소_id'] = row['숙소_id']
    paris_reviews = pd.concat([paris_reviews, review_data], ignore_index=True)



  paris_reviews = pd.concat([paris_reviews, review_data], ignore_index=True)


In [14]:

paris_reviews

Unnamed: 0,url,숙소_id,리뷰,리뷰날짜
0,https://www.airbnb.com/rooms/165409?locale=en,165409,"Great apartment, fantastic location.",2024-05-31
1,https://www.airbnb.com/rooms/165409?locale=en,165409,Thank you Gillian for your review. We hope to ...,2024-05-27
2,https://www.airbnb.com/rooms/165409?locale=en,165409,"They were kind and attentive, the location was...",2024-05-13
3,https://www.airbnb.com/rooms/165409?locale=en,165409,The accommodation is located directly in the b...,2024-04-01
4,https://www.airbnb.com/rooms/165409?locale=en,165409,It was lovely ! Thanks so much.,2024-04-01
