In [38]:
import psycopg2
import json
import requests
from bs4 import BeautifulSoup as bp
import csv
from datetime import datetime
import time
from selenium import webdriver
import re
from geopy.geocoders import Nominatim
import os

In [39]:
!pip install -q langchain
!pip install -q langchain_community
!pip install -q langchain-ollama

In [40]:
!pip install geopy



In [41]:
!pip install -qU langchain-google-genai

In [42]:
from langchain_community.chat_models import ChatOllama

# Chat model handle backed by the "llama3:latest" Ollama model, temperature 0.
# NOTE(review): `llm` is not referenced by any other cell visible here.
llm = ChatOllama(
    model="llama3:latest",
    temperature=0,
)

In [43]:
# Browser-like User-Agent string sent with every page request.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/127.0.0.0 Safari/537.36'
)

# HTTP headers passed to requests.get() by get_page_html().
REQUEST_HEADER = {
    'User-Agent': USER_AGENT,
    'Accept-language': 'en-US, en;q=0.5',
}
def get_page_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``REQUEST_HEADER`` (browser-like
    User-Agent and Accept-language headers).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            server would hang the whole scraping loop.

    Returns:
        ``Response.text`` of the reply. The HTTP status code is not checked,
        so the body of an error page is returned like any other page.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout elapses.
    """
    response = requests.get(url=url, headers=REQUEST_HEADER, timeout=timeout)
    return response.text


def get_hotel_price(soup):
    """Extract the hotel price from a parsed page.

    The price is read from the first ``<div>`` whose inline style is exactly
    ``color: rgb(255, 94, 31); font-size: 20px;``. The ``VND`` marker and the
    ``.`` separators are removed before the text is converted to a number.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The price as ``float``, or ``None`` when the price element is missing
        or its text is not a plain number after clean-up (for example a
        "sold out" label), so one odd page cannot abort a whole scrape run.
    """
    price_element = soup.find('div', attrs={'style': 'color: rgb(255, 94, 31); font-size: 20px;'})
    if not price_element:
        return None

    digits = price_element.text.strip().replace('VND', '').replace('.', '')
    try:
        return float(digits)
    except ValueError:
        # Text did not reduce to a number; treat it like a missing price.
        return None
            
def get_hotel_name(soup):
    """Return the hotel name found on a parsed page.

    Looks up the first ``<div>`` matching the hard-coded CSS class string and
    returns its text with surrounding whitespace removed, or ``None`` when no
    such element exists.
    """
    name_div = soup.find('div', class_='css-901oao r-a5wbuh r-1enofrn r-b88u0q r-1cwl3u0 r-fdjqy7 r-3s2u2q')
    if not name_div:
        return None
    return name_div.text.strip()


def get_hotel_rating(soup):
    """Return the hotel rating shown on a parsed page.

    The rating is the stripped text of the first ``<div>`` matching the
    hard-coded CSS class string. It is returned as text (no numeric
    conversion); ``None`` is returned when the element is absent.
    """
    rating_div = soup.find('div', class_='css-901oao r-jwli3a r-a5wbuh r-s67bdx r-b88u0q r-10cxs7j r-q4m81j')
    if not rating_div:
        return None
    return rating_div.text.strip()


def get_hotel_des(soup):
    """Return the hotel description from a parsed page.

    The description element is located by its exact inline ``style``
    attribute. Its text is stripped and every newline character is removed.
    Returns ``None`` when the element is absent.
    """
    description_style = 'font-family:Godwit, -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Arial, sans-serif, Apple Color Emoji, Segoe UI Emoji, Segoe UI Symbol;font-size:14px;line-height:20px;max-height:80px;overflow:hidden'
    description_div = soup.find('div', attrs={'style': description_style})
    if not description_div:
        return None
    stripped = description_div.text.strip()
    # Joining the newline-separated pieces drops every '\n' from the text.
    return ''.join(stripped.split('\n'))

def get_hotel_location(soup):
    """Return the hotel location text from a parsed page.

    Uses the first ``<div>`` matching the hard-coded CSS class string; the
    text is stripped and all tab characters are removed. Returns ``None``
    when the element is absent.
    """
    location_div = soup.find('div', class_='css-901oao css-cens5h r-13awgt0 r-a5wbuh r-1b43r93 r-majxgm r-rjixqe r-fdjqy7')
    if not location_div:
        return None
    stripped = location_div.text.strip()
    # Joining the tab-separated pieces drops every '\t' from the text.
    return ''.join(stripped.split('\t'))


def get_hotel_comments(soup):
    """Collect the guest comments of a parsed page as a JSON string.

    Every ``<div>`` matching the hard-coded CSS class string contributes its
    stripped text, in document order.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        A JSON array (``str``) of comment texts; ``"[]"`` when none are found.
    """
    # find_all replaces the BS3-era alias findAll, which Beautiful Soup has
    # deprecated.
    comment_divs = soup.find_all('div', class_='css-901oao css-cens5h r-cwxd7f r-a5wbuh r-1b43r93 r-majxgm r-rjixqe r-fdjqy7')
    return json.dumps([div.text.strip() for div in comment_divs])
def get_hotel_img_url(soup):
    """Collect the image URLs of a parsed page as a JSON string.

    Images are taken from the first ``<div>`` matching the hard-coded CSS
    class string.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A JSON array (``str``) with the ``src`` of each ``<img>`` inside that
        container, or ``None`` when the container is absent. Images without a
        ``src`` attribute (or with an empty one) are skipped rather than
        raising ``KeyError``.
    """
    gallery = soup.find('div', class_='css-1dbjc4n r-j9b53g r-1i97xy8 r-1ta3fxp r-18u37iz r-1z0tv5g r-1udh08x')
    if not gallery:
        return None

    # .get() tolerates <img> tags that carry no src attribute.
    sources = (img.get('src') for img in gallery.find_all('img'))
    return json.dumps([src for src in sources if src])

In [44]:
def insert_hotel_data(conn, info):
    """Insert one scraped hotel into ``travel_database.hotel``.

    Writes the ``name``, ``rating``, ``description``, ``img_url`` and
    ``comments`` entries of ``info`` and commits. ``img_url`` is optional
    (``None`` when missing); the other keys are required.

    Any exception raised while building or running the statement (including a
    missing key in ``info``) is printed and the transaction is rolled back;
    it is not re-raised. The cursor is closed in every case.

    Args:
        conn: Open database connection providing ``cursor()``, ``commit()``
            and ``rollback()``.
        info: Mapping with the hotel fields listed above.
    """
    cursor = conn.cursor()
    try:
        values = (
            info['name'],
            info['rating'],
            info['description'],
            info.get('img_url'),
            info['comments'],
        )
        cursor.execute("""
            INSERT INTO travel_database.hotel(name, rating, description, img_url, comments)
            VALUES (%s, %s, %s, %s, %s)
            
        """, values)

        # NOTE: info['price'] is not persisted. An earlier, disabled draft
        # read the new hotel_id back with cur.fetchone() and inserted
        # (hotel_id, room_type, occupancy, price) into
        # places.hotel_price_range.

        conn.commit()
    except Exception as exc:
        print(f"Error inserting data: {exc}")
        conn.rollback()
    finally:
        cursor.close()

In [45]:
def extract_hotels_info(url):
    """Download a hotel page and gather its scraped fields into a dict.

    The page is fetched with ``get_page_html`` and parsed with the ``lxml``
    parser; each field comes from the matching ``get_hotel_*`` helper.

    Args:
        url: Address of the hotel page.

    Returns:
        Dict with the keys ``name``, ``price``, ``rating``, ``description``,
        ``comments`` and ``img_url`` (in that order). A value is ``None``
        when its helper could not find the element.
    """
    soup = bp(get_page_html(url), 'lxml')
    # Address/location are not collected: the earlier draft for them was
    # disabled and relied on a get_hotel_address helper that is not defined
    # in the cells visible here.
    return {
        'name': get_hotel_name(soup),
        'price': get_hotel_price(soup),
        'rating': get_hotel_rating(soup),
        'description': get_hotel_des(soup),
        'comments': get_hotel_comments(soup),
        'img_url': get_hotel_img_url(soup),
    }

In [46]:
# Connection settings come from the standard PostgreSQL environment variables
# so that no credential is stored in the notebook. User, host, port and
# database fall back to the previous local defaults. The password has no
# fallback: when PGPASSWORD is unset, None is passed, psycopg2 drops the
# argument and libpq applies its own lookup (e.g. ~/.pgpass).
conn = psycopg2.connect(
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
    database=os.environ.get("PGDATABASE", "postgres"),
)

In [47]:
# Scrape every hotel URL listed in the first column of hotels.csv and store
# the result in the database, one row at a time.
# NOTE(review): `data` starts as a list but is rebound to each hotel's dict
# inside the loop; the binding is kept as-is in case later cells read it.
data = []
with open('hotels.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        # csv.reader yields [] for a blank line; skip those (and rows whose
        # first cell is empty) instead of failing with IndexError or
        # requesting an empty URL.
        if not row or not row[0].strip():
            continue
        url = row[0].strip()
        data = extract_hotels_info(url)
        insert_hotel_data(conn, data)
        # Short pause between pages, presumably to throttle requests.
        time.sleep(0.25)
    