In [1]:
import psycopg2

In [123]:
conn = psycopg2.connect(database = "travellocations", 
                        user = "postgres", 
                        host= 'localhost',
                        password = "cong2006",
                        port = 5432)

In [122]:
cur = conn.cursor()

# Thực thi các lệnh SQL để tạo schema, loại dữ liệu, và bảng
cur.execute("""
DROP SCHEMA IF EXISTS places CASCADE;
CREATE SCHEMA IF NOT EXISTS places;
SET search_path TO places;

CREATE TYPE address AS (
    details TEXT,
    ward TEXT,
    district TEXT,
    city TEXT
);

CREATE TYPE location AS (
    latitude DECIMAL(9, 6),
    longitude DECIMAL(9, 6)
);

CREATE TABLE places.hotels (
    hotel_id SERIAL PRIMARY KEY,
    name VARCHAR(255),
    address address,
    location location,
    rating DECIMAL(2, 1),
    description TEXT,
    img_url JSON,
    comments TEXT
);

CREATE TABLE places.hotel_price_range (
    id SERIAL PRIMARY KEY,
    hotel_id INT REFERENCES places.hotels(hotel_id) ON DELETE CASCADE,
    room_type VARCHAR(100),
    occupancy INT,
    price DECIMAL(10, 2)
);
""")

# Cam kết các thay đổi vào cơ sở dữ liệu
conn.commit()

# # Đóng con trỏ và kết nối
cur.close()
conn.close()


In [125]:
hotel_name = 'Grand Hotel'
address_details = '123 Main St'
address_ward = 'Main ward'
address_district = 'Downtown'
address_city = 'New York'
latitude = 40.712776
longitude = -74.005974
rating = 4.5
description = 'A luxury hotel in the heart of the city.'
img_url =   '{"img_1": "http://example.com/image1.jpg", "img_2": "http://example.com/image2.jpg"}' 
comments = 'Great amenities and service.'
room_type = 'Suite'
occupancy = 2
price = 299.99
cur = conn.cursor()

cur.execute("""
    INSERT INTO places.hotels (name, address, location, rating, description, img_url, comments)
    VALUES (%s, ROW(%s, %s, %s, %s), ROW(%s, %s), %s, %s, %s, %s)
    RETURNING hotel_id;
""", (hotel_name, address_details, address_ward, address_district, address_city, latitude, longitude, rating, description, img_url, comments))

# Lấy ID của khách sạn mới chèn
hotel_id = cur.fetchone()[0]

# Chèn dữ liệu vào bảng hotel_price_range
cur.execute("""
    INSERT INTO places.hotel_price_range (hotel_id, room_type, occupancy, price)
    VALUES (%s, %s, %s, %s);
""", (hotel_id, room_type, occupancy, price))

# Cam kết các thay đổi vào cơ sở dữ liệu
conn.commit()

# # Đóng con trỏ và kết nối
cur.close()
conn.close()

In [27]:
import requests
from bs4 import BeautifulSoup as bp
import time
import csv

In [21]:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
REQUEST_HEADER = {
    'User-Agent': USER_AGENT,
    'Accept-language': 'en-US, en;q=0.5',
}

In [51]:
!pip install -qU langchain-google-genai

In [195]:
import getpass
import os

os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [196]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [209]:
import json
import geocoder

class HotelScraper:
    def __init__(self, db_params):
        self.conn = psycopg2.connect(**db_params)
        self.cur = self.conn.cursor()

    def get_page_html(self, url):
        res = requests.get(url=url, headers=REQUEST_HEADER)
        return res.text

    def get_hotel_price(self, soup):
        price_element = soup.find('div', attrs={'style': 'color: rgb(255, 94, 31); font-size: 20px;'})
        if price_element:
            true_price = price_element.text.strip().replace('VND', '').replace('.', '')
            return float(true_price)
        return None

    def get_hotel_name(self, soup):
        name = soup.find('div', class_='css-901oao r-a5wbuh r-1enofrn r-b88u0q r-1cwl3u0 r-fdjqy7 r-3s2u2q')
        return name.text.strip() if name else None

    def get_hotel_rating(self, soup):
        rating = soup.find('div', class_='css-901oao r-jwli3a r-a5wbuh r-s67bdx r-b88u0q r-10cxs7j r-q4m81j')
        return rating.text.strip() if rating else None

    def get_hotel_des(self, soup):
        des = soup.find('div', attrs={'style': 'font-family:Godwit, -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Arial, sans-serif, Apple Color Emoji, Segoe UI Emoji, Segoe UI Symbol;font-size:14px;line-height:20px;max-height:80px;overflow:hidden'})
        return des.text.strip().replace('\n', '') if des else None


    def get_hotel_address(self, soup):
        address = soup.find('div', class_='css-901oao css-cens5h r-13awgt0 r-a5wbuh r-1b43r93 r-majxgm r-rjixqe r-fdjqy7')
        if address:
            address_full = address.text.strip().replace('\t', '')
            
            prompt = f"""
                Separate the following address into 4 parts: house number, ward name, district name, and city name:
                {address_full}

                Provide the result in Vietnamese and strictly in the following format:
                {{
                "details": "...",
                "ward": "...",
                "district": "...",
                "city": "..."
                }}
                """
            # Sử dụng mô hình ngôn ngữ để phân tích địa chỉ
            response = llm.invoke(prompt)
            response_text = str(response.content)
            cleaned_json_str = response_text.strip('```json\n').strip('```\n')
            try:
                result_dict = json.loads(cleaned_json_str)
                return result_dict
            except json.JSONDecodeError:
                return {'details': '', 'ward': '', 'district': '', 'city': ''}

    def get_hotel_location(self, address):
        addr = str(address['ward']+address['district']+address['city']) 
        g = geocoder.osm(addr)
        location = g.osm
        latitude = location['y']
        longitude = location['x']
        
        return {'latitude':latitude, 'longitude':longitude}
        
        
    def get_hotel_comments(self, soup):
        comments = []
        a = soup.findAll('div', class_='css-901oao css-cens5h r-cwxd7f r-a5wbuh r-1b43r93 r-majxgm r-rjixqe r-fdjqy7')
        for comment in a:
            comments.append(comment.text.strip())
        return comments

    def insert_hotel_data(self, info):
        try:
            # Insert hotel data
            self.cur.execute("""
                INSERT INTO places.hotels (name, address, location, rating, description, img_url, comments)
                VALUES (%s, ROW(%s, %s, %s, %s), ROW(%s, %s), %s, %s, %s, %s)
                RETURNING hotel_id;
            """, (
                info['name'],
                info['address']['details'],
                info['address']['ward'],
                info['address']['district'],
                info['address']['city'],
                info['location']['latitude'],
                info['location']['longitude'],
                info['rating'],
                info['description'],
                info.get('img_url', None),
                info['comments']
            ))

            # Get the newly inserted hotel_id
            hotel_id = self.cur.fetchone()[0]

            # Insert hotel price range data
            if 'price' in info:
                self.cur.execute("""
                    INSERT INTO places.hotel_price_range (hotel_id, room_type, occupancy, price)
                    VALUES (%s, %s, %s, %s)
                """, (
                    hotel_id,
                    info.get('room_type', None),
                    info.get('occupancy', None),
                    info['price']
                ))

            # Commit the transaction
            self.conn.commit()
        except Exception as e:
            print(f"Error inserting data: {e}")
            self.conn.rollback()

    def extract_hotels_url(self, url):
        info = {}
        html = self.get_page_html(url)
        soup = bp(html, 'lxml')
        info['name'] = self.get_hotel_name(soup)
        info['price'] = self.get_hotel_price(soup)
        info['rating'] = self.get_hotel_rating(soup)
        info['address'] = self.get_hotel_address(soup)
        info['location'] = self.get_hotel_location(info['address'])
        info['description'] = self.get_hotel_des(soup)
        info['comments'] = self.get_hotel_comments(soup)
        return info

    def close(self):
        self.cur.close()
        self.conn.close()

In [210]:
db_params = {
    'database': 'travellocations',
    'user': 'postgres',
    'host': 'localhost',
    'password': 'cong2006',
    'port': 5432
}
scraper = HotelScraper(db_params)

with open('hotels.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        url = row[0]
        data = scraper.extract_hotels_url(url)
        scraper.insert_hotel_data(data)
        time.sleep(0.25)

scraper.close()

Status code 403 from https://nominatim.openstreetmap.org/search: ERROR - 403 Client Error: Forbidden for url: https://nominatim.openstreetmap.org/search?q=&format=jsonv2&addressdetails=1&limit=1


TypeError: 'NoneType' object is not subscriptable

In [211]:
scraper = HotelScraper(db_params)
url = "https://www.traveloka.com/vi-vn/hotel/vietnam/hanoi-sunshine-hotel-1000000468475?spec=18-08-2024.19-08-2024.1.1.HOTEL.1000000468475..2"
html = scraper.get_page_html(url)
soup = bp(html, 'lxml')
address = scraper.get_hotel_address(soup)
print(address)
addr = str("phường "+address['ward']+", quận "+address['district']+", thành phố "+address['city']) 
print(addr)
g = geocoder.osm(addr)
location = g.osm
location
print(type(location))
latitude = location['y']
longitude = location['x']
print(latitude,longitude)

{'details': '18 Hàng Hòm', 'ward': 'Hàng Gai', 'district': 'Quận Hoàn Kiếm', 'city': 'Hà Nội'}
phường Hàng Gai, quận Quận Hoàn Kiếm, thành phố Hà Nội


Status code 403 from https://nominatim.openstreetmap.org/search: ERROR - 403 Client Error: Forbidden for url: https://nominatim.openstreetmap.org/search?q=ph%C6%B0%E1%BB%9Dng+H%C3%A0ng+Gai%2C+qu%E1%BA%ADn+Qu%C3%A2%CC%A3n+Ho%C3%A0n+Ki%E1%BA%BFm%2C+th%C3%A0nh+ph%E1%BB%91+H%C3%A0+N%E1%BB%99i&format=jsonv2&addressdetails=1&limit=1


<class 'NoneType'>


TypeError: 'NoneType' object is not subscriptable

In [162]:
!pip install geocoder

I0000 00:00:1723934514.000823   30586 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers




In [187]:
import geocoder
g = geocoder.osm('Hàng Gai, Hoàn Kiếm, Hà Nội, Việt Nam')
g.osm

{'x': 105.8490719,
 'y': 21.0321611,
 'addr:city': 'Thành phố Hà Nội',
 'addr:country': 'Việt Nam',
 'addr:postal': '11015'}