<a href="https://colab.research.google.com/github/nkcong206/Android_Detection/blob/main/crawl_data/Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q langchain
!pip install -q langchain_community
!pip install -q langchain-ollama
!pip install geopy

In [None]:
import requests
from bs4 import BeautifulSoup as bp
import time
import csv
import json
from geopy.geocoders import Nominatim
import re
from langchain_community.chat_models import ChatOllama

In [None]:
# Download the Ollama install script from ollama.ai and execute it with sh.
# NOTE(review): piping a remote script straight into a shell runs it unreviewed — confirm the source is trusted.
!curl https://ollama.ai/install.sh | sh

In [None]:
import subprocess
import time
import threading

# Start the ollama server in a new process.
# stderr is merged into stdout so that a single reader drains both streams:
# a separate stderr pipe that nobody reads can fill up and block the server.
process = subprocess.Popen(['ollama', 'serve'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

# Function to print server output
def print_output(process):
    """Echo everything the process writes to its stdout pipe, line by line.

    Args:
        process: A ``subprocess.Popen``-like object whose ``stdout`` is a
            binary pipe.

    Returns:
        None. The function returns once the pipe reports end-of-file.
    """
    # readline() blocks until a line is available and returns b'' only at
    # end-of-file, so no polling sleep is needed between reads.
    for raw_line in iter(process.stdout.readline, b''):
        # errors='replace' keeps the reader alive if the server ever emits
        # bytes that are not valid UTF-8.
        print(raw_line.strip().decode('utf-8', errors='replace'))

# Start a thread to print server output
# The reader runs on its own thread, so this cell can finish while
# print_output keeps reading from the server process.
# NOTE(review): the thread is non-daemon and is never joined — confirm that is intended.
thread = threading.Thread(target=print_output, args=(process,))
thread.start()

print("Ollama server is running in the background")

In [None]:
# Pull the llama3:latest model with the ollama CLI; the same model name is
# passed to ChatOllama in the following cell.
!ollama pull llama3:latest

In [None]:
# Chat model handle used by get_hotel_address to split raw address strings.
# temperature=0 is passed to the model — presumably to keep replies as repeatable as possible.
llm = ChatOllama(model="llama3:latest", temperature=0)

In [None]:
# Browser-style User-Agent string placed in the request headers below.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
# Headers that get_page_html sends with every requests.get() call.
REQUEST_HEADER = {
    'User-Agent': USER_AGENT,
    'Accept-language': 'en-US, en;q=0.5',
}


```
DROP SCHEMA IF EXISTS places CASCADE;
CREATE SCHEMA IF NOT EXISTS places;
SET search_path TO places;

CREATE TYPE address AS (
    street TEXT,
    district TEXT,
    city TEXT
);

CREATE TYPE location AS (
    latitude DECIMAL(9, 6),
    longitude DECIMAL(9, 6)
);

CREATE TABLE places.hotels (
    hotel_id SERIAL PRIMARY KEY,
    name VARCHAR(255),
    address address,
    location location,
    rating DECIMAL(2, 1),
    description TEXT,
    img_url JSON,
    comments TEXT
);

CREATE TABLE places.hotel_price_range (
    id SERIAL PRIMARY KEY,
    hotel_id INT REFERENCES places.hotels(hotel_id) ON DELETE CASCADE,
    room_type VARCHAR(100),
    occupancy INT,
    price DECIMAL(10, 2)
);
```



In [None]:
def get_page_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response body as decoded by requests (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    res = requests.get(url=url, headers=REQUEST_HEADER, timeout=timeout)
    return res.text

def get_hotel_price(soup):
    """Read the displayed hotel price from the page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The price as a float, or None when the price element is missing or
        its text is not a number (for example a "contact us" label).
    """
    price_element = soup.find('div', attrs={'style': 'color: rgb(255, 94, 31); font-size: 20px;'})
    if not price_element:
        return None

    # The site prints prices like "1.234.567 VND": drop the currency label and
    # the thousands separators before converting.
    digits = price_element.text.replace('VND', '').replace('.', '').strip()
    try:
        return float(digits)
    except ValueError:
        # Unparsable text is treated the same as a missing price instead of
        # aborting the whole extraction.
        return None

def get_hotel_name(soup):
    """Return the hotel name with surrounding whitespace removed, or None if the name element is absent."""
    name_div = soup.find('div', class_='css-901oao r-a5wbuh r-1enofrn r-b88u0q r-1cwl3u0 r-fdjqy7 r-3s2u2q')
    if not name_div:
        return None
    return name_div.text.strip()

def get_hotel_rating(soup):
    """Return the rating text exactly as shown on the page (stripped), or None if the rating element is absent."""
    rating_div = soup.find('div', class_='css-901oao r-jwli3a r-a5wbuh r-s67bdx r-b88u0q r-10cxs7j r-q4m81j')
    if not rating_div:
        return None
    return rating_div.text.strip()

def get_hotel_des(soup):
    """Return the hotel description as a single line, or None if the description element is absent.

    Surrounding whitespace is stripped and every newline inside the text is removed.
    """
    des_div = soup.find('div', attrs={'style': 'font-family:Godwit, -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Arial, sans-serif, Apple Color Emoji, Segoe UI Emoji, Segoe UI Symbol;font-size:14px;line-height:20px;max-height:80px;overflow:hidden'})
    if des_div:
        stripped = des_div.text.strip()
        return stripped.replace('\n', '')
    return None

def get_hotel_address(soup):
    """Extract the hotel address and split it into street, district and city.

    The raw address text is taken from the page and handed to the
    module-level ``llm`` with a prompt asking for a JSON object holding the
    three parts. The first ``{...}`` span in the reply is parsed as JSON.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A dict that always contains the keys ``street``, ``district`` and
        ``city``. All three are empty strings when the address element is
        missing, when the reply contains no JSON object, or when that object
        cannot be parsed. Keys the model omits default to ''.
    """
    # Same shape in every failure case, so callers can index the result safely.
    empty_address = {'street': '', 'district': '', 'city': ''}

    address = soup.find('div', class_='css-901oao css-cens5h r-13awgt0 r-a5wbuh r-1b43r93 r-majxgm r-rjixqe r-fdjqy7')
    if not address:
        return empty_address
    address_full = address.text.strip().replace('\t', '')

    prompt = f"""
            Please separate the following address into 3 parts: street, district, city:
            {address_full}

            Ensure that the "district" value is one of the following 12 options: Ba Đình, Cầu Giấy, Đống Đa, Hai Bà Trưng, Hoàn Kiếm, Thanh Xuân, Hoàng Mai, Long Biên, Hà Đông, Tây Hồ, Nam Từ Liêm, Bắc Từ Liêm.
            The "city" value must be Hà Nội.

            Provide the result in the following JSON format:
            {{
            "street": "...",
            "district": "...",
            "city": "..."
            }}
        """
    response = llm.invoke(prompt)
    response_text = str(response.content)

    # The model may wrap the JSON in prose, or return no JSON at all.
    json_match = re.search(r'\{.*?\}', response_text, re.DOTALL)
    if json_match is None:
        return empty_address
    try:
        parsed = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        return empty_address

    # Fill in any key the model left out.
    return {**empty_address, **parsed}

def get_hotel_location(address):
    """Geocode a split address with Nominatim.

    Args:
        address: Dict with ``street``, ``district`` and ``city`` entries, or
            None. Missing, None and blank entries are left out of the query.

    Returns:
        ``{'latitude': ..., 'longitude': ...}`` holding the coordinates of
        the geocoder's result. Both values are empty strings when there is
        nothing to look up or the geocoder finds no match.
    """
    not_found = {'latitude': '', 'longitude': ''}

    # Build the query only from parts that carry text, so an unknown address
    # does not turn into a ", , " lookup.
    parts = [(address or {}).get(key) for key in ('street', 'district', 'city')]
    cleaned = [str(part).strip() for part in parts if part is not None]
    query = ', '.join(part for part in cleaned if part)
    if not query:
        return not_found

    geolocator = Nominatim(user_agent="my_geocoder")
    location = geolocator.geocode(query)
    if location:
        return {'latitude': location.latitude, 'longitude': location.longitude}
    return not_found

def get_hotel_comments(soup):
    """Collect the text of every guest comment on the page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        A list with the stripped text of each matching element, in page
        order; empty when there are none.
    """
    # find_all is the current name of the deprecated findAll alias.
    comment_divs = soup.find_all('div', class_='css-901oao css-cens5h r-cwxd7f r-a5wbuh r-1b43r93 r-majxgm r-rjixqe r-fdjqy7')
    return [div.text.strip() for div in comment_divs]

def insert_hotel_data(conn, info):
    """Insert one scraped hotel, and its price if known, in a single transaction.

    Args:
        conn: DB-API style connection providing ``cursor()``, ``commit()``
            and ``rollback()``.
        info: Dict with ``name``, ``address`` (street/district/city),
            ``location`` (latitude/longitude), ``rating``, ``description``
            and ``comments``. Optional keys: ``img_url``, ``price``,
            ``room_type``, ``occupancy``.

    Returns:
        None. Any error is printed and the transaction is rolled back; the
        cursor is closed in every case.
    """
    def _none_if_blank(value):
        # The location helpers report "not found" as ''; send NULL instead,
        # because '' is not a valid value for the numeric coordinate fields.
        return None if value == '' else value

    cur = conn.cursor()
    try:
        cur.execute("""
            INSERT INTO places.hotels (name, address, location, rating, description, img_url, comments)
            VALUES (%s, ROW(%s, %s, %s), ROW(%s, %s), %s, %s, %s, %s)
            RETURNING hotel_id;
        """, (
            info['name'],
            info['address']['street'],
            info['address']['district'],
            info['address']['city'],
            _none_if_blank(info['location']['latitude']),
            _none_if_blank(info['location']['longitude']),
            info['rating'],
            info['description'],
            info.get('img_url', None),
            info['comments']
        ))

        hotel_id = cur.fetchone()[0]

        # extract_hotels_info always sets 'price', possibly to None; only
        # store a price row when there is an actual price.
        if info.get('price') is not None:
            cur.execute("""
                INSERT INTO places.hotel_price_range (hotel_id, room_type, occupancy, price)
                VALUES (%s, %s, %s, %s)
            """, (
                hotel_id,
                info.get('room_type', None),
                info.get('occupancy', None),
                info['price']
            ))

        conn.commit()
    except Exception as e:
        print(f"Error inserting data: {e}")
        conn.rollback()
    finally:
        cur.close()

def extract_hotels_info(url):
    """Fetch a hotel page and gather every scraped field into one dict.

    Args:
        url: Address of the hotel page.

    Returns:
        Dict with the keys ``name``, ``price``, ``rating``, ``address``,
        ``location``, ``description`` and ``comments``, each filled by the
        matching ``get_hotel_*`` helper. ``location`` is derived from the
        extracted ``address``.
    """
    soup = bp(get_page_html(url), 'lxml')

    name = get_hotel_name(soup)
    price = get_hotel_price(soup)
    rating = get_hotel_rating(soup)
    address = get_hotel_address(soup)

    return {
        'name': name,
        'price': price,
        'rating': rating,
        'address': address,
        'location': get_hotel_location(address),
        'description': get_hotel_des(soup),
        'comments': get_hotel_comments(soup),
    }

In [None]:
# Manual check on a single Traveloka hotel page: download and parse it, then
# print the split address followed by the coordinates derived from it.
url = "https://www.traveloka.com/vi-vn/hotel/vietnam/classy-holiday-hotel--spa-1000000430274?spec=18-08-2024.19-08-2024.1.1.HOTEL.1000000430274..2"
html = get_page_html(url)
soup = bp(html, 'lxml')
# Calls the llm to split the raw address text into street / district / city.
address = get_hotel_address(soup)
print(address)
# Geocodes the split address with Nominatim.
location = get_hotel_location(address)
print(location)

In [None]:
def get_page_html(url, timeout=30):
    """Download a page and return its body as text.

    This cell redefines the helper of the same name from the hotel section
    of the notebook.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response body as decoded by requests (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    res = requests.get(url=url, headers=REQUEST_HEADER, timeout=timeout)
    return res.text

def get_restaurant_name(soup):
    """Return the stripped text of the ``<h1 itemprop="name">`` element, or None if it is absent."""
    heading = soup.find('h1', itemprop='name')
    if not heading:
        return None
    return heading.text.strip()

def get_restaurant_address(soup):
    """Read the address parts from the page's ``itemprop`` spans.

    Returns:
        Dict with ``street`` (streetAddress), ``district`` (addressLocality)
        and ``city`` (addressRegion). A part is None when its span is absent.
    """
    def _span_text(prop):
        # Text of the <span itemprop=prop>, or None when the span is absent.
        span = soup.find('span', itemprop=prop)
        if span:
            return span.get_text(strip=True)
        return None

    return {
        'street': _span_text('streetAddress'),
        'district': _span_text('addressLocality'),
        'city': _span_text('addressRegion'),
    }

def get_restaurant_location(address):
    """Geocode a restaurant address with Nominatim.

    Args:
        address: Dict with ``street``, ``district`` and ``city`` entries, or
            None. Entries that are missing, None or blank are left out of
            the query, so they are never sent as the literal text "None".

    Returns:
        ``{'latitude': ..., 'longitude': ...}`` for the geocoder's result,
        or None when there is nothing to look up or no match is found.
    """
    parts = [(address or {}).get(key) for key in ('street', 'district', 'city')]
    cleaned = [str(part).strip() for part in parts if part is not None]
    query = ', '.join(part for part in cleaned if part)
    if not query:
        # Nothing usable to search for; skip the network round-trip.
        return None

    geolocator = Nominatim(user_agent="my_geocoder")
    location = geolocator.geocode(query)
    if location:
        return {'latitude': location.latitude, 'longitude': location.longitude}
    return None

def get_restaurant_rating(soup):
    """Return the average-rating text from the page, or None if the rating element is absent."""
    rating_div = soup.find('div', itemprop='ratingValue', class_='microsite-point-avg')
    if not rating_div:
        return None
    return rating_div.get_text(strip=True)

def get_restaurant_description(soup):
    """Collect the descriptive fields shown for a restaurant.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        Dict with ``cuisine``, ``audience`` and ``category``. Each value is
        the stripped text of its element, or None when the element is absent.
        Non-breaking spaces in the audience text are replaced by plain spaces.
    """
    def _text_of(tag):
        # Stripped text of a tag, or None when the tag was not found.
        return tag.get_text(strip=True) if tag else None

    # Extract cuisine type
    cuisine_text = _text_of(soup.find('div', itemprop='servesCuisine'))

    # Extract audience
    audience_text = _text_of(soup.find('div', class_='audiences'))
    if audience_text is not None:
        # The parser has already decoded "&nbsp;" into U+00A0, so that is the
        # character to replace. The literal entity is still handled in case
        # the page double-escapes it.
        audience_text = audience_text.replace('\xa0', ' ').replace('&nbsp;', ' ')

    # Extract category
    category_text = _text_of(soup.find('div', class_='category-items'))

    # Combine all parts into a single description
    return {
        'cuisine': cuisine_text,
        'audience': audience_text,
        'category': category_text
    }

def get_restaurant_comments(soup):
    """Collect the text of every ``<div class="comment">`` on the page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        A list with the stripped text of each comment element, in page
        order; empty when there are none.
    """
    # find_all is the current name of the deprecated findAll alias.
    comment_elements = soup.find_all('div', class_='comment')
    return [element.text.strip() for element in comment_elements]

def extract_restaurant_info(url):
    """Fetch a restaurant page and gather every scraped field into one dict.

    Args:
        url: Address of the restaurant page.

    Returns:
        Dict with the keys ``name``, ``address``, ``location``, ``rating``,
        ``description`` and ``comments``, each filled by the matching
        ``get_restaurant_*`` helper. ``location`` is derived from the
        extracted ``address``.
    """
    soup = bp(get_page_html(url=url), 'lxml')

    name = get_restaurant_name(soup)
    address = get_restaurant_address(soup)

    return {
        'name': name,
        'address': address,
        'location': get_restaurant_location(address),
        'rating': get_restaurant_rating(soup),
        'description': get_restaurant_description(soup),
        'comments': get_restaurant_comments(soup),
    }

In [None]:
# Manual check on a single Foody restaurant page: download and parse it, then
# print the name, rating and description fields.
url = "https://www.foody.vn/ha-noi/kfc-tay-son"
html = get_page_html(url)
soup = bp(html, 'lxml')
print(get_restaurant_name(soup))
print(get_restaurant_rating(soup))
print(get_restaurant_description(soup))
# Comment dump is left disabled.
#print(get_restaurant_comments(soup))