In [1]:
import os, glob, time
import pandas as pd
import sqlite3
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import Statistical_Analysis, House_Price_Prediction

In [2]:
# Location of the scraped CSV; its parent folder is created on first run and
# reused afterwards (exist_ok=True makes re-running this cell harmless).
Dataset_Path = os.path.join("Real Property Dataset", "Real Property Information.csv")
os.makedirs(os.path.dirname(Dataset_Path), exist_ok=True)

In [3]:
def Scrape_Data(driver):
    """Extract the listing fields from the page currently loaded in `driver`.

    The page source is parsed with BeautifulSoup (lxml parser) and six tag
    lists are collected by CSS class: title, area, bedrooms, bathrooms,
    price and location.

    Parameters
    ----------
    driver : object exposing ``page_source`` (the Selenium driver in this notebook).

    Returns
    -------
    list[dict]
        One dict per listing with the keys 'House Title', 'Area', 'Bedrooms',
        'Bathrooms', 'Location' and 'Price'; every value is stripped text.
    """
    Soup = BeautifulSoup(driver.page_source, "lxml")

    def Tags_With_Class(Tag_Name, Css_Class):
        # Thin wrapper so the six lookups below read as a table.
        return Soup.find_all(Tag_Name, class_=Css_Class)

    Title_Tags = Tags_With_Class("span", "pr-title js__card-title")
    Area_Tags = Tags_With_Class("span", "re__card-config-area js__card-config-item")
    Bedroom_Tags = Tags_With_Class("span", "re__card-config-bedroom js__card-config-item")
    Bathroom_Tags = Tags_With_Class("span", "re__card-config-toilet js__card-config-item")
    Location_Tags = Tags_With_Class("div", "re__card-location")
    Price_Tags = Tags_With_Class("span", "re__card-config-price js__card-config-item")

    # The six lists are paired purely by position and zip() stops at the
    # shortest one.
    # NOTE(review): assumes every listing card exposes all six fields; if a
    # card lacks one, later rows would combine values from different cards —
    # confirm against the site markup.
    Paired_Tags = zip(Title_Tags, Area_Tags, Bedroom_Tags, Bathroom_Tags, Price_Tags, Location_Tags)

    return [
        {
            'House Title': Title_Tag.text.strip(),
            'Area': Area_Tag.text.strip(),
            # Bedroom/bathroom counts are read from a nested <span>.
            'Bedrooms': Bedroom_Tag.find('span').text.strip(),
            'Bathrooms': Bathroom_Tag.find('span').text.strip(),
            'Location': Location_Tag.text.strip(),
            'Price': Price_Tag.text.strip(),
        }
        for Title_Tag, Area_Tag, Bedroom_Tag, Bathroom_Tag, Price_Tag, Location_Tag in Paired_Tags
    ]

In [4]:
def Browser_Automation():
    """Crawl the listing pages in an Edge browser and collect every listing.

    Opens the hard-coded batdongsan.com.vn search URL, then repeats:
    close any popups, scrape the current page with `Scrape_Data`, click the
    "next page" chevron. The loop ends on the first exception raised inside
    it; whatever was collected up to that point is returned.

    Returns
    -------
    list[dict]
        The concatenated results of `Scrape_Data` for every visited page.

    Raises
    ------
    Any exception raised while sizing the window or loading the first page
    propagates to the caller; the browser is still shut down in that case.
    """
    URL = "https://batdongsan.com.vn/ban-nha-rieng-tp-hcm?cIds=325,163"
    House_List = []

    driver = webdriver.Edge()

    def close_popups():
        # Keep clicking "span.close" elements until none becomes clickable
        # within 0.5 s; any error (including the wait timing out) ends the sweep.
        while True:
            try:
                close_button = WebDriverWait(driver, 0.5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "span.close")))
                close_button.click()
                time.sleep(0.25)
            except Exception:
                break

    # try/finally guarantees the browser session is released even when the
    # initial page load fails or the crawl is interrupted (e.g. Ctrl-C).
    try:
        driver.set_window_size(1280, 720)
        driver.get(URL)

        while True:
            try:
                close_popups()
                House_List.extend(Scrape_Data(driver))
                Next_Page = WebDriverWait(driver, 1.5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a.re__pagination-icon i.re__icon-chevron-right--sm")))
                Next_Page.click()
                time.sleep(1)
            except Exception:
                # Deliberately best-effort: any failure in this iteration ends
                # the crawl and keeps what was gathered so far. This covers the
                # next-page control not becoming clickable, but also errors
                # raised while scraping.
                break
    finally:
        driver.quit()

    return House_List

# Run the full crawl and load the scraped rows into a DataFrame.
Scrape_House_List = pd.DataFrame(Browser_Automation())
# Persist the raw scrape to Dataset_Path (header row, no index column, UTF-8 with BOM).
Scrape_House_List.to_csv(Dataset_Path, header=True, index=False, encoding="utf-8-sig")

In [4]:
def Clean_Area_Column(Dataset):
    """Return the "Area" column of `Dataset` as floats.

    If the column already converts to float it is returned as such.
    Otherwise it is treated as text like ``"1.234,5 m²"``: the ``m²`` unit is
    removed, ``.`` is dropped, ``,`` becomes the decimal point, surrounding
    whitespace is stripped, and the result is converted to float.

    Parameters
    ----------
    Dataset : pandas.DataFrame with an "Area" column.

    Returns
    -------
    pandas.Series of float. `Dataset` itself is not modified.

    Raises
    ------
    KeyError if there is no "Area" column; ValueError if a value still cannot
    be parsed as a number after cleaning.
    """
    Area = Dataset["Area"]

    # Fast path: column is already numeric (e.g. a previously cleaned CSV).
    try:
        return Area.astype(float)
    except (ValueError, TypeError):
        pass

    # regex=False is explicit so "." is always a literal dot, never a wildcard,
    # whatever the installed pandas default is.
    Cleaned = (
        Area.str.replace("m²", "", regex=False)
        .str.replace(".", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.strip()
    )
    return Cleaned.astype(float)

def Clean_Locations_Column(Dataset):
    """Reduce each "Location" value to a known district name.

    The column is cast to str, the literal sequence "·\\r\\n" is removed, and
    each value is split on commas; the first comma-separated piece that
    (after stripping whitespace) exactly matches a known district is kept.
    Values with no matching piece become None.

    The "Location" column of `Dataset` is overwritten in place and also
    returned.
    """
    Known_Districts = [
        "Quận 1", "Quận 2", "Quận 3", "Quận 4", "Quận 5", "Quận 6", "Quận 7", "Quận 8", "Quận 9",
        "Quận 10", "Quận 11", "Quận 12", "Bình Thạnh", "Gò Vấp", "Phú Nhuận", "Tân Bình",
        "Tân Phú", "Bình Tân", "Thủ Đức", "Nhà Bè", "Hóc Môn", "Bình Chánh", "Củ Chi", "Cần Giờ"
    ]

    def Pick_District(Raw_Location):
        # First comma-separated piece that is a known district, else None.
        Pieces = (Piece.strip() for Piece in Raw_Location.split(","))
        return next((Piece for Piece in Pieces if Piece in Known_Districts), None)

    As_Text = Dataset["Location"].astype(str)
    Without_Bullet = As_Text.str.replace("·\r\n", "")
    Dataset["Location"] = Without_Bullet.apply(Pick_District)
    return Dataset["Location"]

def Clean_Price_Column(Dataset):
    """Return the "Price" column of `Dataset` as floats expressed in "tỷ".

    If the column already converts to float it is returned as such.
    Otherwise each value is treated as text: "Giá thỏa thuận" is replaced by
    "0", "." is dropped and "," becomes the decimal point. The text is then
    split on whitespace and interpreted by its unit token:

    * contains "tỷ"    -> the leading number as is
    * contains "triệu" -> the leading number divided by 1000
    * neither unit     -> NaN (this includes the "0" substituted for
      "Giá thỏa thuận", which carries no unit token)
    * unit present but leading token not numeric -> 0
    * missing / non-text value -> NaN

    Parameters
    ----------
    Dataset : pandas.DataFrame with a "Price" column.

    Returns
    -------
    pandas.Series of float. `Dataset` itself is not modified.
    """
    Price = Dataset["Price"]

    # Fast path: column is already numeric (e.g. a previously cleaned CSV).
    try:
        return Price.astype(float)
    except (ValueError, TypeError):
        pass

    # regex=False is explicit so "." is always a literal dot, never a wildcard.
    Normalized = (
        Price.str.replace("Giá thỏa thuận", "0", regex=False)
        .str.replace(".", "", regex=False)
        .str.replace(",", ".", regex=False)
    )

    def Clean_Price_Data(Price_Text):
        # Missing values reach here as NaN/None, which have no .split();
        # report them as NaN instead of crashing the whole column.
        if not isinstance(Price_Text, str):
            return float("nan")
        Parts = Price_Text.split()
        try:
            if "tỷ" in Parts:
                return float(Parts[0])
            if "triệu" in Parts:
                return float(Parts[0]) / 1000
        except ValueError:
            # Unit found but the leading token is not a number.
            return 0.0
        # No recognised unit token.
        return float("nan")

    return Normalized.apply(Clean_Price_Data).astype(float)

In [5]:
def Data_Cleaning():
    """Clean the scraped CSV at Dataset_Path and overwrite it with the result.

    Reads the file, replaces the "Area", "Location" and "Price" columns with
    the output of their respective cleaning functions (in that order), drops
    rows whose "Location" or "Price" is missing, and writes the frame back to
    the same path. Returns nothing.
    """
    House_List = pd.read_csv(Dataset_Path)

    # Column name -> function that takes the whole frame and returns the cleaned column.
    Column_Cleaners = (
        ("Area", Clean_Area_Column),
        ("Location", Clean_Locations_Column),
        ("Price", Clean_Price_Column),
    )
    for Column_Name, Cleaner in Column_Cleaners:
        House_List[Column_Name] = Cleaner(House_List)

    Complete_Rows = House_List.dropna(subset=["Location", "Price"])
    Complete_Rows.to_csv(Dataset_Path, header=True, index=False, encoding="utf-8-sig")
    
# Clean the CSV at Dataset_Path in place (Data_Cleaning rewrites the file and returns nothing).
Data_Cleaning()
# Load the cleaned file written by the call above; House_List is the frame used by later cells.
House_List = pd.read_csv(Dataset_Path)

In [6]:
def House_Price_Statistics():
    """Return a SQLite connection to 'House Information.db', building it if absent.

    When the database file does not exist in the working directory, the CSV
    "Real Property Dataset/Real Property Information.csv" is loaded and
    written to a table named 'House Information' (replacing any table of that
    name, without the DataFrame index). When the file already exists it is
    opened as is; the CSV is not re-read.

    Returns
    -------
    sqlite3.Connection
        An open connection; the caller is responsible for closing it.

    Raises
    ------
    Whatever `pd.read_csv` or `DataFrame.to_sql` raise. If the table cannot
    be written, the partially created database file is removed first so a
    later call rebuilds it instead of opening an empty database.
    """
    Database_File = 'House Information.db'

    if os.path.exists(Database_File):
        return sqlite3.connect(Database_File)

    # Read the CSV before touching the database so a missing/unreadable CSV
    # never leaves an empty .db file behind.
    House_List = pd.read_csv(os.path.join("Real Property Dataset", "Real Property Information.csv"))

    Connection = sqlite3.connect(Database_File)
    try:
        House_List.to_sql('House Information', Connection, if_exists='replace', index=False)
    except Exception:
        # sqlite3.connect has already created the file; without this cleanup
        # the existence check above would skip the build on every later call.
        Connection.close()
        if os.path.exists(Database_File):
            os.remove(Database_File)
        raise

    # This branch runs only when the database was just built.
    print("Database created!")
    return Connection

def Exc(Query):
    """Run a SQL query against the listings database and return the result.

    Parameters
    ----------
    Query : str
        SQL text passed unchanged to `pd.read_sql_query`.

    Returns
    -------
    pandas.DataFrame with the query result.
    """
    Connection = House_Price_Statistics()
    try:
        return pd.read_sql_query(Query, Connection)
    finally:
        # A fresh connection is opened on every call, so close it here;
        # otherwise each query would leave an open handle on the database.
        Connection.close()

In [7]:
# Open the SQLite database (House_Price_Statistics builds it from the CSV when the file is absent).
# NOTE(review): Connection and Cursor are not used by the query below, because Exc
# opens its own connection; presumably they are kept for later cells — confirm.
Connection = House_Price_Statistics()
Cursor = Connection.cursor ()

# Per-district summary: number of listings, mean price and mean area.
Query = """
select Location, count("House Title") as "Number of Houses", avg(Price) as "Average Price", avg("Area") as "Average Area"
from 'House Information'
group BY Location;
"""
# Last expression of the cell, so the resulting DataFrame is rendered as the cell output.
Exc(Query)

Unnamed: 0,Location,Number of Houses,Average Price,Average Area
0,Bình Chánh,165,25.283152,155.842545
1,Bình Thạnh,3076,29.369511,218.844759
2,Bình Tân,1232,27.239943,154.249789
3,Cần Giờ,1,4.0,110.0
4,Củ Chi,301,28.102359,188.24608
5,Gò Vấp,1646,26.719113,145.178426
6,Hóc Môn,78,10.449551,133.836026
7,Nhà Bè,893,29.315733,142.240269
8,Phú Nhuận,716,32.352374,693.339316
9,Quận 1,821,55.243752,152.847223


In [8]:
# Pass the cleaned listings (House_List, re-read from Dataset_Path after Data_Cleaning)
# to the project-local House_Price_Prediction module; the return value is shown as the cell output.
# NOTE(review): Model_Predict is defined outside this notebook; judging by the rendered
# output it presumably returns an actual-vs-predicted price table — confirm in the module.
House_Price_Prediction.Model_Predict(House_List)

Unnamed: 0,Location,Area,Bedrooms,Bathrooms,Actual,Predicted,Trend Prediction,Trend Actual
0,Thủ Đức,80.0,3,3,29.50,23.729064,Decrease,Decrease
1,Phú Nhuận,48.0,2,2,10.90,25.051691,Increase,Decrease
2,Củ Chi,142.0,7,4,30.00,23.145010,Decrease,Increase
3,Quận 9,57.5,4,4,5.40,30.229637,Increase,Decrease
4,Quận 9,40.1,4,4,3.15,30.200223,Decrease,Decrease
...,...,...,...,...,...,...,...,...
4707,Gò Vấp,70.0,2,3,14.70,26.309776,Decrease,Increase
4708,Bình Thạnh,51.0,2,2,6.00,26.187523,Decrease,Decrease
4709,Quận 7,85.0,2,4,3.37,30.948313,Increase,Decrease
4710,Bình Thạnh,360.0,2,4,105.00,28.020907,Decrease,Increase
