## Scrape Lamudi.ph website

### Import modules

In [1]:
import requests
from bs4 import BeautifulSoup
import importlib
import pandas as pd
import time

# to import path from src
import sys
sys.path.append('../src')

# custom modules
from utils import get_header, write_file, read_file

# reload modules
importlib.reload(sys.modules['utils'])

<module 'utils' from '/Users/nadine/Documents/Spiced_Academy/github/ph-property-price-prediction/notebooks/../src/utils.py'>

### Set URL and filename

In [4]:
# Search-results URL for houses offered for sale on Lamudi Philippines
URL_house = 'https://www.lamudi.com.ph/house/buy/'
# File name (inside ../data/html/) used for the saved house pages
filename_house = 'lamudi_house.html'

### Define Functions

In [17]:
def get_span_data(listing, class_name):
    """Return the text that directly follows an icon span in a listing.

    :param listing: a listing element from the search results
    :param class_name: the class name of the span element to look up
    :return: the node following the span, stripped of surrounding
        whitespace; "0" when any step of the lookup raises AttributeError
        (for example because the span is not present)
    :rtype: string
    """
    try:
        icon_span = listing.find("span", class_=class_name)
        trailing_text = icon_span.next_sibling
        value = trailing_text.strip()
    except AttributeError:
        # Missing span or missing sibling: report the field as "0"
        value = "0"
    return value
    

def get_listings(soup):
    """Extract every property listing found in a parsed search-results page.

    :param soup: BeautifulSoup object holding the search-results HTML
    :return: one dictionary per listing with the keys ``Category``,
        ``Title``, ``Price``, ``Location``, ``Region``, ``Bedrooms``,
        ``Floor Area``, ``Lot Area``, ``URL``, ``Geo Point`` and
        ``Image Link``; all values are the raw strings found in the page
    :rtype: list of dict
    """
    listings = []

    # Each search result is wrapped in one of these row containers
    listing_cells = soup.find_all(
        'div', class_='row ListingCell-row ListingCell-agent-redesign')

    for listing in listing_cells:
        # The thumbnail is looked up once and reused for region and image link
        img_tag = listing.find('img')

        # The "alt" text ends with the region,
        # e.g. "... For Sale in Talon Dos, Las Piñas, Metro Manila"
        region = img_tag['alt'].split(', ')[-1]

        # Category and geo point are stored as data attributes on nested divs
        category = listing.find('div', {'data-category': True})['data-category']
        data_geo_point = listing.find('div', {'data-geo-point': True})['data-geo-point']

        title = listing.find('h2', class_='ListingCell-KeyInfo-title').text.strip()
        address = listing.find('span', class_='ListingCell-KeyInfo-address-text').text.strip()
        href = listing.find('a', class_='js-listing-link')['href']

        # Some listings do not have bedrooms, floor area or lot area;
        # get_span_data falls back to "0" for those
        bedrooms = get_span_data(listing, 'icon-bedrooms')
        floor_area = get_span_data(listing, 'icon-livingsize')
        lot_area = get_span_data(listing, 'icon-land_size')

        # Prefer "data-src" when the attribute exists, otherwise use "src"
        try:
            img_link = img_tag['data-src']
        except KeyError:
            img_link = img_tag['src']

        # Some listings do not have a price
        try:
            price = listing.find('span', class_='PriceSection-FirstPrice').text.strip()
        except AttributeError:
            price = '0'

        listings.append({'Category': category,
                         'Title': title,
                         'Price': price,
                         'Location': address,
                         'Region': region,
                         'Bedrooms': bedrooms,
                         'Floor Area': floor_area,
                         'Lot Area': lot_area,
                         'URL': href,
                         'Geo Point': data_geo_point,
                         'Image Link': img_link,
                         })

    return listings

# Break a listing's location text into its barangay and town/city parts
def split_location(location):
    '''
    Separate a ", "-delimited location into barangay and town/city
    :param location: Location string, e.g. "Talon Dos, Las Piñas"
    :return: (barangay, town/city); barangay is None when only one part is
        given, and both are None when there are more than two parts
    :rtype: tuple
    '''
    pieces = location.split(', ')
    piece_count = len(pieces)

    if piece_count == 1:
        # Only a town/city was given, there is no barangay
        return None, pieces[0]

    if piece_count == 2:
        barangay, town_city = pieces
        return barangay, town_city

    # Any other shape cannot be mapped onto the two fields
    return None, None
    
def pre_process_data(df_listings):
    '''
    Clean the scraped listing columns and derive location/coordinate columns.

    The passed frame is modified in place and also returned.
    :param df_listings: DataFrame of listings with string columns 'Price',
        'Floor Area', 'Lot Area', 'Location' and 'Geo Point'
    :return: the same DataFrame with integer 'Price', 'Floor Area' and
        'Lot Area', 'Location' replaced by 'Barangay' and 'Town/City', and
        'Geo Point' replaced by 'Longitude' and 'Latitude' (left as strings)
    :rtype: DataFrame
    '''
    def to_int(raw_text, marker):
        # Drop the currency/unit marker and the thousands separators
        return int(raw_text.replace(marker, '').replace(',', ''))

    # Price carries the peso sign; both area columns carry the "m²" unit
    numeric_columns = (('Price', '₱'), ('Floor Area', 'm²'), ('Lot Area', 'm²'))
    for column, marker in numeric_columns:
        df_listings[column] = df_listings[column].apply(to_int, args=(marker,))

    # Location -> Barangay + Town/City, then discard the combined column
    location_parts = df_listings['Location'].apply(split_location)
    df_listings[['Barangay', 'Town/City']] = location_parts.apply(pd.Series)
    del df_listings['Location']

    # Geo Point "[a,b]" -> Longitude (first value) + Latitude (second value)
    coordinates = df_listings['Geo Point'].apply(
        lambda geo_point: geo_point.replace('[', '').replace(']', '').split(','))
    df_listings[['Longitude', 'Latitude']] = coordinates.apply(pd.Series)
    del df_listings['Geo Point']

    return df_listings

def get_last_page(soup):
    '''
    Read the number of the last result page from the pagination dropdown
    :param soup: BeautifulSoup object of a search-results page
    :return: Value of the dropdown's data-pagination-end attribute
    :rtype: int
    '''
    # The <select> used for paging stores the final page number as an attribute
    dropdown = soup.find('select', class_='js-pagination-dropdown')
    return int(dropdown['data-pagination-end'])


In [49]:
def scrape_pages(url, delay=1, timeout=30):
    """
    Downloads every result page of a paginated search and returns their HTML.

    Args:
        url (str): The url of the search-results page to scrape.
        delay (float): Seconds to pause between consecutive page requests.
        timeout (float): Seconds to wait for each HTTP response before
            giving up.

    Returns:
        list: The HTML text of each page, in page order.

    Raises:
        requests.HTTPError: If a request returns an HTTP error status.
        requests.Timeout: If a response does not arrive within ``timeout``.
    """
    # Fetch the landing page only to find out how many pages exist
    response = requests.get(url, headers=get_header(), timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    last_page = get_last_page(soup)

    pages = []
    for page_number in range(1, last_page + 1):
        # Let requests build the query string so existing parameters survive
        page = requests.get(url, params={'page': page_number},
                            headers=get_header(), timeout=timeout)
        # Fail loudly instead of storing an error page as listing data
        page.raise_for_status()
        pages.append(page.text)

        # Be polite to the server, but do not wait after the final page
        if page_number < last_page:
            time.sleep(delay)
    return pages

### Scrape pages
- Get the number of pages
- Loop through each page and append it to a list
- Save the scraped pages to an HTML file

In [50]:
# scrape pages for house data
# house_pages = scrape_pages(URL_house)

# convert list to string
# house_pages = ''.join(house_pages)

# save to html file
# write_file(house_pages, f'../data/html/{filename_house}')

### Read the HTML file and Pre-process data
The following information is extracted from the HTML file:
- *Category*: this can be land or house
- *Title*: Title of the listing
- *Price*
- *Bedrooms*
- *Floor area*
- *Lot area*
- *URL*
- *Location*: this is split into Barangay (roughly equivalent to a district) and Town/City
- *Geo Points*: this is split into Latitude and Longitude

**Pre-processing of data**
- *Price*: Remove the peso symbol (₱) and the thousands separators, then convert into an integer
- *Floor area*: Remove the m² unit and convert into an integer
- *Lot area*: Remove the m² unit and convert into an integer
- *Location*: split into two new columns - Barangay, Town/City
- *Geo Points*: split into two new columns - Longitude, Latitude


In [15]:
# read the previously saved HTML of the scraped house pages
html_file = read_file(f"../data/html/{filename_house}")

# parse the HTML into a BeautifulSoup object
soup = BeautifulSoup(html_file, "html.parser")

# extract the listings (a list of dictionaries) and load them into a dataframe
dict_listings = get_listings(soup)
df_listings = pd.DataFrame(dict_listings)

# pre_process_data modifies the frame it receives, so pass a copy
df_listings = pre_process_data(df_listings.copy())

# preview the first rows of the pre-processed data
df_listings.head()

Brand New Single House And Lot In BF Resort With Roof Deck And CCTV Included
<img alt="Single-family House For Sale in Talon Dos, Las Piñas, Metro Manila" height="230" src="https://static-ph.lamudi.com/static/media/bm9uZS9ub25l/2x2x2x360x230/11b6f0df3f7675.webp" style="opacity:1" width="360">
</img>
2 Storey Spacious Townhouse For Sale in BF Resort Village, Las Piñas City
<img alt="Townhouse For Sale in Talon Dos, Las Piñas, Metro Manila" height="230" src="https://static-ph.lamudi.com/static/media/bm9uZS9ub25l/2x2x2x360x230/95c5bca8252bdd.webp" style="opacity:1" width="360"/>
Reasonable price brand new single attached house in Pilar Village Las Pinas
<img alt="Single-family House For Sale in Pilar, Las Piñas, Metro Manila" data-src="https://static-ph.lamudi.com/static/media/bm9uZS9ub25l/2x2x2x360x230/818f851b3ec41f.webp" height="230" src="https://asset-ph.lamudi.com/img/placeholder-image.svg" width="360"/>
Spectacular PRIME LOCATION Modern 3 Storey house and lot For Sale in Pilar Villa

Unnamed: 0,Category,Title,Price,Region,Bedrooms,Floor Area,Lot Area,URL,Image Link,Barangay,Town/City,Longitude,Latitude
0,house,Brand New Single House And Lot In BF Resort Wi...,11800000,Metro Manila,4,222,96,https://www.lamudi.com.ph/brand-new-single-hou...,https://static-ph.lamudi.com/static/media/bm9u...,Talon Dos,Las Piñas,120.985214,14.438526
1,house,2 Storey Spacious Townhouse For Sale in BF Res...,8800000,Metro Manila,4,189,120,https://www.lamudi.com.ph/2-storey-spacious-to...,https://static-ph.lamudi.com/static/media/bm9u...,Talon Dos,Las Piñas,120.99425,14.43278
2,house,Reasonable price brand new single attached hou...,7200000,Metro Manila,4,128,120,https://www.lamudi.com.ph/reasonable-price-bra...,https://static-ph.lamudi.com/static/media/bm9u...,Pilar,Las Piñas,121.00894,14.42465
3,house,Spectacular PRIME LOCATION Modern 3 Storey hou...,10758000,Metro Manila,4,216,105,https://www.lamudi.com.ph/spectacular-prime-lo...,https://static-ph.lamudi.com/static/media/bm9u...,Pilar,Las Piñas,121.0080137,14.417168
4,house,Brand New House And Lot In Bf Resort With Mode...,9650000,Metro Manila,4,120,85,https://www.lamudi.com.ph/brand-new-house-and-...,https://static-ph.lamudi.com/static/media/bm9u...,Talon Dos,Las Piñas,120.985214,14.438526


In [16]:
# show the (rows, columns) dimensions of the pre-processed listings
df_listings.shape

(2670, 13)

In [18]:
# preview the first rows of the pre-processed listings
df_listings.head()

Unnamed: 0,Category,Title,Price,Region,Bedrooms,Floor Area,Lot Area,URL,Image Link,Barangay,Town/City,Longitude,Latitude
0,house,Brand New Single House And Lot In BF Resort Wi...,11800000,Metro Manila,4,222,96,https://www.lamudi.com.ph/brand-new-single-hou...,https://static-ph.lamudi.com/static/media/bm9u...,Talon Dos,Las Piñas,120.985214,14.438526
1,house,2 Storey Spacious Townhouse For Sale in BF Res...,8800000,Metro Manila,4,189,120,https://www.lamudi.com.ph/2-storey-spacious-to...,https://static-ph.lamudi.com/static/media/bm9u...,Talon Dos,Las Piñas,120.99425,14.43278
2,house,Reasonable price brand new single attached hou...,7200000,Metro Manila,4,128,120,https://www.lamudi.com.ph/reasonable-price-bra...,https://static-ph.lamudi.com/static/media/bm9u...,Pilar,Las Piñas,121.00894,14.42465
3,house,Spectacular PRIME LOCATION Modern 3 Storey hou...,10758000,Metro Manila,4,216,105,https://www.lamudi.com.ph/spectacular-prime-lo...,https://static-ph.lamudi.com/static/media/bm9u...,Pilar,Las Piñas,121.0080137,14.417168
4,house,Brand New House And Lot In Bf Resort With Mode...,9650000,Metro Manila,4,120,85,https://www.lamudi.com.ph/brand-new-house-and-...,https://static-ph.lamudi.com/static/media/bm9u...,Talon Dos,Las Piñas,120.985214,14.438526


### Export to CSV file the preprocessed data

In [19]:
# Export the pre-processed listings to a CSV file; index=False leaves the
# dataframe's row index out of the file
df_listings.to_csv('../data/csv/lamudi_house_region.csv', index=False)