## Multiple Page Scraping

In [1]:
import requests
import pandas as pd
import re

from bs4 import BeautifulSoup
pd.set_option('display.max_columns',None)

#global dataframe (assembled once, after all pages have been scraped)
LISTING_CARD_CLASS = 'item-wrap item-wrap-v1 item-wrap-no-frame h-100'
REQUEST_TIMEOUT = 30  #seconds; requests.get() has no timeout by default and can hang forever


def _class_text(card, tag, css_class):
    """Return the text of the first `tag` with class `css_class` inside `card`, or '' if absent."""
    node = card.find(tag, attrs={'class': css_class})
    return node.text if node is not None else ''


def _card_link(card):
    """Return the href of the card's `target="_self"` anchor, or '' when there is none."""
    anchor = card.find('a', attrs={'target': '_self'})
    if anchor is None:
        return ''
    return anchor.get('href', '')


def _guest_count(link):
    """Fetch a property page and return its guest-number text.

    Returns 'Invalid link' for an empty link and '' when the page cannot be
    fetched or does not contain the guest-number element.
    """
    if not link:
        return 'Invalid link'
    try:
        detail_response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        #one unreachable property page should not abort the whole scrape
        print(f'Could not fetch {link}: {exc}')
        return ''
    detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
    guest = detail_soup.find('li', attrs={'class': 'guest-number'})
    span = guest.find('span') if guest is not None else None
    return span.text.strip() if span is not None else ''


page_frames = []

for page in range(1, 7):
    print(f'Processing page {page}')
    if page == 1:
        url = 'https://www.bukitvista.com/search-results?location%5B%5D=&areas%5B%5D=&bedrooms='
    else:
        url = f'https://www.bukitvista.com/search-results/page/{page}?location%5B0%5D&bedrooms'
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')

    #one container per listing card on the search-results page
    containers = soup.find_all('div', attrs={'class': LISTING_CARD_CLASS})

    #links are collected first because the guest count lives on each property's own page
    links = [_card_link(card) for card in containers]

    #combine all data for this page in a dataframe; missing fields become ''
    df = pd.DataFrame({
        'VillaName': [_class_text(card, 'h2', 'item-title') for card in containers],
        'Address': [_class_text(card, 'address', 'item-address') for card in containers],
        'Prices': [_class_text(card, 'li', 'item-price item-price-text') for card in containers],
        'Bedrooms': [_class_text(card, 'li', 'h-beds') for card in containers],
        'Bathrooms': [_class_text(card, 'li', 'h-baths') for card in containers],
        'GuestNo': [_guest_count(link) for link in links],
        'PropertyType': [_class_text(card, 'li', 'h-type') for card in containers],
        'URL': links
    })

    page_frames.append(df)
    print(f'End of page {page}')

#concatenate once instead of growing a dataframe inside the loop;
#the per-page index is kept, so rows are renumbered later with reset_index
data_combined = pd.concat(page_frames)

Processing page 1
End of page 1
Processing page 2
End of page 2
Processing page 3
End of page 3
Processing page 4
End of page 4
Processing page 5
End of page 5
Processing page 6
End of page 6


## Data Cleaning

In [95]:
#work on a copy so that re-running cleaning cells can never modify the raw scraped frame
bv_data = data_combined.copy()

In [96]:
#drop properties located in Yogyakarta (every other row is kept)
substring = 'Yogyakarta'
#regex=False: match the literal text; na=False: rows with a missing address are kept, not an error
is_yogyakarta = bv_data['Address'].str.contains(substring, regex=False, na=False)
bv_data = bv_data[~is_yogyakarta]

In [97]:
#renumber rows 0..n-1 (the old index is discarded), then summarise columns, non-null counts and dtypes
bv_data = bv_data.reset_index(drop = True)
bv_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   VillaName     46 non-null     object
 1   Address       46 non-null     object
 2   Prices        46 non-null     object
 3   Bedrooms      46 non-null     object
 4   Bathrooms     46 non-null     object
 5   GuestNo       46 non-null     object
 6   PropertyType  46 non-null     object
 7   URL           46 non-null     object
dtypes: object(8)
memory usage: 3.0+ KB


In [5]:
#preview the first 10 rows
bv_data.head(10)

Unnamed: 0,VillaName,Address,Prices,Bedrooms,Bathrooms,GuestNo,PropertyType,URL
0,\nUluwatu Modern Boho Villa Near Nyang Nyang B...,"Jl. Batu Nunggul No.1, Pecatu, Kec. Kuta Sel.,...",$258 per 2 nights,Beds: 2,Baths: 2,,"Amazing pool, Island life, Pool view, Surfing,...",https://www.bukitvista.com/property/uluwatu-mo...
1,\nBingin Beach Hideaway: Group Villa with Pool...,"Jl. Pantai Cemongkak Gg. Samuh Sari No.2, Peca...",Starting from USD 161 per night,Beds: 3,Baths: 3,6.0,"Amazing pool, Island life, Pool view, Surfing,...",https://www.bukitvista.com/property/bingin-bea...
2,\n4-Bedroom Mediterranean Luxury Villa with Ub...,"Jl. Suweta, Ubud, Kecamatan Ubud, Kabupaten Gi...",USD 202 / Night,Beds: 4,Baths: 4,8.0,"Amazing pool, Amazing View, Jungle View, Pool ...",https://www.bukitvista.com/property/mediterran...
3,\nLuxurious 3-Bedroom Nusa Dua Seafront Villa ...,,Starting from USD 715 per 2 nights,Beds: 3,Baths: 4,6.0,"Beachfront, Villa",https://www.bukitvista.com/property/nusa-dua-s...
4,\nGrand Villa Retreat w/ Pool & Garden in Unga...,"Jl. Pantai Balangan I No.9x, Ungasan, Kec. Kut...",Starting from USD 84 per night,Beds: 2,Baths: 2,4.0,"Amazing pool, Golfing, Pool view, Villa",https://www.bukitvista.com/property/grand-vill...
5,\nSurfer’s Villa 4 Mins to Bingin & Dreamland ...,"Pecatu, Kuta Selatan, Badung, Bali, Nusa Tengg...",Starting from USD 118 per night,Beds: 2,Baths: 2.5,4.0,"Pool view, Villa",https://www.bukitvista.com/property/surfers-vi...
6,\nPrivate Pool Villa Minutes from Bingin Surf ...,"Pecatu, Kuta Selatan, Badung, Bali, Nusa Tengg...",Starting from USD 165 per night,Beds: 2,Baths: 2,4.0,"Pool view, Villa",https://www.bukitvista.com/property/private-po...
7,\nUngasan Exquisite Villa w/ Rooftop & Private...,"Ungasan, Kuta Selatan, Badung, Bali, Nusa Teng...",Starting from USD 167 per night,Beds: 3,Baths: 3.5,6.0,"Amazing pool, Golfing, Surfing, Tropical, Villa",https://www.bukitvista.com/property/ungasan-ex...
8,\nSun-Soaked Canggu Villa Perfect for Families\n,"Tibubeneng, Kuta Utara, Badung, Bali, Nusa Ten...",Starting from USD 100 per night,Beds: 2,Baths: 2,4.0,Villa,https://www.bukitvista.com/property/sun-soaked...
9,\nSunny Exquisite Umalas Villa: 20 Minutes to ...,,Starting from USD 108 per night,Beds: 2,Baths: 2,4.0,"Guest House, Villa",https://www.bukitvista.com/property/sunny-exqu...


In [98]:
#clean villa name: remove leading/trailing whitespace (the scraped titles are wrapped in newlines)
bv_data['VillaName']= bv_data['VillaName'].str.strip()

In [99]:
#clean address to form a new area column
#order matters: extract_location returns the first entry of this list found in the text
target_locations =['Canggu', 'Umalas', 'Seminyak', 'Legian', 'Uluwatu', 'Jimbaran', 'Pecatu', 'Kutuh',
                   'Ubud', 'Sanur', 'Nusa Dua', 'Ungasan', 'Lembongan', 'Nusapenida', 'Denpasar']

def extract_location(row, locations):
    """Return the first entry of `locations` mentioned in the row's name or address.

    The row's 'VillaName' and 'Address' values are converted to text, joined
    with a space and searched case-insensitively. Candidates are tried in list
    order, so earlier entries win. Returns 'Others' when nothing matches.
    """
    haystack = ' '.join((str(row['VillaName']), str(row['Address']))).lower()
    return next(
        (candidate for candidate in locations if candidate.lower() in haystack),
        'Others',
    )

In [100]:
#derive the Area column row by row; the candidate list is forwarded as an extra positional argument
bv_data['Area'] = bv_data.apply(extract_location, axis=1, args=(target_locations,))

In [40]:
#spot-check the derived Area against the name and address it was taken from
bv_data[['VillaName', 'Address', 'Area']].head()

Unnamed: 0,VillaName,Address,Area
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,"Jl. Batu Nunggul No.1, Pecatu, Kec. Kuta Sel.,...",Uluwatu
1,Bingin Beach Hideaway: Group Villa with Pool &...,"Jl. Pantai Cemongkak Gg. Samuh Sari No.2, Peca...",Pecatu
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,"Jl. Suweta, Ubud, Kecamatan Ubud, Kabupaten Gi...",Ubud
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,,Nusa Dua
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,"Jl. Pantai Balangan I No.9x, Ungasan, Kec. Kut...",Ungasan


In [101]:
#clean price column
#create functions to search for currency, price, and unit if they exist
def extract_currency (text):
  """Return the first currency marker found in `text`, or None.

  Recognised markers are 'USD', '$', 'Rupiah' and 'Rp'. Any non-string input
  (None, NaN from a missing cell, numbers) yields None instead of raising.
  """
  if not isinstance(text, str):
    return None
  match = re.search(r'(USD|\$|Rupiah|Rp)', text)
  return match.group(0) if match else None

def extract_price (text):
  """Return the integer amount that follows 'USD', '$' or 'Rp' in `text`, or None.

  Every '.' and ',' in the amount is treated as a thousands separator and
  removed, so a decimal amount such as '99.50' is read as 9950.
  Returns None for non-string input, when no currency marker followed by an
  amount is found, or when the amount contains no digits.
  """
  if not isinstance(text, str):
    return None
  match = re.search(r'(USD|\$|Rp)\s?([\d.,]+)', text)
  if match is None:
    return None
  digits = match.group(2).replace('.', '').replace(',', '')
  #the amount group may consist only of separators (e.g. 'USD ...'); int('') would raise
  return int(digits) if digits else None

def extract_unit (text):
  """Return the billing unit that follows the word 'per' or a '/' in `text`, or None.

  'per' must be a whole word, so words that merely contain it (e.g. 'Super')
  are not mistaken for the separator. The unit is returned without
  surrounding whitespace. Non-string input, no separator, or an empty unit
  all yield None.
  """
  if not isinstance(text, str):
    return None
  match = re.search(r'(?:\bper\b|/)\s*(.+)', text)
  if match is None:
    return None
  unit = match.group(1).strip()
  return unit or None

In [102]:
#extract currency, price, and unit
bv_data['Currency'] = bv_data['Prices'].apply(extract_currency)
bv_data['PriceValue'] = bv_data['Prices'].apply(extract_price)
bv_data['Unit'] = bv_data['Prices'].apply(extract_unit)

#drop redundant columns
bv_data = bv_data.drop(columns = ['Prices'])

#standardize price values
exchange_rate = 16000  #assumed IDR per 1 USD; check the latest rate before re-running

#convert currencies
#cast to float first: the converted prices are fractional and the column may hold whole numbers only
bv_data['PriceValue'] = bv_data['PriceValue'].astype(float)
#extract_currency can return either 'Rp' or 'Rupiah' for Indonesian rupiah
is_idr = bv_data['Currency'].isin(['Rp', 'Rupiah'])
bv_data.loc[is_idr, 'PriceValue'] = bv_data.loc[is_idr, 'PriceValue'] / exchange_rate  #convert IDR to USD
bv_data['Currency'] = bv_data['Currency'].replace({'$': 'USD', 'Rp': 'USD', 'Rupiah': 'USD'})

#convert unit
def convert_unit(row):
    """Return the row's price expressed per night.

    Reads row['Unit'] and row['PriceValue']. The unit is compared after
    trimming whitespace and lower-casing:
      * '<N> night' / '<N> nights' (N > 0): price divided by N
      * 'month': price divided by 30 (approximation)
      * anything else, or a non-string unit: price returned unchanged
    """
    price = row['PriceValue']
    unit = row['Unit']
    if not isinstance(unit, str):
        return price  #no unit was extracted; nothing to convert

    unit = unit.strip().lower()
    multi_night = re.fullmatch(r'(\d+)\s*nights?', unit)
    if multi_night:
        nights = int(multi_night.group(1))
        if nights > 0:
            return price / nights  #convert a multi-night price to a per-night rate
    if unit == 'month':
        return price / 30  #approximate per-night cost
    return price  #keep all other values the same

#rewrite every price as a per-night value
bv_data['PriceValue'] = bv_data.apply(convert_unit, axis=1)
#NOTE: the unit is overwritten with 'night' for every row, including rows whose original unit convert_unit left unchanged
bv_data['Unit'] = 'night'

In [103]:
#get number of bedrooms (text looks like 'Beds: 2')
#to_numeric with errors='coerce' turns a missing or malformed count into NaN instead of raising;
#when every listing has a whole-number count the column is still an integer column
bv_data['Bedrooms'] = pd.to_numeric(bv_data['Bedrooms'].str.split(': ').str[1], errors='coerce')

#get number of bathrooms (text looks like 'Baths: 2.5', so this stays a float column)
bv_data['Bathrooms'] = pd.to_numeric(bv_data['Bathrooms'].str.split(': ').str[1], errors='coerce').astype(float)

#fill missing guest numbers
#non-numeric entries ('' or 'Invalid link') become NaN and are then estimated as two guests per bedroom
bv_data['GuestNo'] = pd.to_numeric(bv_data['GuestNo'], errors = 'coerce')
bv_data['GuestNo'] = bv_data['GuestNo'].fillna(bv_data['Bedrooms'] * 2)

In [104]:
#preview the first rows after the price, room and guest columns were cleaned
bv_data.head()

Unnamed: 0,VillaName,Address,Bedrooms,Bathrooms,GuestNo,PropertyType,URL,Area,Currency,PriceValue,Unit
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,"Jl. Batu Nunggul No.1, Pecatu, Kec. Kuta Sel.,...",2,2.0,4.0,"Amazing pool, Island life, Pool view, Surfing,...",https://www.bukitvista.com/property/uluwatu-mo...,Uluwatu,USD,129.0,night
1,Bingin Beach Hideaway: Group Villa with Pool &...,"Jl. Pantai Cemongkak Gg. Samuh Sari No.2, Peca...",3,3.0,6.0,"Amazing pool, Island life, Pool view, Surfing,...",https://www.bukitvista.com/property/bingin-bea...,Pecatu,USD,161.0,night
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,"Jl. Suweta, Ubud, Kecamatan Ubud, Kabupaten Gi...",4,4.0,8.0,"Amazing pool, Amazing View, Jungle View, Pool ...",https://www.bukitvista.com/property/mediterran...,Ubud,USD,202.0,night
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,,3,4.0,6.0,"Beachfront, Villa",https://www.bukitvista.com/property/nusa-dua-s...,Nusa Dua,USD,357.5,night
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,"Jl. Pantai Balangan I No.9x, Ungasan, Kec. Kut...",2,2.0,4.0,"Amazing pool, Golfing, Pool view, Villa",https://www.bukitvista.com/property/grand-vill...,Ungasan,USD,84.0,night


In [105]:
#list the distinct raw PropertyType strings to see which labels need separating
bv_data['PropertyType'].unique()

array(['Amazing pool, Island life, Pool view, Surfing, Tropical, Villa',
       'Amazing pool, Amazing View, Jungle View, Pool view, Tropical, Villa',
       'Beachfront, Villa', 'Amazing pool, Golfing, Pool view, Villa',
       'Pool view, Villa',
       'Amazing pool, Golfing, Surfing, Tropical, Villa', 'Villa',
       'Guest House, Villa', 'Guest House', 'Guest House, Ocean view',
       'Amazing View, Jungle View, Residential, Tropical, View, Villa',
       'Rice paddy view, Villa', 'View', 'Residential, Villa',
       'View, Villa', 'Guest House, Residential',
       'Beachfront, Guest House, Residential',
       'Beachfront, Residential, Villa', 'Villa, Residential', '',
       'Jungle View, Villa'], dtype=object)

In [106]:
#labels that count as a property type; every other label is treated as a feature
property_types = {'Villa', 'Guest House', 'Residential'}

#extract property type and features
def extract_types_and_features(property_type):
    """Split a comma-separated label string into (types, features).

    Each label is stripped of whitespace and empty labels are dropped. Labels
    present in `property_types` go to the first list, all others to the
    second; original order is preserved in both. When no type label is found
    the first list is ['Unknown']. Non-string input (None, NaN) is treated
    as having no labels and yields (['Unknown'], []).
    """
    if not isinstance(property_type, str):
        return ['Unknown'], []

    extracted_types, features = [], []
    for raw_label in property_type.split(","):
        label = raw_label.strip()
        if not label:
            continue
        if label in property_types:
            extracted_types.append(label)
        else:
            features.append(label)

    return (extracted_types or ['Unknown']), features

#parse each label string once, then unpack the (types, features) pair into its two columns
parsed_labels = bv_data['PropertyType'].map(extract_types_and_features)
bv_data['PropertyType'] = parsed_labels.map(lambda pair: pair[0])
bv_data['Features'] = parsed_labels.map(lambda pair: pair[1])

In [107]:
#preview the first rows with PropertyType and Features split into lists
bv_data.head()

Unnamed: 0,VillaName,Address,Bedrooms,Bathrooms,GuestNo,PropertyType,URL,Area,Currency,PriceValue,Unit,Features
0,Uluwatu Modern Boho Villa Near Nyang Nyang Beach,"Jl. Batu Nunggul No.1, Pecatu, Kec. Kuta Sel.,...",2,2.0,4.0,[Villa],https://www.bukitvista.com/property/uluwatu-mo...,Uluwatu,USD,129.0,night,"[Amazing pool, Island life, Pool view, Surfing..."
1,Bingin Beach Hideaway: Group Villa with Pool &...,"Jl. Pantai Cemongkak Gg. Samuh Sari No.2, Peca...",3,3.0,6.0,[Villa],https://www.bukitvista.com/property/bingin-bea...,Pecatu,USD,161.0,night,"[Amazing pool, Island life, Pool view, Surfing..."
2,4-Bedroom Mediterranean Luxury Villa with Ubud...,"Jl. Suweta, Ubud, Kecamatan Ubud, Kabupaten Gi...",4,4.0,8.0,[Villa],https://www.bukitvista.com/property/mediterran...,Ubud,USD,202.0,night,"[Amazing pool, Amazing View, Jungle View, Pool..."
3,Luxurious 3-Bedroom Nusa Dua Seafront Villa w/...,,3,4.0,6.0,[Villa],https://www.bukitvista.com/property/nusa-dua-s...,Nusa Dua,USD,357.5,night,[Beachfront]
4,Grand Villa Retreat w/ Pool & Garden in Ungasan,"Jl. Pantai Balangan I No.9x, Ungasan, Kec. Kut...",2,2.0,4.0,[Villa],https://www.bukitvista.com/property/grand-vill...,Ungasan,USD,84.0,night,"[Amazing pool, Golfing, Pool view]"


In [108]:
#write the cleaned data to disk; with the default arguments the row index is written as the first, unnamed column
bv_data.to_csv('BukitVistaScrape.csv')