In [None]:
# Set the knitr chunk default so that each chunk's source code is echoed in the output.
knitr::opts_chunk$set(echo = TRUE)


## Question 1.
#### Suppose we have a matrix of 1s and 0s. We want to create a vector as follows: for each row of the matrix, the corresponding element of the vector will be either 1 or 0, depending on whether the majority of the first c elements in the row is 1 or 0. Here c will be a parameter which we want to control. Create a function to perform this task.



In [None]:
# Majority vote over the first `c` columns of every row of a 0/1 matrix.
#
# Args:
#   m: a matrix whose entries are all 0 or 1 (missing values are rejected).
#   c: how many leading columns to inspect; a whole number between 1 and
#      ncol(m).
#
# Returns:
#   A numeric vector with one element per row of `m`: 1 when at least half of
#   the first `c` entries of that row are 1 (an exact tie therefore counts as
#   1), otherwise 0. A matrix with zero rows yields numeric(0).
#   Invalid arguments yield a character string describing the problem instead
#   of a vector.
rowsWithMajorityOnes <- function(m, c) {

  if (!is.matrix(m)) {
    return("Error. First parameter must be a matrix.")
  }
  # %in% never returns NA, so a missing entry is reported as an invalid value
  # instead of making the `if` condition itself NA.
  if (!all(m %in% 0:1)) {
    return("Error. Matrix can only contain values of 0 or 1.")
  }
  # Without this guard c = 0 would expand to columns 1:0 and silently vote on
  # column 1 alone.
  if (!is.numeric(c) || length(c) != 1 || is.na(c) || c < 1 || c != floor(c)) {
    return("Error. Second parameter c must be a positive whole number.")
  }
  if (c > ncol(m)) {
    return("Error. Second parameter c must not be greater than number of columns in matrix.")
  }

  # drop = FALSE keeps a matrix even when c == 1 or m has a single row, so
  # rowSums always sees two dimensions.
  ones_per_row <- rowSums(m[, seq_len(c), drop = FALSE])
  as.numeric(ones_per_row >= c / 2)
}


The following is an example to test the rowsWithMajorityOnes function.


In [None]:
# Fill a 5 x 7 matrix with independent random draws from {0, 1}, one draw per
# cell in column-major order.
m <- matrix(
  vapply(seq_len(5 * 7), function(cell) sample(c(0, 1), 1), numeric(1)),
  nrow = 5,
  ncol = 7
)
c <- 5

# Display the matrix, then the per-row vote over its first `c` columns.
m
rowsWithMajorityOnes(m, c)


## Question 2.
#### Create a script to crawl all property data from SRX.com.sg.



In [1]:
library(magrittr)
library(dplyr)
library(rvest)

# Base URL of the SRX site; the search-page and listing paths are appended to it.
host_url = 'https://www.srx.com.sg'


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: xml2


In [2]:
# Download one page of the residential sale search results and return the
# nodes matching ".listingDetailTitle". When the page has no such nodes the
# character vector c('Done') is returned instead.
fetch_property_nodes <- function(page) {
    page_url <- paste0(host_url, "/search/sale/residential?page=", page)
    title_nodes <- html_nodes(read_html(page_url), ".listingDetailTitle")
    if (length(title_nodes) > 0) {
        return(title_nodes)
    }
    c('Done')
}

In [3]:
# Fault-tolerant wrapper around fetch_property_nodes().
#
# Args:
#   n: page number to fetch.
#
# Returns:
#   Whatever fetch_property_nodes(n) returns. If the fetch raises an error the
#   failure is printed and c('Done') is returned so the caller stops crawling.
#   Warnings are printed and muffled; they do not discard the fetched result.
trycatch_fetch_property_notes = function(n) {
    withCallingHandlers(
        tryCatch(
            fetch_property_nodes(n),
            # The handler argument must not be called `n`, otherwise it hides
            # the page number used in the log line.
            error = function(e) {
                print(paste0("Error in crawling the ", n, "th page of SRX: ", conditionMessage(e)))
                c('Done')
            }
        ),
        # A calling handler lets the fetch resume after the warning is logged,
        # instead of replacing its result with the log text.
        warning = function(w) {
            print(paste0("Warning in crawling the ", n, "th page of SRX: ", conditionMessage(w)))
            invokeRestart("muffleWarning")
        }
    )
}

In [39]:
# Build a single-row data frame describing one listing.
#
# Args:
#   listing_info_node: node(s) whose children alternate label, value, label,
#       value, ...; the last character of each label text is dropped and tabs /
#       line breaks are removed from each value text.
#   facilities_info_node: node(s) whose children each hold one facility; their
#       texts are joined with ' | ' into the Facilities column.
#   agent_name: value stored in the Agent column.
#   min_dist: value stored in the Min.Distance column.
#
# Returns:
#   A one-row data frame with one column per label plus Facilities, Agent and
#   Min.Distance.
get_df_from_node_list = function(listing_info_node, facilities_info_node, agent_name, min_dist) {
    fields = listing_info_node %>% html_children
    # Odd-positioned children are the labels, even-positioned ones the values.
    labels = fields %>% `[`(c(TRUE, FALSE)) %>% html_text %>% takeout_last_char
    values = fields %>% `[`(c(FALSE, TRUE)) %>% html_text %>% remove_tabs
    # Read the facilities from the argument, not from a same-named global.
    facilities = facilities_info_node %>% html_children %>% html_text %>% remove_tabs %>% paste(collapse=' | ')

    data_list = list()
    data_list[labels] = values
    data_list['Facilities'] = facilities
    data_list['Agent'] = agent_name
    data_list['Min.Distance'] = min_dist

    data_list %>% as.data.frame
}

In [134]:
# Delete every tab, carriage return and newline from x.
# Ordinary spaces are left untouched.
remove_tabs <- function(x) {
    gsub("([\t]|[\r\n])", "", x)
}

# Drop the final character of each string in x; used to strip the trailing
# colon from a label.
takeout_last_char <- function(x) {
    substr(x, start = 1, stop = nchar(x) - 1)
}

In [199]:
# CSS selectors used by get_min_distance(): nodes matching mrt_class are
# searched for descendants matching distance_class, whose text is read as a distance.
# NOTE(review): presumably ".amenity-15" marks the MRT (train) amenity entries - confirm against the page markup.
mrt_class = ".amenity-15"
distance_class = ".side-nearby-amenity-train-bus-distance"

In [147]:
# Turn a distance string into a number.
#   - character(0) gives an empty list.
#   - A string ending in 'km' has those two characters removed and the
#     remaining number multiplied by 1000.
#   - Any other string has its last character removed and the rest converted
#     to a number.
convert_km_to_m <- function(dist) {
    if (identical(dist, character(0))) {
        return(list())
    }
    n_chars <- nchar(dist)
    is_km <- substring(dist, n_chars - 1, n_chars) == 'km'
    if (!is_km) {
        return(as.numeric(substring(dist, 1, n_chars - 1)))
    }
    as.numeric(substring(dist, 1, n_chars - 2)) * 1000
}

In [191]:
# Find the smallest distance listed on a parsed page.
# Takes the nodes matching mrt_class, reads the text of their descendants
# matching distance_class, strips tabs and line breaks, converts each text with
# convert_km_to_m and returns the minimum (NA when nothing was found).
get_min_distance <- function(html) {
    amenity_nodes <- html_nodes(html, mrt_class)
    distance_text <- remove_tabs(html_text(html_nodes(amenity_nodes, distance_class)))
    distances <- as.vector(sapply(distance_text, convert_km_to_m))
    get_min_dist_if_exists(distances)
}

In [203]:
# Smallest element of dists, or NA when dists has no elements.
get_min_dist_if_exists <- function(dists) {
    if (length(dists) > 0) {
        return(min(dists))
    }
    NA
}

In [216]:
# The main crawl script:
# 1. calls a query for each page
# 2. for every listing, call a query to pull data about listing
# 3. feeds listing data into get_df_from_node_list to get a single row df
# 4. stitch df into main df
# 5. breaks when done
#
# this function can be further refactored for SLAP (not important for this mod)

n = 1L
prop_df = data.frame()

# Hard upper bound on the number of result pages (100,000 rather than an
# unconditional loop) so that a missed stop signal cannot make the crawl run forever.
max_pages = 100000L

while (n <= max_pages) {
    property = trycatch_fetch_property_notes(n)
    print(n)

    # The fetch wrapper hands back a character vector (e.g. 'Done') instead of
    # listing nodes when there is nothing more to crawl or the page failed.
    if (is.character(property)) {
        break
    }

    # Rows of this page are collected first and appended to prop_df once per
    # page, which avoids copying the whole data frame for every listing.
    page_rows = vector("list", length(property))
    for (i in seq_along(property)) {
        # A listing that cannot be downloaded or parsed is reported and
        # skipped rather than aborting the whole crawl.
        row_df = tryCatch({
            listing_url = paste(host_url, html_attr(property[i], 'href'), sep="")
            listing_html = read_html(listing_url)
            listing_data = html_nodes(listing_html, '.listing-about-main')
            listing_info = html_nodes(listing_data[1], 'p')
            facilities_info = listing_data[2]
            min_dist = suppressWarnings(get_min_distance(listing_html))
            agent_name = listing_html %>% html_node('.featuredAgentName') %>% html_text

            get_df_from_node_list(listing_info, facilities_info, agent_name, min_dist)
        }, error = function(e) {
            message("Skipping listing ", i, " on page ", n, ": ", conditionMessage(e))
            NULL
        })
        if (!is.null(row_df)) {
            page_rows[[i]] = row_df
        }
    }

    page_rows = Filter(Negate(is.null), page_rows)
    prop_df = suppressWarnings(bind_rows(prop_df, page_rows))

    # 1L keeps the page counter an integer.
    n = n + 1L
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
[1] 16
[1] 17
[1] 18
[1] 19
[1] 20
[1] 21
[1] 22
[1] 23
[1] 24
[1] 25
[1] 26
[1] 27
[1] 28
[1] 29
[1] 30
[1] 31
[1] 32
[1] 33
[1] 34
[1] 35
[1] 36
[1] 37
[1] 38
[1] 39
[1] 40
[1] 41
[1] 42
[1] 43
[1] 44
[1] 45
[1] 46
[1] 47
[1] 48
[1] 49
[1] 50
[1] 51
[1] 52
[1] 53
[1] 54
[1] 55
[1] 56
[1] 57
[1] 58
[1] 59
[1] 60
[1] 61
[1] 62
[1] 63
[1] 64
[1] 65
[1] 66
[1] 67
[1] 68
[1] 69
[1] 70
[1] 71
[1] 72
[1] 73
[1] 74
[1] 75
[1] 76
[1] 77
[1] 78
[1] 79
[1] 80
[1] 81
[1] 82
[1] 83
[1] 84
[1] 85
[1] 86
[1] 87
[1] 88
[1] 89
[1] 90
[1] 91
[1] 92
[1] 93
[1] 94
[1] 95
[1] 96
[1] 97
[1] 98
[1] 99
[1] 100


In [209]:
# Count the rows of prop_df whose Min.Distance is NA.
nrow(prop_df[is.na(prop_df$Min.Distance),])

In [210]:
# Display every row of prop_df whose Min.Distance is NA.
prop_df[is.na(prop_df$Min.Distance),]

Unnamed: 0,Property.Name,Property.Type,Asking,PSF,Built.Year,Model,Developer,Address,District,Bedrooms,Bathrooms,Furnish,Floor,Area,Tenure,No..of.Units,Facilities,Agent,Min.Distance,HDB.Town
54,The Oceanfront @ Sentosa Cove,Condominium,"$12,800,000","$2,139 psf (Built-up)",2010.0,Condominium,TC Development Pte Ltd,289 Ocean Drive (098531),D4 - Sentosa / Harbourfront,4,4.0,Partially Furnished,15,"5,983 sqft (Built-up)",LEASEHOLD/99 years,264.0,Corner Unit | Sea View | Air Conditioning | Bathtub | Intercom | Water Heater | Cooker Hob/hood | Private Lift | Balcony | Maidsroom | Roof Terrace | Terrace | Walk-in-wardrobe,Harry Zeng (MBA-EXPERT-SRX) 曾伟源,,
85,Pearl Island,Detached,"$15,633,675","$2,384 psf (Land) /$2,369 psf (Built-up)",2012.0,Detached,,Pearl Island,D4 - Sentosa / Harbourfront,5,5.0,Partially Furnished,,"6,556 sqft (Land) / 6,598 sqft (Built-up)",,,Corner Unit | Swimming Pool View | Air Conditioning | Water Heater | Bathtub | Cooker Hob/hood | Private Pool | Private Lift | Outdoor Patio | Walk-in-wardrobe | Bombshelter | Maidsroom | Private Garden | Terrace,Harry Zeng (MBA-EXPERT-SRX) 曾伟源,,
138,Pearl Island,Detached,"$33,000,000","$2,828 psf (Land) /$2,928 psf (Built-up)",2012.0,Detached,,Pearl Island,D4 - Sentosa / Harbourfront,7,,,,"11,667 sqft (Land) / 11,270 sqft (Built-up)",LEASEHOLD/99 years,,Park/greenery View | Sea View | Swimming Pool View | Corner Unit | Hairdryer | Bathtub | Intercom | Cooker Hob/hood | Jacuzzi | Private Pool | Water Heater | Air Conditioning | Bombshelter | Balcony | Garage,Linda Ong,,
151,Pearl Island,Detached,"$23,000,000","$2,429 psf (Land) /$2,252 psf (Built-up)",2012.0,Detached,,Pearl Island,D4 - Sentosa / Harbourfront,6,7.0,Fully Furnished,,"9,468 sqft (Land) / 10,215 sqft (Built-up)",LEASEHOLD/99 years,,Renovated | Park/greenery View | Swimming Pool View | Private Pool | Water Heater | Bathtub | Cooker Hob/hood | Air Conditioning | Balcony | Bombshelter | Walk-in-wardrobe | Roof Terrace | Outdoor Patio | Maidsroom | Private Garden,Lin Sallee,,
152,Marina Collection,Condominium,"$4,400,000","$1,875 psf (Built-up)",2011.0,Condominium,Lippo Marina Collection Pte Ltd,17 Cove Drive (098329),D4 - Sentosa / Harbourfront,4,4.0,Fully Furnished,02,"2,347 sqft (Built-up)",LEASEHOLD/99 years,124.0,Lawn | Clubhouse | Jacuzzi | Playground | Docking Station | Pool Deck | Lap Pool | Garden Plaza | Bbq Terrace | Leisure Pool | Wading Pool | Outdoor Terrace | Entry Driveway | Swimming Pool View | Corner Unit | Water Heater | Bathtub | Intercom | Cooker Hob/hood | Air Conditioning | Balcony,Lin Sallee,,
241,Marina Collection,Condominium,"$4,500,000","$2,059 psf (Built-up)",2011.0,Condominium,Lippo Marina Collection Pte Ltd,13 Cove Drive (098327),D4 - Sentosa / Harbourfront,4,4.0,Partially Furnished,LOW,"2,185 sqft (Built-up)",LEASEHOLD/99 years,124.0,Leisure Pool | Docking Station | Garden Plaza | Clubhouse | Bbq Terrace | Jacuzzi | Lap Pool | Pool Deck | Entry Driveway | Wading Pool | Lawn | Playground | Outdoor Terrace | Park/greenery View | Swimming Pool View | Air Conditioning | Bathtub | Intercom | Water Heater | Cooker Hob/hood | Balcony,Harry Zeng (MBA-EXPERT-SRX) 曾伟源,,
357,Cove Drive,Detached,"$22,800,000","$2,874 psf (Land) /$2,053 psf (Built-up)",2009.0,Detached,,Cove Drive,D4 - Sentosa / Harbourfront,5+1,5.0,Fully Furnished,GROUND,"7,931 sqft (Land) / 11,104 sqft (Built-up)",LEASEHOLD/99 years,,Sea View | Renovated | Swimming Pool View | Air Conditioning | Private Pool | Water Heater | Intercom | Cooker Hob/hood | Bombshelter | Outdoor Patio | Terrace | Private Garden | Walk-in-wardrobe | Balcony | Maidsroom | Roof Terrace,Pang Francis,,
359,Cove Drive,Detached,"$21,800,000","$2,783 psf (Land) /$1,982 psf (Built-up)",2009.0,Detached,,Cove Drive,D4 - Sentosa / Harbourfront,5+1,,,,"7,831 sqft (Land) / 11,000 sqft (Built-up)",LEASEHOLD/99 years,,Sea View | Swimming Pool View | Renovated | Air Conditioning | Bathtub | Private Pool | Intercom | Water Heater | Cooker Hob/hood | Balcony | Maidsroom | Roof Terrace | Bombshelter | Outdoor Patio | Terrace | Garage | Private Garden,Pang Francis,,
393,Marina Collection,Condominium,"$7,009,650","$1,850 psf (Built-up)",2011.0,Penthouse,Lippo Marina Collection Pte Ltd,13 Cove Drive (098327),D4 - Sentosa / Harbourfront,5,,,,"3,789 sqft (Built-up)",LEASEHOLD/99 years,124.0,Outdoor Terrace | Pool Deck | Bbq Terrace | Jacuzzi | Docking Station | Wading Pool | Leisure Pool | Playground | Lap Pool | Entry Driveway | Lawn | Garden Plaza | Clubhouse,Navin Bafna,,
399,Marina Collection,Condominium,"$6,533,980","$1,915 psf (Built-up)",2011.0,Penthouse,Lippo Marina Collection Pte Ltd,17 Cove Drive (098329),D4 - Sentosa / Harbourfront,4,,,,"3,412 sqft (Built-up)",LEASEHOLD/99 years,124.0,Entry Driveway | Pool Deck | Clubhouse | Jacuzzi | Outdoor Terrace | Playground | Wading Pool | Garden Plaza | Lawn | Lap Pool | Leisure Pool | Docking Station | Bbq Terrace,Raymond Ho,,


In [217]:
# Total number of rows collected in prop_df.
prop_df %>% nrow

In [218]:
# prop_df %>% write.csv(file='srx_team_6.csv')