In [1]:
# Libraries
library(xml2)
library(rvest)
library(tidyverse)
library(magrittr)
library(purrr)
library(glue)
library(stringr)
library(tidytext)
library(textdata)
library(dplyr)
library(httr)
library(jsonlite)

── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()         masks stats::filter()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ dplyr::lag()            masks stats::lag()

Attaching package: 'magrittr'


The following object is masked from 'package:purrr':

    set_names


The following object is masked from 'package:tidyr':

    extract

In [2]:
## Scrape running events page
# Builds `event_link_df`: one row per event listed on the events page, with
# the event name (`Event`) and the URL of its website (`Link`).

# URL for base page
url <- "https://runningevents.co.nz/#events"
# Base page HTML
events_html <- read_html(url)
# Event name
event_name <- events_html %>%
    html_nodes(".mb-3 .position-relative h5") %>%
    html_text()
# Event date (scraped for reference only; it is not part of the link table)
event_date <- events_html %>%
    html_nodes(".mb-3 .position-relative p") %>%
    html_text()
# Event link
# html_attr() returns NA for an anchor without an href instead of erroring
event_link <- events_html %>%
    html_nodes("a.mt-2") %>%
    html_attr("href")
# data.frame() silently recycles vectors whose lengths divide evenly, which
# would pair names with the wrong links; fail loudly instead.
if (length(event_name) != length(event_link)) {
    stop(
        "Scraped ", length(event_name), " event names but ",
        length(event_link), " event links from ", url,
        call. = FALSE
    )
}
# Dataframe of event names and links
event_link_df <- data.frame(Event = event_name, Link = event_link)
# Show results
event_link_df

Event,Link
<chr>,<chr>
Tauranga Marathon,https://taurangamarathon.nz
Devonport Half Marathon,https://devonporthalfmarathon.co.nz
Run Orewa,https://runorewa.nz
Corporate Challenge Wellington,https://corporatechallenge.co.nz
Corporate Challenge Christchurch,https://corporatechallenge.co.nz
Corporate Challenge Auckland,https://corporatechallenge.co.nz
Run The Point,https://runthepoint.nz
Omaha Half Marathon,https://omahahalfmarathon.co.nz
Run Devonport,https://rundevonport.nz
Coatesville Half Marathon,https://coatesvillehalfmarathon.co.nz


In [3]:
## Scrape running events page further detail
# Builds `event_df`: the most recent result entry for each event, with the
# columns Date, Event, Participants, Location and City.

# URL for results page
url <- "https://results.runningevents.co.nz/Home"
# Results page HTML
runningevents_html <- read_html(url)
# Event table
event_table <- runningevents_html %>%
    html_nodes("#data-table-results") %>%
    html_table()
# Fail with a clear message rather than a subscript error if the table is gone
if (length(event_table) == 0) {
    stop("No '#data-table-results' table found at ", url, call. = FALSE)
}
event_df <- event_table[[1]]

## Tidy data
# Keep only the populated columns. Selecting by name (instead of dropping
# the empty columns by position) keeps this correct if the column order of
# the scraped table changes.
event_df <- event_df[c("Date", "Event", "Participants")]
# Remove digits (e.g. the year) from the event name, then trim the whitespace
# left behind so the same event is grouped together regardless of where the
# digits appeared in the name.
event_df$Event <- trimws(gsub("[0-9]+", "", event_df$Event))

## City
# Search patterns (matched against the event name) and corresponding cities
location_list <- list(
    "Rotorua" = c("Rotorua"),
    "Auckland" = c("Auckland", "Devonport", "Coatesville", "Albany", "Orewa", "Omaha", "Maraetai", "run the point", "waterfront"),
    "Christchurch" = c("Christchurch"),
    "Tauranga" = c("Tauranga"),
    "Wellington" = c("Wellington"),
    "Virtual" = c("Virtual"))
# Patterns that are not themselves a location name, and the location to
# record for them
location_labels <- c("run the point" = "Hobsonville", "waterfront" = "Auckland")
# Names of cities
cities <- names(location_list)
# Creating empty columns in events df
event_df$Location <- NA_character_
event_df$City <- NA_character_
# Fill in city and location results.
# Patterns are applied in list order, so a later match overwrites an
# earlier one for the same row.
for (city in cities) {
    for (location in location_list[[city]]) {
        # Rows whose event name contains the pattern (case-insensitive)
        matches <- grep(location, event_df$Event, ignore.case = TRUE)
        label <- if (location %in% names(location_labels)) {
            location_labels[[location]]
        } else {
            location
        }
        event_df[matches, "City"] <- city
        event_df[matches, "Location"] <- label
    }
}

## Recent events
# Convert date to datetime
event_df$Date <- as.POSIXct(event_df$Date, format = "%d/%m/%Y")
# Remove all but most recent occurrence of each event.
# Rows with an unparseable date are dropped first: otherwise a single NA
# would make max(Date) NA and discard every row of that event.
event_df <- event_df %>%
    filter(!is.na(Date)) %>%
    group_by(Event) %>%
    filter(Date == max(Date)) %>%
    ungroup()

# Show results
event_df %>%
    head()

Date,Event,Participants,Location,City
<dttm>,<chr>,<int>,<chr>,<chr>
2011-11-02,Corporate Challenge - Christchurch,427,Christchurch,Christchurch
2011-11-16,Corporate Challenge - Wellington,546,Wellington,Wellington
2011-11-23,Corporate Challenge - Auckland,1120,Auckland,Auckland
2012-11-07,Fidelity Life Corporate Challenge - Christchurch,375,Christchurch,Christchurch
2012-11-14,Fidelity Life Corporate Challenge - Wellington,289,Wellington,Wellington
2012-11-21,Fidelity Life Corporate Challenge - Auckland,795,Auckland,Auckland


In [4]:
## Combine tables
# Strip leading/trailing whitespace from the results-table event names so
# they can be matched against the names from the events page.
event_df[["Event"]] <- trimws(event_df[["Event"]])
# Keep every row of the link table and attach the matching result columns;
# events without a match get NA in those columns.
event_df <- event_link_df %>%
    left_join(event_df, by = "Event")

# Show results
event_df

Event,Link,Date,Participants,Location,City
<chr>,<chr>,<dttm>,<int>,<chr>,<chr>
Tauranga Marathon,https://taurangamarathon.nz,2022-09-18,1570,Tauranga,Tauranga
Devonport Half Marathon,https://devonporthalfmarathon.co.nz,2022-10-02,1268,Devonport,Auckland
Run Orewa,https://runorewa.nz,2022-10-16,834,Orewa,Auckland
Corporate Challenge Wellington,https://corporatechallenge.co.nz,2020-11-11,393,Wellington,Wellington
Corporate Challenge Christchurch,https://corporatechallenge.co.nz,2022-02-16,767,Christchurch,Christchurch
Corporate Challenge Auckland,https://corporatechallenge.co.nz,2020-11-18,938,Auckland,Auckland
Run The Point,https://runthepoint.nz,2022-02-20,771,Hobsonville,Auckland
Omaha Half Marathon,https://omahahalfmarathon.co.nz,2022-03-27,1689,Omaha,Auckland
Run Devonport,https://rundevonport.nz,2022-02-07,768,Devonport,Auckland
Coatesville Half Marathon,https://coatesvillehalfmarathon.co.nz,2022-02-13,1270,Coatesville,Auckland


In [5]:
## Save as CSV file
# Writes the combined table to "events.csv" in the working directory
# (default settings, so row names are included as the first column).
write.csv(x = event_df, file = "events.csv")
