# DATA 422 PROJECT SCRAPING

In [1]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.1     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [2]:
# install.packages("magrittr")
# install.packages("purrr")
# install.packages("glue")
# install.packages("stringr")
library(magrittr) # better handling of pipes
library(purrr) # to work with lists and map functions
library(glue) # to paste strings
library(stringr) # to handle strings


Attaching package: ‘magrittr’


The following object is masked from ‘package:purrr’:

    set_names


The following object is masked from ‘package:tidyr’:

    extract



Attaching package: ‘glue’


The following object is masked from ‘package:dplyr’:

    collapse




In [3]:
# install.packages("rvest")
#install.packages("polite")
library(rvest) # rvest makes scraping easier
library(polite) # polite is the "polite" version of rvest

Loading required package: xml2


Attaching package: ‘rvest’


The following object is masked from ‘package:purrr’:

    pluck


The following object is masked from ‘package:readr’:

    guess_encoding




In [4]:
# Extract the relevant data from a web page (identified by its URL).
#
# Args:
#   url: URL corresponding to the web page to be scraped (anything that
#        read_html() accepts).
#
# Returns:
#   A tibble with the columns Year (integer), Month, "Affected Area",
#   "Weather Event" and Description. An event whose headline mentions several
#   known areas yields one row per area; an event whose headline mentions no
#   known area is assigned to "New Zealand".
#
# Raises:
#   An error when the number of headlines (<dt>) and descriptions (<dd>)
#   found on the page differ, because they could not be paired reliably.
extract_text <- function(url) {

    # Download and parse the page only once; headlines and descriptions are
    # both read from the same ".summary" nodes
    summary_nodes <- url %>%
                     read_html() %>%
                     html_nodes(".summary")

    # Extract the headlines summarizing NZ historic weather events
    summary_text <- summary_nodes %>%
                    html_nodes("dt") %>%
                    html_text() %>%
                    # Remove whitespace from the beginning and end of string
                    str_trim()

    # Extract the brief description corresponding to each weather event
    description_text <- summary_nodes %>%
                        html_nodes("dd") %>%
                        html_text() %>%
                        # Remove whitespace from the beginning and end of string
                        str_trim()

    # Headlines and descriptions are paired by position, so a mismatch would
    # either fail later or (for a single description) be silently recycled
    if (length(summary_text) != length(description_text)) {
        stop("Found ", length(summary_text), " headlines but ",
             length(description_text), " descriptions on the page",
             call. = FALSE)
    }

    # Each headline has the form "<Month> <Year> <Summary ...>":
    # first word -> Month, second word -> Year, everything else -> Summary
    df <- tibble(
        Month = word(summary_text, 1),
        Year = as.integer(word(summary_text, 2)),
        Summary = word(summary_text, 3, -1),
        Description = description_text
    )

    # Collapse all matches found in one headline into a single string;
    # a missing headline contributes no matches instead of the text "NA"
    collapse_matches <- function(matches) {
        toString(matches[!is.na(matches)])
    }

    # Areas affected by the weather events
    NZ_areas <- c("South Island", "New Zealand", "Timaru", "West Coast", "North Island",
                  "Upper North Island", "Lower North Island", "Canterbury", "Otago",
                  "Bay of Plenty", "Wellington", "Nelson", "Auckland", "Tasman", "Taranaki",
                  "Hawke's Bay", "Gisborne", "Southland", "Marlborough")
    NZ_areas_cond <- str_c(NZ_areas, collapse = "|")

    # Extract the area name(s) from the "Summary" column
    affected_area <- vapply(str_extract_all(df$Summary, NZ_areas_cond),
                            collapse_matches, character(1))

    # Default to the whole country when no known area is mentioned; only the
    # area column is touched, other empty fields are left as they are
    affected_area[!nzchar(affected_area)] <- "New Zealand"
    df[["Affected Area"]] <- affected_area

    # One row per affected area; toString() joins with ", ", so the separator
    # must also consume the space to avoid values such as " Canterbury"
    df <- separate_rows(df, "Affected Area", sep = ",\\s*")

    # Weather events
    Weather_events <- c("Flood", "Storm", "Hail", "Cyclone", "Heavy Rain", "Cold Snap",
                        "Tornado", "Weather Bomb", "Winds", "Snowfall")
    Weather_events_cond <- str_c(Weather_events, collapse = "|")

    # Extract the weather event(s) from the "Summary" column
    df[["Weather Event"]] <- vapply(str_extract_all(df$Summary, Weather_events_cond),
                                    collapse_matches, character(1))

    # Keep only the output columns, in their final order
    df %>%
        select(Year, Month, `Affected Area`, `Weather Event`, Description)
}


In [5]:
# Part of URL common to all the web pages to be scraped
url_subset <- "https://hwe.niwa.co.nz/search/summary/Startdate/01-01-2010/Enddate/02-10-2020/Regions/all/Hazards/all/Impacts/all/Keywords/none/numberOfEvents/20/page/"

# Total number of web pages to be scraped
total_pages <- 3

# List containing all the data frames corresponding to different web pages' data.
# The page number is appended to the common URL part; seq_len() yields an
# empty sequence (instead of c(1, 0)) should total_pages ever be 0.
df_list <- lapply(seq_len(total_pages), function(i) {
    # Extracting the data from a particular web page in the form of a data frame
    extract_text(paste0(url_subset, i))
})

# Merging the rows of all the data frames (corresponding to the data from different web pages) together
# to get the final data frame containing all the required data relevant to NZ historic weather events
nz_historic_weather_events_df <- bind_rows(df_list)

nz_historic_weather_events_df


Year,Month,Affected Area,Weather Event,Description
<int>,<chr>,<chr>,<chr>,<chr>
2020,February,South Island,Flood,"Fronts within a very warm northwesterly flow brought very heavy rain to Westland, Fiordland and Southland. This caused severe flooding and slips across Fiordland, Southland, and South Otago."
2019,December,New Zealand,Storm,"A prolonged period of unstable northwesterly flow affected much of New Zealand over this period. There was particularly heavy rain about and west of the Southern Alps, which caused severe disruption and damage to the South Island’s West Coast."
2019,November,Timaru,Hail,"A severe hailstorm over Timaru, with golf ball-sized hail stones, caused extensive damage to buildings and vehicles."
2019,August,New Zealand,Storm,"A complex low pressure system, with a number of associated fronts, brought rain, gales and thunderstorms to many parts of the country, and snow down to 300 to 400 metres across southern and eastern parts of the lower South Island."
2019,March,South Island,Flood,"A strong, warm northwest flow covered the South Island, with several fronts slowly crossed the island. In addition, a warmer than usual Tasman Sea resulted in heavier rain (some parts of Westland had up to 800 mm during the downpour) than would normally occur. The heavy rain raised rivers to record levels. There were many slips and flooding that damaged property and closed roads. Haast experienced a one-in-100-year flood."
2018,November,West Coast,Storm,A trough over southern and central areas brought very heavy rain to the West Coast of the South Island and snow to low levels in inland Canterbury.
2018,April,New Zealand,Storm,"A low pressure system crossed New Zealand, bringing severe gales to the North Island and upper South Island. Auckland was particularly badly affected. Two tornadoes (one at Rahotu in Taranaki, and the other at National Park) were reported."
2018,February,New Zealand,Cyclone,"Ex-tropical cyclone Gita brought very heavy rain and severe gales, and caused severe damage to many areas of New Zealand. The worst affected areas were northwest Nelson/Golden Bay, the Kaikoura Coast, and Taranaki."
2018,February,New Zealand,Cyclone,"Ex-tropical cyclone Fehi moved into the North Tasman Sea, and then combined with a trough moving in from the southwest. The storm crossed New Zealand, causing extensive damage due to wind, rain and coastal storm surges. The West Coast of the South Island was badly affected by the wind and rain, and the Nelson area was affected by storm surges."
2018,January,New Zealand,Storm,"A deep low brought stormy weather to northern and central areas of New Zealand. Severe gales and heavy rain caused severe damage in the north of the North Island. There was also coastal devastation from the combination of low pressure, king tides and very strong winds."


In [6]:
#nz_historic_weather_events_df %>% 
    #write_csv("nz_historic_weather_events_df.csv") 
