# NZ HISTORIC WEATHER EVENTS DATA SCRAPING

In [4]:
# Packages used by the scraping code below.
# The commented-out install.packages() calls are one-off setup steps; uncomment
# them only if a package is missing from the local library.
# NOTE(review): glue and polite are attached here but are not called in the
# code visible in this notebook -- confirm whether they are still needed.
library(tidyverse)
# install.packages("magrittr")
# install.packages("purrr")
# install.packages("glue")
# install.packages("stringr")
# install.packages("rvest")
# install.packages("polite")
library(magrittr) # better handling of pipes
library(purrr) # to work with lists and map functions
library(glue) # to paste strings
library(stringr) # to handle strings
library(rvest) # rvest makes scraping easier
library(polite) # polite is the "polite" version of rvest

In [5]:
# Extract the NZ historic weather event records from one results page.
#
# Args:
#   url: URL (character scalar) of the web page to be scraped.
#
# Returns:
#   A tibble with one row per (event, affected area) pair and the columns
#   Year (integer), Month, "Affected Area", "Weather Event" and Description.
#   Events whose headline names no known area are attributed to "New Zealand".
#   Several weather events found in one headline are kept together in a single
#   comma-separated "Weather Event" value.
#
# Raises:
#   An error if the number of headlines (<dt>) and descriptions (<dd>) found
#   on the page differ, since they could then not be paired reliably.
extract_text <- function(url) {

    # Download and parse the page a single time and reuse the parsed nodes
    # for both the headlines and the descriptions (one request per page).
    summary_nodes <- url %>%
                     read_html() %>%
                     html_nodes(".summary")

    # Headlines summarizing the weather events, e.g. "<Month> <Year> <title>"
    summary_text <- summary_nodes %>%
                    html_nodes("dt") %>%
                    html_text() %>%
                    # Remove whitespace from the beginning and end of string
                    str_trim()

    # Brief description corresponding to each weather event
    description_text <- summary_nodes %>%
                        html_nodes("dd") %>%
                        html_text() %>%
                        # Remove whitespace from the beginning and end of string
                        str_trim()

    # Headlines and descriptions are paired by position, so a length mismatch
    # (including a single description being recycled) would corrupt the data.
    if (length(summary_text) != length(description_text)) {
        stop("Found ", length(summary_text), " headlines but ",
             length(description_text), " descriptions on ", url,
             call. = FALSE)
    }

    # First word of the headline is the month, second the year, and the
    # remainder is the free-text summary used for area/event detection.
    df <- tibble(
        Year = as.integer(word(summary_text, 2)),
        Month = word(summary_text, 1),
        Summary = word(summary_text, 3, -1),
        Description = description_text
    )

    # Areas affected by the weather events
    NZ_areas <- c("South Island", "New Zealand", "Timaru", "West Coast", "North Island",
                  "Upper North Island", "Lower North Island", "Canterbury", "Otago",
                  "Bay of Plenty", "Wellington", "Nelson", "Auckland", "Tasman", "Taranaki",
                  "Hawke's Bay", "Gisborne", "Southland", "Marlborough")
    NZ_areas_cond <- str_c(NZ_areas, collapse = "|")

    # Collect every area named in the summary as one ", "-separated string.
    # vapply() guarantees a character vector even when the page has no events.
    df$`Affected Area` <- vapply(str_extract_all(df$Summary, NZ_areas_cond),
                                 toString, character(1))

    # No recognised area in the headline: treat the event as nationwide
    df$`Affected Area`[df$`Affected Area` == ""] <- "New Zealand"

    # One row per affected area. toString() joins with ", ", so the separator
    # must also consume the space; otherwise every area after the first would
    # keep a leading blank (" Nelson" instead of "Nelson").
    df <- separate_rows(df, `Affected Area`, sep = ",\\s*")

    # Weather events
    Weather_events <- c("Flood", "Storm", "Hail", "Cyclone", "Heavy Rain", "Cold Snap",
                        "Tornado", "Weather Bomb", "Winds", "Snowfall")
    Weather_events_cond <- str_c(Weather_events, collapse = "|")

    # Collect every weather event named in the summary as one string
    df$`Weather Event` <- vapply(str_extract_all(df$Summary, Weather_events_cond),
                                 toString, character(1))

    # Keep only the output columns (drops the helper "Summary" column)
    df %>%
        select(Year, Month, `Affected Area`, `Weather Event`, Description)
}


In [6]:
# Part of URL common to all the web pages to be scraped
url_subset <- "https://hwe.niwa.co.nz/search/summary/Startdate/01-01-2011/Enddate/31-12-2017/Regions/all/Hazards/all/Impacts/all/Keywords/none/numberOfEvents/20/page/"

# Total number of web pages to be scraped
total_pages <- 2

# List holding one data frame per scraped web page. It is preallocated so the
# loop fills slots in place instead of growing the list on every iteration.
df_list <- vector("list", total_pages)

# Extracting the data from the different web pages.
# seq_len() yields an empty sequence when total_pages is 0, whereas
# 1:total_pages would iterate over 1 and 0.
for (i in seq_len(total_pages)) {
    # URL corresponding to a web page to be scraped
    url <- paste0(url_subset, i)
    # Extracting the data from a particular web page in the form of a data frame
    # using the function defined above
    df <- extract_text(url)
    df_list[[i]] <- df
}

# Merging the rows of all the data frames (corresponding to the data from different web pages) together
# to get the final data frame containing all the required data relevant to NZ historic weather events
nz_historic_weather_events_df <- bind_rows(df_list)

# Sorting the dataframe in ascending order of year
nz_historic_weather_events_df <- nz_historic_weather_events_df[order(nz_historic_weather_events_df$Year), ]
nz_historic_weather_events_df


Year,Month,Affected Area,Weather Event,Description
<int>,<chr>,<chr>,<chr>,<chr>
2011,December,Tasman,"Heavy Rain, Flood",There was very heavy rain in Tasman-Nelson which resulted in flooding and numerous slips in the region.
2011,December,Nelson,"Heavy Rain, Flood",There was very heavy rain in Tasman-Nelson which resulted in flooding and numerous slips in the region.
2011,November,West Coast,"Flood, Winds",Heavy rain brought flooding to the West Coast. There were also very high winds in Wellington and the Wairarapa.
2011,November,North Island,"Flood, Winds",Heavy rain brought flooding to the West Coast. There were also very high winds in Wellington and the Wairarapa.
2011,August,New Zealand,Snowfall,"Widespread snow in both islands caused much disruption: many roads were closed, schools were closed, CBD work was suspended in Christchurch, and power was cut to thousands of homes."
2011,July,New Zealand,Snowfall,"Widespread snow in both islands caused much disruption, with many roads closed, many flights to South Island airports cancelled."
2011,June,Taranaki,Tornado,"A swarm of tornados struck North Taranaki (New Plymouth city, Bell Block and Omata) damaging property and causing power outage."
2011,May,Upper North Island,"Heavy Rain, Winds","Strong winds caused damage and disruption in Otago, and heavy rain caused disruption in the Upper North Island on 12 May 2011."
2011,May,Otago,"Heavy Rain, Winds","Strong winds caused damage and disruption in Otago, and heavy rain caused disruption in the Upper North Island on 12 May 2011."
2011,May,Auckland,Tornado,"The tornado formed from a very active and well defined band of thunderstorm activity on the afternoon of the 3rd May 2011. The destructive tornado swept across Auckland's North Shore,  leaving a trail of destruction from Albany to Glenfield."


In [7]:
#nz_historic_weather_events_df %>% 
    #write_csv("nz_historic_weather_events_df.csv")