#### Load the necessary packages

In [5]:
library(rvest)
library(stringr)

#### Define a function to get weather data

In [12]:
#' Download monthly weather tables for a city over a range of months
#'
#' For every month from `start_date` to `end_date` (inclusive) the page
#' `http://lishi.tianqi.com/<city>/<YYYYMM>.html` is fetched, the text of the
#' nodes matching `.thrui div` is extracted, the last extracted element is
#' dropped, and the remainder is laid out as rows of five cells.
#'
#' @param start_date First month to fetch, as a single "YYYY-M" or "YYYY-MM"
#'   string. Only the first two "-"-separated fields are used.
#' @param end_date Last month to fetch, same format as `start_date`.
#' @param city City identifier as it appears in the site's URL path.
#' @return A list with two elements:
#'   * `data`: a data frame with columns `date`, `max`, `min`, `weather`,
#'     `wind`, holding the rows of all months stacked in chronological order.
#'   * `failed`: the "YYYYMM" labels of months whose page could not be fetched
#'     or yielded fewer than two rows, or `NULL` when there were none.
#' @details Stops with an error when a date cannot be parsed, a month is
#'   outside 1-12, or `start_date` is later than `end_date`. A page that
#'   cannot be fetched raises a warning and is reported in `failed` instead of
#'   aborting the whole download.
get_weather_data <- function(start_date, end_date, city) {
  # Turn one "YYYY-M" string into c(year, month), rejecting malformed input.
  parse_year_month <- function(x, arg) {
    ym <- NA_integer_
    if (length(x) == 1L && !is.na(x)) {
      fields <- strsplit(as.character(x), "-", fixed = TRUE)[[1]]
      ym <- suppressWarnings(as.integer(fields[1:2]))
    }
    if (length(ym) != 2L || anyNA(ym) || ym[2] < 1L || ym[2] > 12L) {
      stop("`", arg, "` must be a single string of the form \"YYYY-M\"",
           call. = FALSE)
    }
    ym
  }

  start <- parse_year_month(start_date, "start_date")
  end <- parse_year_month(end_date, "end_date")

  # Number months consecutively (year * 12 + zero-based month) so the range is
  # a plain integer sequence regardless of how many year boundaries it spans.
  first_month <- start[1] * 12L + start[2] - 1L
  last_month <- end[1] * 12L + end[2] - 1L
  if (first_month > last_month) {
    stop("start date must not be later than end date", call. = FALSE)
  }
  month_index <- first_month:last_month

  # Every month is zero-padded to two digits so each label reads YYYYMM.
  year_month <- paste0(
    month_index %/% 12L,
    sprintf("%02d", month_index %% 12L + 1L)
  )

  month_tables <- vector("list", length(year_month))
  is_failed <- logical(length(year_month))

  for (i in seq_along(year_month)) {
    url <- paste0("http://lishi.tianqi.com/", city, "/", year_month[i], ".html")

    cells <- tryCatch(
      html_text(html_nodes(read_html(url), ".thrui div")),
      error = function(e) {
        warning("Failed to fetch ", url, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )

    if (is.null(cells)) {
      is_failed[i] <- TRUE
    } else {
      # The last extracted element is not part of the table and is discarded.
      cells <- head(cells, -1L)
      month_rows <- matrix(cells, ncol = 5, byrow = TRUE)
      # Fewer than two rows is treated as a failed month; whatever was
      # extracted is still kept in the output.
      if (nrow(month_rows) < 2) {
        is_failed[i] <- TRUE
      }
      month_tables[[i]] <- month_rows
    }

    # Random pause of up to one second between requests.
    Sys.sleep(runif(1, 0, 1))
  }

  ds <- do.call(rbind, month_tables)
  if (is.null(ds)) {
    # Nothing could be fetched at all: still return the five expected columns.
    ds <- matrix(character(0), ncol = 5)
  }
  ds1 <- as.data.frame(ds)
  names(ds1) <- c("date", "max", "min", "weather", "wind")

  list(
    "data" = ds1,
    "failed" = if (any(is_failed)) year_month[is_failed] else NULL
  )
}

#### Example

Get weather data for Mianyang from Jan 2011 to Oct 2011

In [13]:
# Fetch the monthly tables for Mianyang, January through October 2011.
result <- get_weather_data(
  start_date = "2011-1",
  end_date = "2011-10",
  city = "mianyang"
)

#### View data

In [15]:
# Show the first six rows of the downloaded table.
head(result[["data"]], n = 6)

Unnamed: 0_level_0,date,max,min,weather,wind
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<fct>
1,2011-01-01,4℃,0℃,阴~阵雨,北风 微风
2,2011-01-02,5℃,1℃,阴~阵雨,北风~旋转风 微风
3,2011-01-03,6℃,1℃,阵雨,旋转风 微风
4,2011-01-04,6℃,1℃,阵雨,旋转风~北风 微风
5,2011-01-05,3℃,1℃,阵雨,北风~旋转风 微风
6,2011-01-06,4℃,1℃,多云,旋转风 微风


#### View the months (YYYYMM) for which data retrieval failed

In [17]:
# Print the months that could not be retrieved (NULL when there were none).
result[["failed"]]

NULL