In [1]:
library(dplyr)
library(httr)
library(glue)
library(rvest)
library(stringr)
library(data.table)
library(tidyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘glue’


The following object is masked from ‘package:dplyr’:

    collapse


Loading required package: xml2


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




In [2]:
# First and last year (inclusive) of the period to download, kept as strings.
start_period <- "2016"
end_period <- "2020"

In [3]:
#' Parse comma-separated text lines into a data frame of selected columns.
#'
#' Apostrophes are replaced with spaces before parsing so that `read.table()`
#' does not interpret them as quote characters.
#'
#' @param data Character vector of comma-separated lines.
#' @param sc Indices of the columns to keep.
#' @param h Column names assigned to the kept columns (one per entry of `sc`).
#' @param cat Value stored in the added `Categoria` column.
#' @param year Value stored in the added `Anno` column.
#' @param month Value stored in the added `Mese` column.
#' @return A data frame holding the selected columns, named after `h`,
#'   followed by the `Categoria`, `Anno` and `Mese` columns.
parsing_df <- function(data, sc, h, cat = NA, year = NA, month = NA) {

    data <- gsub("'", " ", data, fixed = TRUE)

    df <- read.table(text = data, sep = ",", stringsAsFactors = FALSE)
    # drop = FALSE keeps a data frame even when only one column is selected;
    # otherwise the result is a bare vector and `colnames<-` fails.
    df <- df[, sc, drop = FALSE]

    colnames(df) <- h

    df["Categoria"] <- cat
    df["Anno"] <- year
    df["Mese"] <- month

    df
}

In [4]:
#' Turn the raw lines of a downloaded table export into one data frame.
#'
#' Empty lines are dropped. Lines starting with a digit are treated as data
#' rows; the remaining lines whose text contains "Totale", "Maschi" or
#' "Femmine" are treated as section headings. The rows following each heading
#' are parsed with `parsing_df()` and the sections are stacked.
#'
#' @param data Character vector with the lines of the export.
#' @param sc Indices of the columns to keep (also used to pick the header
#'   names from the third non-empty line).
#' @param y Value stored in the `Anno` column of the result.
#' @param m Value stored in the `Mese` column of the result.
#' @return A data frame with the selected columns plus `Categoria`, `Anno`
#'   and `Mese`, with default row names.
clear_content <- function(data, sc = c(1:3, 11), y = NA, m = NA) {

    data <- data[data != ""]

    # Computed once: TRUE for data rows (lines that start with a digit).
    is_row <- str_detect(data, "^\\d+")

    if (length(data) < 3 || !any(is_row, na.rm = TRUE)) {
        stop("clear_content(): input contains no data rows to parse",
             call. = FALSE)
    }

    # Rows per section. NOTE(review): dividing by 3 assumes the export holds
    # exactly three sections of equal length - confirm against the source.
    len_data <- sum(is_row) / 3

    # Column names come from the third non-empty line.
    header <- read.table(text = data[3], sep = ",",
                         stringsAsFactors = FALSE)[1, sc] %>%
        as.character()

    # One record per section heading: the category is the trailing word of
    # the heading and the section starts on the line right after it.
    df_ix <- data.frame(ix_start = which(!is_row) + 1,
                        h = data[!is_row],
                        stringsAsFactors = FALSE) %>%
        filter(str_detect(pattern = "Totale|Maschi|Femmine", string = h)) %>%
        mutate(categoria = str_extract(h, "[A-Za-z]+$"),
               ix_stop = ix_start + len_data - 1) %>%
        select(categoria, ix_start, ix_stop)

    if (nrow(df_ix) == 0) {
        stop("clear_content(): no 'Totale', 'Maschi' or 'Femmine' ",
             "section heading found", call. = FALSE)
    }

    data_list <- list()

    # seq_len() is safe for zero rows, unlike 1:nrow().
    for (i in seq_len(nrow(df_ix))) {

        rows <- data[df_ix$ix_start[i]:df_ix$ix_stop[i]]

        data_list[[df_ix$categoria[i]]] <- parsing_df(rows,
                                                      sc,
                                                      header,
                                                      year = y,
                                                      month = m,
                                                      cat = df_ix$categoria[i])
    }

    aux_df <- do.call(rbind, data_list)
    # rbind() on a named list prefixes the row names with the list names.
    row.names(aux_df) <- NULL

    aux_df
}

## GET THE PROVINCES

In [5]:
# URL template of the page listing the provinces; `{y}` is filled in by glue().
get_province <- "http://demo.istat.it/bilmens{y}gen/query1.php?lingua=ita&allrp=1&periodo=1&submit=Tavola"

In [6]:
# URL template of the data download; glue() fills in `{y}`, `{pv}` and `{pr}`.
get_data <- "http://demo.istat.it/bilmens{y}gen/bild7b1.php?lingua=ita&allrp=4&Pro={pv}&periodo={pr}&submit=Salva"

In [7]:
# Month numbers 1 to 12, one download per month.
months <- seq(from = 1, to = 12, by = 1)

In [8]:
# Every year from start_period to end_period, both ends included.
years <- seq(
    from = as.integer(start_period),
    to = as.integer(end_period),
    by = 1
)

In [None]:
# For every year: read the list of provinces, download one table per province
# and month into data/, then merge that year's files into a single yearly file
# and delete the per-month files.

# fwrite() does not create missing directories.
dir.create("data", showWarnings = FALSE)

for (year in years) {

    cat(year, "\n")

    resp <- GET(glue(get_province, y = year))
    # Fail with the HTTP status instead of a confusing parsing error later on.
    stop_for_status(resp)

    tbl <- resp %>%
        content() %>%
        html_nodes("table") %>%
        html_table()

    # The second table of the page is the one that is used.
    df <- tbl[[2]]

    # Keep the values of the first column that contain digits.
    province <- df %>%
        filter(str_detect(X1, "\\d+")) %>%
        pull(X1) %>%
        unique()

    for (prov in province) {

        cat(prov, "\n")

        for (month in months) {

            to_read_url <- glue(get_data, y = year, pv = prov, pr = month)

            out_file <- glue("data/pop_{pr}_{y}_{m}.csv",
                             pr = str_pad(prov, 5, "left", "0"),
                             y = year,
                             m = str_pad(month, 2, "left", "0"))

            fwrite(clear_content(readLines(to_read_url), y = year, m = month),
                   out_file)

            # Pause between requests.
            Sys.sleep(5)
        }

    }

    # `pattern` is a regular expression, not a glob. Restricting it to the
    # current year keeps files left behind by an interrupted earlier run for
    # other years out of this year's output.
    year_files <- dir("data",
                      full.names = TRUE,
                      pattern = sprintf("^pop_.*_%s_[0-9]+\\.csv$", year))

    full_pop_data <- rbindlist(lapply(year_files, fread), use.names = TRUE)

    fwrite(full_pop_data, glue("popolazione_italia_{y}", y = year))

    # file.remove() is vectorised over its arguments.
    file.remove(year_files)
}