Title: Analysing the prevalence of diabetes around the world and diabetic hospitalisation data in the US

Importing the libraries

In [76]:
library(tidyverse)
library(magrittr) # pipe operators such as %>%
library(purrr) # to work with lists and map functions
library(glue) # to paste / interpolate strings
library(stringr) # to handle strings
library(rvest) # to make scraping easier
# NOTE(review): none of the visible cells call anything from this package.
# CRAN's `politeness` analyses politeness markers in text; the rvest companion
# for well-behaved scraping is `polite` - confirm which one was intended.
library(politeness)
library(htmltab) # NOTE(review): not used in the visible cells - confirm it is needed
library(dplyr)
library(tidyr)

In [77]:
# WHO page whose table is scraped first (the Africa region table below)
url_titles <- "http://www.who.int/diabetes/facts/world_figures/en/index1.html"
# Download and parse the page into an xml_document
page_title <- url_titles %>% read_html()

In [78]:
# Inspect the parsed page: underlying storage type, then a compact summary
typeof(page_title)
glimpse(page_title)

List of 2
 $ node:<externalptr> 
 $ doc :<externalptr> 
 - attr(*, "class")= chr [1:2] "xml_document" "xml_node"


In [79]:
# Print the tag tree of the parsed page to locate the data table
html_structure(page_title)

<html>
  <head>
    <meta [http-equiv, content]>
    <meta [http-equiv, content]>
    <meta [name, content]>
    <script [src, type]>
    <script [src, type]>
    <title>
      {text}
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <meta [name, content]>
    <script [type]>
      {cdata}
    <link [rel, href]>
    <link [rel, href]>
    <script [type, src]>
    <script [type, src]>
    {comment}
    <script [type, src]>
    <script [type, src]>
    <script [type, src]>
    <script [type, src]>
    <link [href, rel, type]>
    {comment}
    {comment}
    {comment}
    {comment}
    {comment}
    {comment}
    <link [rel, type, href]>
    <script [type, src]>
    <link [rel, type, href]>
    <script [type, src]>
    <script [type]>
      {cdata}
    {comment}
    <s

Scrape the table for AFRICA REGION

In [80]:
# Select every <table class="tableData"> on the Africa page (CSS selector) and
# parse it, treating the first row as the column headers.
# TRUE is spelled out because `T` is an ordinary variable that can be rebound.
africa_table <- page_title %>%
  html_nodes("table.tableData") %>%
  html_table(header = TRUE)

In [81]:
# Display the scraped result to confirm the expected table was captured
africa_table

Country,2000,2030
Algeria,426000,1203000
Angola,51000,140000
Benin,87000,266000
Botswana,25000,45000
Burkina Faso,124000,388000
Burundi,26000,72000
Cameroon,70000,171000
Cape Verde,7000,24000
Central African Republic,18000,38000
Chad,97000,269000


Convert the scraped table into a data frame named africa_df

In [82]:
# Coerce the scraped Africa result into a plain data frame.
# The argument must be `africa_table`; passing `table` hands as.data.frame()
# the base R function of that name, which fails in a fresh session.
# as.data.frame() makes the year headers syntactic, so they become X2000/X2030.
africa_df <- as.data.frame(africa_table)

In [83]:
# Preview the first rows of the Africa data frame
head(africa_df)

Country,X2000,X2030
Algeria,426000,1203000
Angola,51000,140000
Benin,87000,266000
Botswana,25000,45000
Burkina Faso,124000,388000
Burundi,26000,72000


Renaming the columns to the desired names

In [84]:
# Keep only the country name and the two year columns, in this order
columns_wanted <- c("Country", "X2000", "X2030")
africa_df <- africa_df[, columns_wanted, drop = FALSE]

In [85]:
# Give the columns their final labels (plain years, upper-case country header)
names(africa_df) <- c("COUNTRY", "2000", "2030")

In [86]:
# Preview the first five rows after renaming
head(africa_df, n = 5)

COUNTRY,2000,2030
Algeria,426000,1203000
Angola,51000,140000
Benin,87000,266000
Botswana,25000,45000
Burkina Faso,124000,388000


Scrape the table for EASTERN MEDITERRANEAN REGION

In [95]:
# Scrape the Eastern Mediterranean table. The table is located with the CSS
# selector "table.tableData" (not an XPath expression).
url_titles_2 <- "http://www.who.int/diabetes/facts/world_figures/en/index2.html" # WHO page for this region
page_title_2 <- read_html(url_titles_2)
# Parse the matched table(s), treating the first row as column headers.
# TRUE is spelled out because `T` is an ordinary variable that can be rebound.
east_med_table <- page_title_2 %>%
  html_nodes("table.tableData") %>%
  html_table(header = TRUE)
east_med_table

Country,2000,2030
Afghanistan,468000,1403000
Bahrain,37000,99000
Cyprus,50000,87000
Djibouti,7000,9000
Egypt,2623000,6726000
Islamic Republic of Iran,2103000,6421000
Iraq,668000,2009000
Jordan,195000,680000
Kuwait,104000,319000
Lebanon,146000,378000


In [91]:
# Turn the scraped result into a data frame, then preview the first rows
east_med_df <- east_med_table %>% as.data.frame()
head(east_med_df)

In [93]:
# Keep the country and year columns, relabel them, and preview the result
columns_wanted <- c("Country", "X2000", "X2030")
east_med_df <- east_med_df[, columns_wanted, drop = FALSE]
names(east_med_df) <- c("COUNTRY", "2000", "2030")
head(east_med_df)

COUNTRY,2000,2030
Afghanistan,468000,1403000
Bahrain,37000,99000
Cyprus,50000,87000
Djibouti,7000,9000
Egypt,2623000,6726000
Islamic Republic of Iran,2103000,6421000


Scrape the table for AMERICAS REGION

In [94]:
# Scrape the Americas table. The table is located with the CSS selector
# "table.tableData" (not an XPath expression).
url_titles_3 <- "http://www.who.int/diabetes/facts/world_figures/en/index3.html" # WHO page for this region
page_title_3 <- read_html(url_titles_3)
# Parse the matched table(s), treating the first row as column headers.
# TRUE is spelled out because `T` is an ordinary variable that can be rebound.
americas_table <- page_title_3 %>%
  html_nodes("table.tableData") %>%
  html_table(header = TRUE)
americas_table

Country,2000,2030
Antigua and Barbuda,3000,5000
Argentina,1426000,2457000
Bahamas,12000,26000
Barbados,11000,22000
Belize,5000,15000
Bolivia,207000,562000
Brazil,4553000,11305000
Canada,2006000,3543000
Chile,495000,1047000
Colombia,883000,2425000


In [98]:
# Turn the scraped result into a data frame, then preview the first rows
americas_df <- americas_table %>% as.data.frame()
head(americas_df)

Country,X2000,X2030
Antigua and Barbuda,3000,5000
Argentina,1426000,2457000
Bahamas,12000,26000
Barbados,11000,22000
Belize,5000,15000
Bolivia,207000,562000


In [99]:
# Keep the country and year columns, relabel them, and preview the result
columns_wanted <- c("Country", "X2000", "X2030")
americas_df <- americas_df[, columns_wanted, drop = FALSE]
names(americas_df) <- c("COUNTRY", "2000", "2030")
head(americas_df)

COUNTRY,2000,2030
Antigua and Barbuda,3000,5000
Argentina,1426000,2457000
Bahamas,12000,26000
Barbados,11000,22000
Belize,5000,15000
Bolivia,207000,562000


Scrape the table for ASIA REGION (WHO South-East Asia Region)

In [100]:
# Scrape the Asia table. The table is located with the CSS selector
# "table.tableData" (not an XPath expression).
# NOTE(review): this cell reads index5.html while the previous region used
# index3.html, so index4.html is never scraped - confirm the gap is intended.
url_titles_4 <- "http://www.who.int/diabetes/facts/world_figures/en/index5.html" # WHO page for this region
page_title_4 <- read_html(url_titles_4)
# Parse the matched table(s), treating the first row as column headers.
# TRUE is spelled out because `T` is an ordinary variable that can be rebound.
asia_table <- page_title_4 %>%
  html_nodes("table.tableData") %>%
  html_table(header = TRUE)
asia_table

Country,2000,2030
Bangladesh,3196000,11140000
Bhutan,35000,109000
Dem. People's Rep. of Korea,367000,635000
India,31705000,79441000
Indonesia,8426000,21257000
Maldives,6000,25000
Myanmar,543000,1330000
Nepal,436000,1328000
Sri Lanka,653000,1537000
Thailand,1536000,2739000


In [103]:
# Turn the scraped result into a data frame, then preview the first rows
asia_df <- asia_table %>% as.data.frame()
head(asia_df)

Country,X2000,X2030
Bangladesh,3196000,11140000
Bhutan,35000,109000
Dem. People's Rep. of Korea,367000,635000
India,31705000,79441000
Indonesia,8426000,21257000
Maldives,6000,25000


In [104]:
# Keep the country and year columns, relabel them, and preview the result
columns_wanted <- c("Country", "X2000", "X2030")
asia_df <- asia_df[, columns_wanted, drop = FALSE]
names(asia_df) <- c("COUNTRY", "2000", "2030")
head(asia_df)

COUNTRY,2000,2030
Bangladesh,3196000,11140000
Bhutan,35000,109000
Dem. People's Rep. of Korea,367000,635000
India,31705000,79441000
Indonesia,8426000,21257000
Maldives,6000,25000


Scraping the table for WESTERN PACIFIC REGION

In [105]:
# Scrape the Western Pacific table. The table is located with the CSS selector
# "table.tableData" (not an XPath expression).
url_titles_5 <- "http://www.who.int/diabetes/facts/world_figures/en/index6.html" # WHO page for this region
page_title_5 <- read_html(url_titles_5)
# Parse the matched table(s), treating the first row as column headers.
# TRUE is spelled out because `T` is an ordinary variable that can be rebound.
west_table <- page_title_5 %>%
  html_nodes("table.tableData") %>%
  html_table(header = TRUE)
west_table

Country,2000,2030
Australia,941000,1673000
Brunei Darussalam,18000,49000
Cambodia,110000,317000
China,20757000,42321000
Cook Islands,700,1300
Fiji,37000,72000
Japan,6765000,8914000
Kiribati,4000,7000
Lao People's Dem. Rep.,46000,128000
Malaysia,942000,2479000


In [106]:
# Turn the scraped result into a data frame, then preview the first rows
west_df <- west_table %>% as.data.frame()
head(west_df)

Country,X2000,X2030
Australia,941000,1673000
Brunei Darussalam,18000,49000
Cambodia,110000,317000
China,20757000,42321000
Cook Islands,700,1300
Fiji,37000,72000


In [107]:
# Keep the country and year columns, relabel them, and preview the result
columns_wanted <- c("Country", "X2000", "X2030")
west_df <- west_df[, columns_wanted, drop = FALSE]
names(west_df) <- c("COUNTRY", "2000", "2030")
head(west_df)

COUNTRY,2000,2030
Australia,941000,1673000
Brunei Darussalam,18000,49000
Cambodia,110000,317000
China,20757000,42321000
Cook Islands,700,1300
Fiji,37000,72000


Combine the regional data frames into one big data frame

In [108]:
# Collect the five regional data frames, in scraping order, into one list
countries_df <- list(africa_df, east_med_df, americas_df, asia_df, west_df)

In [109]:
# Stack the regional data frames row-wise into a single data frame
big_country_df <- countries_df %>% bind_rows()

In [110]:
# Display the combined data frame covering all scraped regions
big_country_df

COUNTRY,2000,2030
Algeria,426000,1203000
Angola,51000,140000
Benin,87000,266000
Botswana,25000,45000
Burkina Faso,124000,388000
Burundi,26000,72000
Cameroon,70000,171000
Cape Verde,7000,24000
Central African Republic,18000,38000
Chad,97000,269000
