forked from KimmoVehkalahti/IODS-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_human.R
98 lines (79 loc) · 4.08 KB
/
create_human.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Create a dataset of human development indicators
# Petteri Karsisto, 2020
#
# Original data from Human Development Reports by United Nations Development Programme
# See: http://hdr.undp.org/en/content/human-development-index-hdi
# The datasets used here were provided by the IODS lecturers
library(dplyr)
library(stringr)
# Read the "Human development" and "Gender inequality" datasets.
# stringsAsFactors is spelled out as FALSE rather than F: F is an ordinary
# variable that can be reassigned, FALSE is a reserved word.
hd <- read.csv(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/human_development.csv",
  stringsAsFactors = FALSE
)
# In the gender inequality file the string ".." is read as a missing value (NA)
gii <- read.csv(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/gender_inequality.csv",
  stringsAsFactors = FALSE,
  na.strings = ".."
)
# Exploration: print dimensions, structure and summary for both datasets
# (the bare calls rely on top-level auto-printing to show their results)
# -- "Human development" data
dim(hd)
str(hd)
summary(hd)
# -- "Gender inequality" data
dim(gii)
str(gii)
summary(gii)
# Shorter column names for both datasets (names modified in week 5).
# "Country" is deliberately absent from both vectors: it is the second
# column of each dataset and keeps its original name.
new_hd_colnames <- c(
  "HDI.rank", "HDI", "exp.life.years", "exp.edu.years",
  "mean.edu.years", "GNI", "GNIRankMinusHDIRank"
)
new_gii_colnames <- c(
  "GII.rank", "GII", "mat.mort.rate", "adol.birth.rate", "parl.F.ratio",
  "edu2.F", "edu2.M", "lab.F", "lab.M"
)
# Assign the new names to every column except the second one (Country)
names(hd)[-2] <- new_hd_colnames
names(gii)[-2] <- new_gii_colnames
# Add two derived variables to the "Gender inequality" data:
# - edu2.ratio: female / male ratio of population with secondary education
# - lab.ratio:  female / male ratio of labour force participation
gii <- gii %>%
  mutate(
    edu2.ratio = edu2.F / edu2.M,
    lab.ratio = lab.F / lab.M
  )
#str(gii) # Check that they exist
# Combine the two datasets, keeping only countries present in both
human <- inner_join(hd, gii, by = "Country")
#write.csv(human, "data/human.csv", row.names=FALSE) # No need to save at this point anymore
# Week 5 data wrangling starts here
# Using a combined dataset based on "Human development" and "Gender inequality" from above
# Print dimensions, structure and summary of the joined data
dim(human)
str(human)
summary(human)
# Structure of the joined data and the meaning of each variable:
# 'data.frame': 195 obs. of 19 variables:
# $ HDI.rank           : int : Rank of Human Development Index
# $ Country            : chr : Country name
# $ HDI                : num : Human Development Index
# $ exp.life.years     : num : Life expectancy at birth
# $ exp.edu.years      : num : Expected years in education
# $ mean.edu.years     : num : Average years in education
# $ GNI                : chr : Gross National Income per capita
# $ GNIRankMinusHDIRank: int : Difference of GNI rank and HDI rank
# $ GII.rank           : int : Rank of Gender Inequality Index
# $ GII                : num : Gender Inequality Index
# $ mat.mort.rate      : int : Maternal mortality rate
# $ adol.birth.rate    : num : Adolescent birth rate
# $ parl.F.ratio       : num : Ratio of females in the parliament
# $ edu2.F             : num : Fraction of females with secondary or higher education
# $ edu2.M             : num : Fraction of males with secondary or higher education
# $ lab.F              : num : Fraction of females in the work force
# $ lab.M              : num : Fraction of males in the work force
# $ edu2.ratio         : num : edu2.F / edu2.M
# $ lab.ratio          : num : lab.F / lab.M
# GNI has values as strings with a comma as thousands separator.
# Remove every comma (str_replace() would only remove the first one, so a
# value with two separators would turn into NA) and convert to number.
# fixed() matches the comma literally instead of as a regular expression.
human$GNI <- human$GNI %>%
  str_replace_all(fixed(","), "") %>%
  as.numeric()
# Keep only the columns needed for the analysis
to_keep <- c(
  "Country", "edu2.ratio", "lab.ratio", "exp.edu.years", "exp.life.years",
  "GNI", "mat.mort.rate", "adol.birth.rate", "parl.F.ratio"
)
# all_of() replaces the superseded one_of(); it raises an error if any of the
# listed columns is missing instead of continuing with a warning.
# select() is namespace-qualified so that another attached package's
# select() cannot be picked up instead.
human <- dplyr::select(human, all_of(to_keep))
# Drop rows that contain at least one NA element
human <- filter(human, complete.cases(human))
# Remove regions, which are the last 7 rows of the data
n_regions <- 7
# Print one row more than will be removed, to check that the comment above
# is correct: the first printed row should still be a country
tail(human, n_regions + 1)
# head() with a negative n drops the last n rows. Unlike indexing with
# 1:(nrow(human) - 7), it cannot count backwards when there are too few rows,
# and no helper variable named `last` (shadowing dplyr::last) is needed.
human <- head(human, -n_regions)
# Use the country names as row labels ...
rownames(human) <- human$Country
# ... and then remove the now redundant Country column
human[["Country"]] <- NULL
# Check result: 155 rows, 8 variables -- OK!
str(human)
# Save with row names, so that the country labels end up in the file
write.csv(human, file = "data/human.csv", row.names = TRUE)