# Downloading the Data


In [1]:
# Create project folders
#Function dir.created() is used to create filders on the file system, thus are folder is labeled Data and then within data we are creating two sub folders "raw" and "clean".
#showWarnings=False is a logical variable the suppresses warning messages if the directory already exists. This is important as it allows the script to run safely without stopping.
dir.create("data", showWarnings = FALSE)            #Creating the folder "data"
dir.create("data/raw", showWarnings = FALSE)        #Creating the subfolder "Raw"
dir.create("data/clean", showWarnings = FALSE)      # Creating the sub folder "clean"


In [2]:
# Confirm files are present
#The function list.files() is used to display the contents of a directory.
list.files("data/raw")                             # I downloaded and uploaded the files Infant Mortality, Life expectancy, and Material mortality ratio into this sub folder raw


# Maternal Mortality Ratio

In [3]:
# Read maternal mortality data
#This code reads the CSV file containing the maternal mortality ratio and stores it into a data frame
#Read.csv() is assigned to the variable maternal_raw, using the <- which is used to indicate that the data is in its original uncleaned form.
#stringASFactors is a logical variable that prevents R from automatically converting text columns into factor variables. Very important because it's easier to clean and transform
maternal_raw <- read.csv("data/raw/Maternal mortality ratio.csv",#creating the data frame maternal_raw
                          stringsAsFactors = FALSE)

In [4]:
# Inspect structure
#We use the str() to display a compact summary of the internal structure of the object maternal_raw
str(maternal_raw)

'data.frame':	196 obs. of  7 variables:
 $ name               : chr  "Nigeria" "Chad" "South Sudan" "Central African Republic" ...
 $ slug               : chr  "nigeria" "chad" "south-sudan" "central-african-republic" ...
 $ deaths.100         : int  993 748 692 692 628 563 521 518 505 494 ...
 $ X000.live.births   : int  2023 2023 2023 2023 2023 2023 2023 2023 2023 2023 ...
 $ date_of_information: int  1 2 3 4 5 6 7 8 9 10 ...
 $ ranking            : chr  "Africa" "Africa" "Africa" "Africa" ...
 $ region             : logi  NA NA NA NA NA NA ...


In [5]:
# View first few rows
#It returns the fist six observations, and helps to confirm that the data loaded correctly
#Allows to see the column names, verify the values, and spot obvious issues.
head(maternal_raw)

Unnamed: 0_level_0,name,slug,deaths.100,X000.live.births,date_of_information,ranking,region
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<int>,<chr>,<lgl>
1,Nigeria,nigeria,993,2023,1,Africa,
2,Chad,chad,748,2023,2,Africa,
3,South Sudan,south-sudan,692,2023,3,Africa,
4,Central African Republic,central-african-republic,692,2023,4,Africa,
5,Liberia,liberia,628,2023,5,Africa,
6,Somalia,somalia,563,2023,6,Africa,


In [6]:
# Select only the required columns and rename them
#We are taking the dataset maternal_raw and selecting the columns we need for the assignmnet
# We are then renaming the columns by extracting the the columns from maternal_raw
maternal_clean <- data.frame(           #Creating the datafram maternal_clean
  country = maternal_raw$name,          # create the column country by extracting the name column
  variable = maternal_raw$deaths.100,   # create the variable column using the column deaths.100
  region = maternal_raw$ranking,        # create the variable region using the column ranking
  stringsAsFactors = FALSE              # Ensures that text variables remain character vectors and not automatically converting to other factors
)

In [7]:
# Confirm structure after cleaning
#Looking at the data system using str() to verify
str(maternal_clean) #verify the dataframe maternal_clean

'data.frame':	196 obs. of  3 variables:
 $ country : chr  "Nigeria" "Chad" "South Sudan" "Central African Republic" ...
 $ variable: int  993 748 692 692 628 563 521 518 505 494 ...
 $ region  : chr  "Africa" "Africa" "Africa" "Africa" ...


In [8]:
# View first few rows
#The function head() is used to display the first few rows of the cleaned data frame.
head(maternal_clean) # Can see clearly we have just three columns properly labeled.

Unnamed: 0_level_0,country,variable,region
Unnamed: 0_level_1,<chr>,<int>,<chr>
1,Nigeria,993,Africa
2,Chad,748,Africa
3,South Sudan,692,Africa
4,Central African Republic,692,Africa
5,Liberia,628,Africa
6,Somalia,563,Africa


In [9]:
# Ensure correct data types
#Sets the data types of the variables of the maternal clean data to ensure they are appropriate for analysis
#converst the columns using the as.character for text and the as.numeric as numeric to allow for mathematical operations.
maternal_clean$country <- as.character(maternal_clean$country)  # Ensures that country is Text
maternal_clean$variable <- as.numeric(maternal_clean$variable)  # Ensures variable is Numeric
maternal_clean$region <- as.character(maternal_clean$region)    # Ensures region is text

In [10]:
# Confirm structure to ensure the columns are the proper data types
str(maternal_clean)

'data.frame':	196 obs. of  3 variables:
 $ country : chr  "Nigeria" "Chad" "South Sudan" "Central African Republic" ...
 $ variable: num  993 748 692 692 628 563 521 518 505 494 ...
 $ region  : chr  "Africa" "Africa" "Africa" "Africa" ...


In [11]:
# Save cleaned maternal mortality data as CSV
# write.csv() used to export a data frame from R into a a CSV file.
write.csv(maternal_clean,                                       #Using the write.csv() we use the maternal_clean file which contains the finalized cleaned version of the dataset.
          "data/clean/maternal_mortality_clean.csv",            #Specifies the file path and the file name
          row.names = FALSE)                                    #row.names FALSE is a logical value that prvents R from writing row numbers as an extra column within the CSV file

In [12]:
# Save cleaned maternal mortality data as RDS
#The function saveRDS() writes a single R object to a file while preserving the exact structure and data types.
saveRDS(maternal_clean,                                          #Saving the maternal_clean in an RDS file.
        "data/clean/maternal_mortality_clean.rds")               #Providing the directory and the file name

In [42]:
# Read CSV back in
#This reads the cleaned maternal mortality dataset back into R from the CSV file and stores it in a new object.
maternal_csv <- read.csv("data/clean/maternal_mortality_clean.csv",# Provides the link to where the file is stored and creating the new object
                          stringsAsFactors = FALSE)                # Ensures that text columns are read as character vectors


In [43]:
# Read RDS back in
#This reads the cleaned maternal mortality dataset of RDS back into the R code and stores it in a new object
maternal_rds <- readRDS("data/clean/maternal_mortality_clean.rds") # Creating the object maternal_rds

In [15]:
# Verify structures
#Using string to verify that both data frames are correct before moving on to the next section.
str(maternal_csv) # Data Frame validation for the CSV file
str(maternal_rds) # Data Frame validation for the RDS file

'data.frame':	196 obs. of  3 variables:
 $ country : chr  "Nigeria" "Chad" "South Sudan" "Central African Republic" ...
 $ variable: int  993 748 692 692 628 563 521 518 505 494 ...
 $ region  : chr  "Africa" "Africa" "Africa" "Africa" ...
'data.frame':	196 obs. of  3 variables:
 $ country : chr  "Nigeria" "Chad" "South Sudan" "Central African Republic" ...
 $ variable: num  993 748 692 692 628 563 521 518 505 494 ...
 $ region  : chr  "Africa" "Africa" "Africa" "Africa" ...


# Infant Mortality Rate

In [17]:
# Read Infant mortality data
#This code reads the CSV file containing the Infant mortality rate and stores it into a data frame
#Read.csv() is assigned to the variable maternal_raw, using the <- which is used to indicate that the data is in its original uncleaned form.
#stringASFactors is a logical variable that prevents R from automatically converting text columns into factor variables. Very important because it's easier to clean and transform
Infant_raw <- read.csv("data/raw/Infant mortality rate.csv",#creating the data frame infant_raw
                          stringsAsFactors = FALSE)

In [18]:
# Inspect structure
#We use the str() to display a compact summary of the internal structure of the object Infant_raw
str(Infant_raw)

'data.frame':	227 obs. of  7 variables:
 $ name               : chr  "Somalia" "Central African Republic" "Equatorial Guinea" "Sierra Leone" ...
 $ slug               : chr  "somalia" "central-african-republic" "equatorial-guinea" "sierra-leone" ...
 $ deaths.1           : num  81.5 79.3 76.9 70.1 65.6 63 61.1 58.6 56.7 55.8 ...
 $ X000.live.births   : int  2025 2025 2025 2025 2025 2025 2025 2025 2025 2025 ...
 $ date_of_information: int  1 2 3 4 5 6 7 8 9 10 ...
 $ ranking            : chr  "Africa" "Africa" "Africa" "Africa" ...
 $ region             : logi  NA NA NA NA NA NA ...


In [19]:
# View first few rows
#It returns the fist six observations, and helps to confirm that the data loaded correctly
#Allows to see the column names, verify the values, and spot obvious issues.
head(Infant_raw)

Unnamed: 0_level_0,name,slug,deaths.1,X000.live.births,date_of_information,ranking,region
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<int>,<int>,<chr>,<lgl>
1,Somalia,somalia,81.5,2025,1,Africa,
2,Central African Republic,central-african-republic,79.3,2025,2,Africa,
3,Equatorial Guinea,equatorial-guinea,76.9,2025,3,Africa,
4,Sierra Leone,sierra-leone,70.1,2025,4,Africa,
5,Nigeria,nigeria,65.6,2025,5,Africa,
6,Niger,niger,63.0,2025,6,Africa,


In [23]:
# Create cleaned infant mortality dataset
#We are taking the dataset maternal_raw and selecting the columns we need for the assignmnet
# We are then renaming the columns by extracting the the columns from maternal_raw
infant_clean <- data.frame(         #Creating the datafram infant_clean
  country = Infant_raw$name,        # Creating the column country by extracting the name column
  variable = Infant_raw$deaths.1,   # Creating the variable column using the column deaths.1
  region = Infant_raw$ranking,      # Creating the variable region using the column rankings
  stringsAsFactors = FALSE          # Ensures that text variables remain character vectors and not automatically converting to other factors
)


In [24]:
# Ensure correct data types
# Sets the data types of the variable of the infant clean data to ensure they are appropriate for analysis
#Converts the columns using the as.character for text and the as numeric as numeric to allow for mathematical operations
infant_clean$country <- as.character(infant_clean$country)  # Ensures that country is Text
infant_clean$variable <- as.numeric(infant_clean$variable)  # Ensures variable is numeric
infant_clean$region <- as.character(infant_clean$region)    # Ensures region is text

In [25]:
# Confirm structure to ensure the columns are the proper data types
str(infant_clean)

'data.frame':	227 obs. of  3 variables:
 $ country : chr  "Somalia" "Central African Republic" "Equatorial Guinea" "Sierra Leone" ...
 $ variable: num  81.5 79.3 76.9 70.1 65.6 63 61.1 58.6 56.7 55.8 ...
 $ region  : chr  "Africa" "Africa" "Africa" "Africa" ...


In [26]:
# View first few rows
head(infant_clean)

Unnamed: 0_level_0,country,variable,region
Unnamed: 0_level_1,<chr>,<dbl>,<chr>
1,Somalia,81.5,Africa
2,Central African Republic,79.3,Africa
3,Equatorial Guinea,76.9,Africa
4,Sierra Leone,70.1,Africa
5,Nigeria,65.6,Africa
6,Niger,63.0,Africa


In [27]:
# Save cleaned infant mortality data as CSV
#write.csv() used to export a data frame from R into a CSV file
write.csv(infant_clean,                                #Using the write.csv() we use the infant_clean file which contains the finalized cleaned version of the dataset
          "data/clean/infant_mortality_clean.csv",     #Specifies the file path and the file name
          row.names = FALSE)                           #row.names False is a logical value that prevents R from writing row numbers as an extra column within the CSV file

In [28]:
# Save cleaned infant mortality data as RDS
# The function saveRDS() writes a single R object to a file while preserving the exact structure and data types
saveRDS(infant_clean,                                  #Saving the infant_clean in an RDS file
        "data/clean/infant_mortality_clean.rds")       #Providing the directory and the file name

# Life Expectancy At Birth

In [29]:
# Read Life Expectancy at Birth data
#This code reads the CSV file containing the Life Expectancy and stores it into a data frame
#Read.csv() is assigned to the variable maternal_raw, using the <- which is used to indicate that the data is in its original uncleaned form.
#StringASFactors is a logical variable that prevents R from automatically converting text columns into factor variables. Very important because it's easier to clean and transform
life_raw <- read.csv("data/raw/Life expectancy at birth.csv",    #Creating the data frame life_raw
                          stringsAsFactors = FALSE)

In [30]:
# Inspect structure
#We use the str() to display a compact summaryh of the internal structure of the object life_raw
str(life_raw)

'data.frame':	227 obs. of  6 variables:
 $ name               : chr  "Monaco" "Singapore" "Macau" "Japan" ...
 $ slug               : chr  "monaco" "singapore" "macau" "japan" ...
 $ years              : num  89.8 86.7 85.3 85.2 84.2 84.2 84 84 83.9 83.8 ...
 $ date_of_information: int  2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
 $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
 $ region             : chr  "Europe" "East and Southeast Asia" "East and Southeast Asia" "East and Southeast Asia" ...


In [31]:
# View first few rows
#Head() returns the first six observations, and helps to confirm that the data loaded correctly
#Allows to see the column names, verify the values, and spot obvious issues
head(life_raw)

Unnamed: 0_level_0,name,slug,years,date_of_information,ranking,region
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<int>,<int>,<chr>
1,Monaco,monaco,89.8,2024,1,Europe
2,Singapore,singapore,86.7,2024,2,East and Southeast Asia
3,Macau,macau,85.3,2024,3,East and Southeast Asia
4,Japan,japan,85.2,2024,4,East and Southeast Asia
5,Canada,canada,84.2,2024,5,North America
6,San Marino,san-marino,84.2,2024,6,Europe


In [45]:
# Create cleaned life expectancy dataset
#We are taking the dataset life_raw and selecting the columns we need for the assignment
# We are then renaming the columns by extracting the columns from life_raw
life_clean <- data.frame(             #Creating the dataframe life_raw
  country = life_raw$name,        # Creating the column country by extracting the name column
  variable = life_raw$years,   # Creating the variable column using the column years
  region = life_raw$region,      # Creating the variable region using the column Region
  stringsAsFactors = FALSE
)


In [46]:
# Ensure correct data types
# Sets the data types of the variable of the life clean data to ensure they are appropriate for analysis
#Converts the columns using the as.character for text and the numeric as numeric to allow for mathematical operations
life_clean$country <- as.character(life_clean$country)  # Ensures that country is text
life_clean$variable <- as.numeric(life_clean$variable)  # Ensures variable is Numeric
life_clean$region <- as.character(life_clean$region)    # Ensures region is text

In [47]:
# Confirm structure to ensure the columns are the proper data types
str(life_clean)

'data.frame':	227 obs. of  3 variables:
 $ country : chr  "Monaco" "Singapore" "Macau" "Japan" ...
 $ variable: num  89.8 86.7 85.3 85.2 84.2 84.2 84 84 83.9 83.8 ...
 $ region  : chr  "Europe" "East and Southeast Asia" "East and Southeast Asia" "East and Southeast Asia" ...


In [48]:
# View first few rows
head(life_clean)

Unnamed: 0_level_0,country,variable,region
Unnamed: 0_level_1,<chr>,<dbl>,<chr>
1,Monaco,89.8,Europe
2,Singapore,86.7,East and Southeast Asia
3,Macau,85.3,East and Southeast Asia
4,Japan,85.2,East and Southeast Asia
5,Canada,84.2,North America
6,San Marino,84.2,Europe


In [49]:
# Save cleaned Life Expectancy data as CSV
#write.csv() used to export a data frame from R into a CSV file
write.csv(life_clean,                       #Using the write.csv() we use the life_clean file which contains the finalized cleaned version of the dataset
          "data/clean/Life_Expectancy_clean.csv", # Specifies the file path and the file name
          row.names = FALSE)                      #row.names False is a logical value that prevents R from writing row numbers as an extra column within the CSV file

In [50]:
# Save cleaned Life Expectancy data as RDS
#The function saveRDS() writes a signle R object to a file while preserving the exact structure and data types
saveRDS(life_clean,                                #Saving the life_clean in an RDS_File
        "data/clean/Life_Expectancy_clean.rds")      #Providing the directory and the file name

# Verify Work

In [51]:
# List all cleaned files
list.files("data/clean")

In [52]:
# Read all cleaned datasets
maternal_final <- readRDS("data/clean/maternal_mortality_clean.rds")
infant_final <- readRDS("data/clean/infant_mortality_clean.rds")
life_final <- readRDS("data/clean/Life_Expectancy_clean.rds")

# Verify structures
str(maternal_final) #verify the maternal dataset
str(infant_final)   #verify infant mortality dataset
str(life_final)    #verify life expectancy dataset


'data.frame':	196 obs. of  3 variables:
 $ country : chr  "Nigeria" "Chad" "South Sudan" "Central African Republic" ...
 $ variable: num  993 748 692 692 628 563 521 518 505 494 ...
 $ region  : chr  "Africa" "Africa" "Africa" "Africa" ...
'data.frame':	227 obs. of  3 variables:
 $ country : chr  "Somalia" "Central African Republic" "Equatorial Guinea" "Sierra Leone" ...
 $ variable: num  81.5 79.3 76.9 70.1 65.6 63 61.1 58.6 56.7 55.8 ...
 $ region  : chr  "Africa" "Africa" "Africa" "Africa" ...
'data.frame':	227 obs. of  3 variables:
 $ country : chr  "Monaco" "Singapore" "Macau" "Japan" ...
 $ variable: num  89.8 86.7 85.3 85.2 84.2 84.2 84 84 83.9 83.8 ...
 $ region  : chr  "Europe" "East and Southeast Asia" "East and Southeast Asia" "East and Southeast Asia" ...
