### Data Analysis template

#### Goal: Analyse time series data for the number of people with a tertiary education in areas adjacent to UniSC

Sunshine Coast SA4 - 316
Moreton Bay - North SA4 - 313
Moreton Bay - South SA4 - 314
Wide Bay SA4 - 319

In [None]:
# Census DataPack selection: census year, pack type, geography level and area.
c_year <- "2021"
c_pack <- "TSP"
c_geo <- "SA4"
c_area <- "QLD"

# Working directory name derived from the selection, e.g. "Census_2021_TSP_SA4_QLD".
directory <- paste(
  c("Census", c_year, c_pack, c_geo, c_area),
  collapse = "_"
)

In [None]:
#' Install (when missing) and attach a set of packages.
#'
#' @param packages Character vector of package names. The single value
#'   "required" (the default) expands to the packages this notebook uses.
#' @return Invisibly `NULL`; called for its side effects (installing any
#'   missing packages and attaching all requested ones).
pkgLoad <- function( packages = "required" ) {

    if( length( packages ) == 1L && packages == "required" ) {
        packages <- c( "readxl", "tidyverse", "openssl", "dplyr", "data.table",
                       "here", "httr","purrr"
        )
    }

    # Look each package up directly. utils::installed.packages() scans every
    # library and its documentation advises against using it for this check.
    is_installed <- vapply(
        packages,
        function( pkg ) nzchar( system.file( package = pkg ) ),
        logical( 1 )
    )

    packagestoinstall <- packages[ !is_installed ]

    if( length( packagestoinstall ) > 0L ) {
        # Use the HTTPS endpoint of the mirror so packages are not fetched
        # over an unencrypted connection.
        utils::install.packages( packagestoinstall,
                             repos = "https://cran.csiro.au"
        )
    } else {
        print( "All requested packages already installed" )
    }

    # Attach every requested package; library() errors if one is still missing.
    for( package in packages ) {
        suppressPackageStartupMessages(
            library( package, character.only = TRUE, quietly = TRUE )
        )
    }

    invisible( NULL )
}

In [None]:
# Extract the archive at `zip_path` into the directory `dest_path`.
# Thin wrapper so callers do not depend on unzip()'s argument names.
unzip_file <- function(zip_path, dest_path) {
  unzip(zipfile = zip_path, exdir = dest_path)
}

In [None]:
# Download `url` to `dest_path/file_name`; if the saved file is a .zip archive,
# extract it into `extract_path` and delete the downloaded archive.
#
# url          Address of the file to fetch.
# dest_path    Directory the download is written to (created when missing).
# extract_path Directory a .zip download is extracted into.
# file_name    Name given to the downloaded file inside `dest_path`.
#
# Stops with an error when the server answers with an HTTP error status.
download_file <- function(url, dest_path, extract_path, file_name){
  # Ensure the destination directory exists
  if (!dir.exists(dest_path)) {
    dir.create(dest_path, recursive = TRUE)
  }

  full_dest_path <- file.path(dest_path, file_name)
  response <- GET(url, write_disk(full_dest_path, overwrite = TRUE))

  # On an HTTP error the file on disk holds the error body, not the data:
  # remove it and fail loudly instead of trying to unzip it.
  if (http_error(response)) {
    file.remove(full_dest_path)
    stop_for_status(response)
  }

  # The dot is escaped so only a real ".zip" extension matches.
  if (grepl("\\.zip$", full_dest_path)){
    unzip_file(full_dest_path, extract_path)
    file.remove(full_dest_path)
  }
}

In [None]:
# Download and extract one ABS Census DataPack (short-header variant).
#
# c_year, c_pack, c_geo, c_area  Components of the DataPack file name, e.g.
#                                "2021", "TSP", "SA4", "QLD".
# dest_path     Directory the zip is downloaded to.
# extract_path  Directory the zip is extracted into.
#
# Prints and returns "Success" once download_file() has returned.
download_census_data <- function(c_year, c_pack, c_geo, c_area, dest_path, extract_path){
  # Build the file name once; the download URL ends with the same name.
  file_name <- paste0(c_year, "_", c_pack, "_", c_geo, "_for_", c_area, "_short-header.zip")
  url <- paste0("https://www.abs.gov.au/census/find-census-data/datapacks/download/", file_name)
  download_file(url, dest_path, extract_path, file_name)
  print("Success")
}

In [None]:
# Download every file listed in an Excel workbook.
#
# excel_path    Workbook whose first sheet is read with its first row skipped;
#               the first two columns are taken as `url` and `file_name`.
# dest_path     Directory each file is downloaded to.
# extract_path  Directory zip downloads are extracted into.
url_from_excel <- function(excel_path, dest_path, extract_path){
  urls_df <- read_excel(excel_path, skip = 1, col_names = c("url", "file_name"))
  # seq_len() yields an empty sequence for a sheet with no rows, whereas
  # 1:nrow() would iterate over c(1, 0) and request a download for NA.
  for (i in seq_len(nrow(urls_df))) {
    download_file(urls_df$url[i], dest_path, extract_path, urls_df$file_name[i])
  }
}

In [None]:
# Read every CSV whose file name contains `search_value` and stack the results.
#
# search_value  Literal (non-regex, case-sensitive) text to look for.
# table_index   Table whose `V1` column holds the file names to search.
# datafiles     Directory that the file names in `table_index$V1` live in.
#
# Returns the row-bound contents of all matching CSV files.
retrieve_table <- function(search_value, table_index, datafiles) {
    file_names <- table_index$V1
    is_match <- grepl(search_value, file_names, fixed = TRUE)
    csv_paths <- file.path(datafiles, file_names[is_match])

    map_dfr(csv_paths, read.csv)
}

In [None]:
# Find rows of `table_desc` whose second column contains `search_value`
# (literal, case-sensitive match) and return columns 2 and 3 of those rows.
search_long <- function(search_value, table_desc) {
  is_match <- grepl(search_value, table_desc[, 2], fixed = TRUE)
  table_desc[is_match, 2:3]
}

# Find rows of `table_desc` whose third column contains `search_value` after
# its spaces are replaced by underscores (literal, case-sensitive match), and
# return columns 2 and 3 of those rows.
search_short <- function(search_value, table_desc) {
  needle <- gsub(" ", "_", search_value, fixed = TRUE)
  is_match <- grepl(needle, table_desc[, 3], fixed = TRUE)
  table_desc[is_match, 2:3]
}

In [None]:
# Install (if needed) and attach the default package set defined in pkgLoad().
pkgLoad()

In [None]:
# Project folder layout, rooted at the project directory resolved by here().
base_path <- here(directory)
dest_path <- here(directory, "Download")
extract_path <- here(directory, "Input")
output_path <- here(directory, "Output")

# Create directories if they do not exist. recursive = TRUE also creates
# base_path, so no separate step is needed for it.
paths <- list(dest_path, extract_path, output_path)
invisible(lapply(paths, function(path) {
  if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
  }
}))

download_census_data(c_year, c_pack, c_geo, c_area, dest_path, extract_path)

# Locate the two metadata workbooks inside the extracted "Metadata" folder.
extract_path <- here(directory, "Input", "Metadata")
files <- list.files(extract_path, full.names = TRUE) # Get full paths
# Match on the file name only, so a parent directory that happens to contain
# "geog" or "DataPack" cannot make every file match.
geog_def_source <- files[grepl("geog", basename(files))]
data_def_source <- files[grepl("DataPack", basename(files))]

# Each workbook is later read as a single path, so fail here with a clear
# message when a match is missing or ambiguous.
if (length(geog_def_source) != 1L || length(data_def_source) != 1L) {
  stop(
    "Expected exactly one 'geog' and one 'DataPack' metadata file in ",
    extract_path,
    call. = FALSE
  )
}

In [None]:
# Load every worksheet of the workbook `file` and return the sheets as a list
# of data frames named after their worksheets.
read_excel_sheets <- function(file) {
  sheet_names <- excel_sheets(file)
  sheets <- lapply(sheet_names, function(sheet_name) {
    read_excel(file, sheet = sheet_name)
  })
  names(sheets) <- sheet_names
  sheets
}
# Rename each column after the first non-NA value found in that column.
#
# df  A data frame whose header text sits in the data rows.
#
# Returns `df` with renamed columns. A column containing only NA keeps its
# current name, and a data frame without columns is returned unchanged.
setColumns <- function(df) {
  # seq_len() gives an empty sequence for zero columns; 1:ncol(df) would
  # yield c(1, 0) and fail on the first lookup.
  for (col in seq_len(ncol(df))) {
    values <- df[[col]]

    # Position of the first non-NA value (NA when the column is all NA)
    first_non_na <- which(!is.na(values))[1]

    if (!is.na(first_non_na)) {
      colnames(df)[col] <- as.character(values[[first_non_na]])
    }
  }
  df
}

# Read every sheet of the two metadata workbooks into named lists of data frames.
geog_def <- read_excel_sheets(geog_def_source)
data_def <- read_excel_sheets(data_def_source)

# Split every data frame in `df_list` by the values of its first column and
# pool the pieces across data frames.
#
# df_list  Named list of data frames; each is converted with setDT() first.
#
# Returns a list with one table per distinct first-column value. When the same
# value occurs in more than one input, the pieces are row-bound and duplicate
# rows are dropped.
split_data_frames <- function(df_list) {
  by_category <- list()

  for (sheet_name in names(df_list)) {
    sheet <- df_list[[sheet_name]]
    print(paste("Processing data frame:", sheet_name))

    # split() below is applied to a data.table
    setDT(sheet)

    # One piece per distinct value of the first column
    pieces <- split(sheet, sheet[[1]])

    for (category in names(pieces)) {
      piece <- pieces[[category]]

      # First occurrence of this category: store the piece as-is
      if (!(category %in% names(by_category))) {
        by_category[[category]] <- piece
        next
      }

      # Seen before: append the new rows and keep only unique rows
      by_category[[category]] <- unique(rbind(by_category[[category]], piece))
    }
  }

  by_category
}

# Regroup the geography definition sheets by the values of their first column.
split_list_of_dfs <- split_data_frames(geog_def)

# Clean the first two tables of the data-definition list.
#
# For each of the two tables: keep only rows with a value in the second
# column, rename the columns via setColumns(), then drop the first remaining
# row. Any further list elements are returned untouched.
data_def_process <- function(data_def) {
  for (sheet in 1:2) {
    tbl <- data_def[[sheet]]
    tbl <- tbl[!is.na(tbl[,2]),]
    tbl <- setColumns(tbl)
    data_def[[sheet]] <- tbl[-1,]
  }
  data_def
}
# Replace the raw data-definition sheets with their cleaned versions.
data_def <- data_def_process(data_def)

In [None]:
# Export the two data-definition tables as separate CSVs
output_path <- here(directory, "Output", "Index")
if (!dir.exists(output_path)) {
  dir.create(output_path, recursive = TRUE)
}

# [[ ]] extracts the table itself. data_def[1] is a one-element list, which
# write.csv() would turn into a data frame whose column headers are all
# prefixed with the sheet name.
write.csv(data_def[[1]], file.path(output_path, "data_def_tables.csv"), row.names = FALSE)
write.csv(data_def[[2]], file.path(output_path, "data_def_columns.csv"), row.names = FALSE)
# Remember where the index CSVs live; later cells read them back from here.
meta_data <- here(directory, "Output", "Index")

# Write one CSV per geography category
output_path <- here(directory, "Output", "ASGS")
if (!dir.exists(output_path)) {
  dir.create(output_path, recursive = TRUE)
}
for (name in names(split_list_of_dfs)) {
  df <- split_list_of_dfs[[name]]
  write.csv(df, file.path(output_path, paste0(name, ".csv")), row.names = FALSE)
}

# Re-read each CSV and add an AGSS_Key column holding the MD5 hash of the
# AGSS_Code_2021 value (as text), then overwrite the file.
files <- list.files(output_path, pattern = "\\.csv$", full.names = TRUE)
for (file in files) {
  df <- fread(file)
  df$AGSS_Key <- openssl::md5(as.character(df$AGSS_Code_2021))
  write.csv(df, file, row.names = FALSE)
}

In [None]:
# Display the current output_path, then repoint dest_path and extract_path.
# NOTE(review): dest_path now refers to the "Input" folder and extract_path to
# "Output", unlike their use during the download step - confirm this is intended.
output_path
dest_path <- here(directory, "Input")
extract_path <- here(directory, "Output")

In [None]:
# List the entries directly under dest_path (full paths) and show them.
files <- list.files(dest_path, full.names = TRUE)
print(files)

In [None]:
# Display the two paths for a visual check.
dest_path
extract_path

In [None]:
# Use the first listed entry as the data folder.
# NOTE(review): relies on the listing order of dest_path - confirm the first
# entry is the folder holding the table CSVs.
datafiles <- files[1]

In [None]:
# Index of the file names inside the data folder (single column, V1).
table_index <- as.data.table((list.files(datafiles)))
# Table definitions exported earlier to the index folder.
table_verbose <- read.csv(file.path(meta_data, "data_def_tables.csv"))

In [None]:
# Display the table definitions.
table_verbose

In [None]:
# Load and stack every CSV whose file name contains "T31".
df <- retrieve_table("T31",table_index, datafiles)

In [None]:
# Preview the first rows of the combined table.
head(df)

In [None]:
# Keep only the four SA4 regions of interest (codes 316, 313, 314, 319).
df <- df %>% filter(SA4_CODE_2021 %in% c(316, 313, 314, 319))
head(df)

In [None]:
# Column definitions exported earlier to the index folder.
table_desc <- read.csv(file.path(meta_data, "data_def_columns.csv"))
# Look up rows whose 2nd column contains the given text.
search_long("C21_P_Tot_15_19", table_desc)
# Look up rows whose 3rd column contains "Age_15" (spaces become underscores).
search_short("Age 15", table_desc)