# Data Ingenstion and Cleaning with Testing

In [56]:
library(data.table)
library(methods)

# load data
cells_df <- fread("/content/cells.csv")

# test 1 (if the bullets from instructions are 1,2, and 3): Ensure the file being read is not empty.
if (nrow(cells_df) == 0) {
  stop("Error: The data file is empty.")
}

# cell class with appropriate slots
setClass("Cell", slots = c(
  oem = "character",
  model = "character",
  launch_announced = "integer",
  launch_status = "character",
  body_dimensions = "character",
  body_weight = "numeric",
  body_sim = "character",
  display_type = "character",
  display_size = "numeric",
  display_resolution = "character",
  features_sensors = "character",
  platform_os = "character"
))

# replace "-" with NA
cells_df[, (names(cells_df)) := lapply(.SD, function(x) gsub("^-$", NA, x))]

# test 3: ensure all missing or "-" data is replaced with NA
if (any(sapply(cells_df, function(col) any(col == "-", na.rm = TRUE)))) {
  stop("Error: Not all '-' have been replaced with NA.")
}

# constructor method with initialization
setMethod("initialize", "Cell", function(.Object, oem, model, launch_announced, launch_status, body_dimensions,
                                         body_weight, body_sim, display_type, display_size, display_resolution,
                                         features_sensors, platform_os) {
    # assignment with type checking and coercion handling
    .Object@oem <- if (is.na(oem) || oem == "-") NA_character_ else as.character(oem)
    .Object@model <- if (is.na(model) || model == "-") NA_character_ else as.character(model)

    # handle potential NA values in launch_announced
    la_numeric <- suppressWarnings(as.integer(gsub("\\D", "", launch_announced)))
    .Object@launch_announced <- if (!is.na(la_numeric) && nchar(launch_announced) == 4) la_numeric else NA_integer_

    # handle potential NA values in launch_status
    .Object@launch_status <- if (is.na(launch_status) || launch_status == "-" || !grepl("^\\d{4}$", launch_status) && !launch_status %in% c("Discontinued", "Cancelled")) NA_character_ else as.character(launch_status)

    .Object@body_dimensions <- if (is.na(body_dimensions) || body_dimensions == "-") NA_character_ else as.character(body_dimensions)

    # extract numeric part for body_weight if it's followed by ' g'
    bw_numeric <- suppressWarnings(as.numeric(gsub("^[^0-9]*([0-9]+) g.*", "\\1", body_weight)))
    .Object@body_weight <- if (!is.na(bw_numeric)) bw_numeric else NA_real_

    .Object@body_sim <- if (is.na(body_sim) || body_sim %in% c("No", "-", "Yes")) NA_character_ else as.character(body_sim)

    .Object@display_type <- if (is.na(display_type) || display_type == "-") NA_character_ else as.character(display_type)

    # extract numeric part for display_size if it's followed by ' inches'
    ds_numeric <- suppressWarnings(as.numeric(gsub("^[^0-9]*([0-9]+\\.?[0-9]*) inches.*", "\\1", display_size)))
    .Object@display_size <- if (!is.na(ds_numeric)) ds_numeric else NA_real_

    .Object@display_resolution <- if (is.na(display_resolution) || display_resolution == "-") NA_character_ else as.character(display_resolution)

    .Object@features_sensors <- if (is.na(features_sensors) || features_sensors == "-" || is.numeric(features_sensors)) NA_character_ else as.character(features_sensors)

    # get everything up to the first comma for platform_os or the entire string if no comma
    .Object@platform_os <- if (is.na(platform_os) || platform_os == "-") NA_character_ else sub(",.*", "", platform_os)

    return(.Object)
})

# function to populate the environment with "Cell" objects
cells_env <- new.env(hash = TRUE) # new environment to store the Cell objects
populate_cells <- function(df) {
  for (i in seq_len(nrow(df))) {
    cell_obj <- new("Cell", oem = df$oem[i], model = df$model[i],
                     launch_announced = df$launch_announced[i], launch_status = df$launch_status[i],
                     body_dimensions = df$body_dimensions[i], body_weight = df$body_weight[i],
                     body_sim = df$body_sim[i], display_type = df$display_type[i],
                     display_size = df$display_size[i], display_resolution = df$display_resolution[i],
                     features_sensors = df$features_sensors[i], platform_os = df$platform_os[i])

    assign(df$model[i], cell_obj, envir = cells_env)
  }
}

# call (pun half intended) function to populate the environment
populate_cells(cells_df)

# test 2: ensure each column's final transformation matches its final form
all_cells <- ls(envir = cells_env)
for (cell_name in all_cells) {
    cell_obj <- get(cell_name, envir = cells_env)
    if (!is.numeric(cell_obj@display_size)) {
        stop(sprintf("Error: display_size in object %s is not properly transformed to numeric.", cell_name))
    }
}

# Methods with Exceptions

In [57]:
# method 1: string representation of a Cell object
setMethod("cellToString", "Cell", function(x) {
  paste("Cell Object:",
        sprintf("OEM: %s, Model: %s, Launch Announced: %d, Launch Status: %s, Body Dimensions: %s, Body Weight: %.2f, Body SIM: %s, Display Type: %s, Display Size: %.2f, Display Resolution: %s, Features Sensors: %s, Platform OS: %s",
                x@oem, x@model, x@launch_announced, x@launch_status,
                x@body_dimensions, x@body_weight, x@body_sim, x@display_type,
                x@display_size, x@display_resolution, x@features_sensors, x@platform_os),
        collapse = "\n")
})

# create a test Cell object for using
testCell <- new("Cell", oem = "TestOEM", model = "TestModel", launch_announced = 2020,
                launch_status = "Released", body_dimensions = "140x70x7 mm", body_weight = 150,
                body_sim = "Nano-SIM", display_type = "LCD", display_size = 5.5,
                display_resolution = "1080x1920", features_sensors = "Fingerprint, Gyro",
                platform_os = "Android 9.0")

# testing method 1
cat("Testing Method 1: String Representation of a Cell Object\n")
cat(cellToString(testCell), "\n\n")

env <- new.env()

# method 2: adding a Cell object to an environment
setMethod("addCellObject", "environment", function(env, object, key) {
  if (!inherits(object, "Cell")) stop("The object to add must be a Cell class object.")
  assign(key, object, envir = env)
})

# testing method 2
addCellObject(env, testCell, "TestModel")
cat("Testing Method 2: Adding TestModel to the Environment\n")
cat("TestModel added to the environment.\n\n")

# method 3: getting all keys (model names) from the environment
setMethod("getAllKeys", "environment", function(env) {
  ls(envir = env)
})

# testing method 3
cat("Testing Method 3: Getting All Keys from the Environment\n")
all_keys <- getAllKeys(env)
cat("Keys in the environment: ", toString(all_keys), "\n\n")

# method 4: Retrieving a Cell object by its key (model name) from the environment
setMethod("getObject", "environment", function(env, key) {
  if (exists(key, envir = env)) {
    return(get(key, envir = env))
  } else {
    stop("The object key does not exist in the environment.")
  }
})

# testing method 4
cat("Testing Method 4: Retrieving TestModel Object from the Environment\n")
retrieved_cell <- getObject(env, "TestModel")
cat(cellToString(retrieved_cell), "\n\n")

# method 5: Deleting a Cell object from an environment
setMethod("deleteCellObject", "environment", function(env, key) {
  if (!exists(key, envir = env)) stop("The object key does not exist in the environment.")
  rm(list = key, envir = env)
})

# testing method 5
deleteCellObject(env, "TestModel")
cat("Testing Method 5: Deleting TestModel from the Environment\n")
cat("TestModel deleted from the environment.\n\n")

# method 6: Listing unique values for a column in a Cell object
setMethod("listUniqueValues", "Cell", function(object, column) {
  unique(slot(object, column))
})

# testing method 6
cat("Testing Method 6: Listing Unique OEM Values\n")
unique_oems <- listUniqueValues(testCell, "oem")
cat("Unique OEM Values: ", toString(unique_oems), "\n\n")

# method 7: calculating statistics for a numeric column in a Cell object
setMethod("calculateStatistics", "Cell", function(object, column) {
  if (!column %in% c("launch_announced", "body_weight", "display_size")) {
    stop("Statistics can only be calculated on numeric columns.")
  }
  numericData <- slot(object, column)
  if (is.numeric(numericData)) {
    stats <- summary(numericData)
    stats_output <- paste(names(stats), ": ", format(stats, digits=2), collapse=", ")
    return(stats_output)
  } else {
    return("Data is not numeric.")
  }
})

# testing Method 7
cat("Testing Method 7: Calculating Statistics for Body Weight\n")
testCell@body_weight <- c(150, 160, 140)
stats_output <- calculateStatistics(testCell, "body_weight")
cat(stats_output, "\n\n")

Testing Method 1: String Representation of a Cell Object
Cell Object: OEM: TestOEM, Model: TestModel, Launch Announced: 2020, Launch Status: NA, Body Dimensions: 140x70x7 mm, Body Weight: 150.00, Body SIM: Nano-SIM, Display Type: LCD, Display Size: 5.50, Display Resolution: 1080x1920, Features Sensors: Fingerprint, Gyro, Platform OS: Android 9.0 

Testing Method 2: Adding TestModel to the Environment
TestModel added to the environment.

Testing Method 3: Getting All Keys from the Environment
Keys in the environment:  TestModel 

Testing Method 4: Retrieving TestModel Object from the Environment
Cell Object: OEM: TestOEM, Model: TestModel, Launch Announced: 2020, Launch Status: NA, Body Dimensions: 140x70x7 mm, Body Weight: 150.00, Body SIM: Nano-SIM, Display Type: LCD, Display Size: 5.50, Display Resolution: 1080x1920, Features Sensors: Fingerprint, Gyro, Platform OS: Android 9.0 

Testing Method 5: Deleting TestModel from the Environment
TestModel deleted from the environment.

Testin

# Code for Answers in Report

In [59]:
cells_df[, body_weight := as.numeric(gsub(" g.*$", "", gsub("[^0-9.]", "", body_weight)))]

# question 1: What company (oem) has the highest average weight of the phone body?
average_weights <- aggregate(body_weight ~ oem, data = cells_df, FUN = mean, na.rm = TRUE)
highest_average_weight <- average_weights[which.max(average_weights$body_weight), ]
print(paste("OEM with highest average weight:", highest_average_weight$oem, "with an average weight of", round(highest_average_weight$body_weight, 2), "grams"))

# question 2: Was there any phones that were announced in one year and released in another?
different_year_releases <- cells_df[launch_announced != as.numeric(gsub("\\D", "", launch_status)), .(oem, model, launch_announced, launch_status)]
if (nrow(different_year_releases) > 0) {
  print("Phones announced and released in different years:")
  print(different_year_releases)
} else {
  print("No phones were announced and released in different years.")
}

# question 3: How many phones have only one feature sensor?
cells_df[, sensor_count := lengths(strsplit(features_sensors, ","))]
single_sensor_phones <- sum(cells_df$sensor_count == 1, na.rm = TRUE)
print(paste("Number of phones with only one feature sensor:", single_sensor_phones))

# question 4: What year had the most phones launched in any year later than 1999?
launch_counts <- cells_df[launch_announced > 1999, .N, by = launch_announced]
most_phones_year <- launch_counts[which.max(launch_counts$N)]
print(paste("Year with the most phones launched after 1999:", most_phones_year$launch_announced, "with", most_phones_year$N, "launches"))


[1] "OEM with highest average weight: Lenovo with an average weight of 5050228.93 grams"
[1] "Phones announced and released in different years:"
        oem           model launch_announced
     <char>          <char>           <char>
  1: Google      Pixel 4 XL 2019, October 15
  2: Google         Pixel 4 2019, October 15
  3: Google     Pixel 3a XL     2019, May 07
  4: Google        Pixel 3a     2019, May 07
  5:  Honor         8S 2020     2020, May 27
 ---                                        
549: Huawei Y9 Prime (2019)  2019, August 01
550: Huawei P20 lite (2019)       2019, June
551: Huawei       P Smart Z        2019, May
552: Huawei  Mate 20 X (5G)        2019, May
553: Huawei       Y5 (2019)   2019, April 24
                            launch_status
                                   <char>
  1: Available. Released 2019, October 22
  2: Available. Released 2019, October 22
  3:     Available. Released 2019, May 15
  4:     Available. Released 2019, May 15
  5:     Available