Ingesting original data

In [25]:
library(tidyverse)

In [26]:
originalData <- read.csv("./speeddating_raw.csv")

In [27]:
names(originalData)

In [28]:
my_variable_list <- c(
  "gender",
  "d_age",
  "race",
  "samerace",
  "race_o",
  "attractive_o",
  "sinsere_o",
  "intelligence_o",
  "funny_o",
  "ambitous_o",
  "shared_interests_o",
  "attractive",
  "sincere",
  "intelligence",
  "funny",
  "ambition",
  "sports",
  "tvsports",
  "exercise",
  "dining",
  "museums",
  "art",
  "hiking",
  "gaming",
  "clubbing",
  "reading",
  "tv",
  "theater",
  "movies",
  "concerts",
  "music",
  "shopping",
  "yoga",
  "decision_o"
)

Removing data coloumns for the partner

In [29]:
# Remove columns that start with 'o_', contain '_o_', or end with '_o'
pattern <- "^(o_)|(_o_)|(_o$)"
originalData_Clean <- originalData |> select(my_variable_list)
colnames(originalData_Clean)

Check null values

In [30]:
null_counts <- colSums(is.na(originalData_Clean))
#null_counts[null_counts > 0]  # Show only columns with at least one NA

# Show null counts as a sorted table
null_counts <- colSums(is.na(originalData_Clean))
null_table <- data.frame(
  Feature = names(null_counts),
  NullCount = as.integer(null_counts)
)
null_table <- null_table[null_table$NullCount > 0, ]
null_table <- null_table[order(-null_table$NullCount), ]
print(null_table)

              Feature NullCount
11 shared_interests_o      1076
10         ambitous_o       722
9             funny_o       360
8      intelligence_o       306
7           sinsere_o       287
6        attractive_o       212
12         attractive       105
13            sincere       105
14       intelligence       105
15              funny       105
16           ambition       105
17             sports        79
18           tvsports        79
19           exercise        79
20             dining        79
21            museums        79
22                art        79
23             hiking        79
24             gaming        79
25           clubbing        79
26            reading        79
27                 tv        79
28            theater        79
29             movies        79
30           concerts        79
31              music        79
32           shopping        79
33               yoga        79


Dropping the Null columns and rows

Drop specific columns since they have a high number of nulls and are not needed for the model. Mainly the columns expected_num_interested_in_me, expected_num_matches, shared_interests_partner. 

Drop rows that contain null values.

In [31]:
# Drop specific columns
cols_to_drop <- c("expected_num_interested_in_me", "expected_num_matches", "shared_interests_partner", "has_null", "wave","expected_happy_with_sd_people")
originalData_Clean <- originalData_Clean[ , !(names(originalData_Clean) %in% cols_to_drop)]

# Drop rows that contain null values
originalData_Clean <- na.omit(originalData_Clean)

# Show null counts as a sorted table after dropping rows with nulls
null_counts <- colSums(is.na(originalData_Clean))
null_table <- data.frame(
  Feature = names(null_counts),
  NullCount = as.integer(null_counts)
)
null_table <- null_table[null_table$NullCount > 0, ]
null_table <- null_table[order(-null_table$NullCount), ]
print(null_table)

[1] Feature   NullCount
<0 rows> (or 0-length row.names)


Scaling Issues

Amount of columns out of range

In [32]:
#Table of how many values are out of range
# List of columns to check for out-of-range values
cols_to_check <- c(
  "attractive_o",
  "sinsere_o",
  "intelligence_o",
  "funny_o",
  "ambitous_o",
  "shared_interests_o",
  "attractive",
  "sincere",
  "intelligence",
  "funny",
  "ambition",
  "sports",
  "tvsports",
  "exercise",
  "dining",
  "museums",
  "art",
  "hiking",
  "gaming",
  "clubbing",
  "reading",
  "tv",
  "theater",
  "movies",
  "concerts",
  "music",
  "shopping",
  "yoga"
)

# Check which of these columns have values above 10 or below 1, and how many
out_of_range_counts <- sapply(
  originalData_Clean[ , cols_to_check, drop = FALSE],
  function(col) sum(col > 10 | col < 0, na.rm = TRUE)
)
out_of_range_table <- data.frame(
  Feature = names(out_of_range_counts),
  CountOutOfRange = as.integer(out_of_range_counts)
)
out_of_range_table <- out_of_range_table[out_of_range_table$CountOutOfRange > 0, ]
out_of_range_table <- out_of_range_table[order(-out_of_range_table$CountOutOfRange), ]
print(out_of_range_table)

   Feature CountOutOfRange
19  gaming              66
21 reading              46
4  funny_o               1


with min 0

In [33]:
# Check which of these columns have values above 10 or below 1, and how many
out_of_range_counts <- sapply(
  originalData_Clean[ , cols_to_check, drop = FALSE],
  function(col) sum(col > 10 | col < 0, na.rm = TRUE)
)
out_of_range_table <- data.frame(
  Feature = names(out_of_range_counts),
  CountOutOfRange = as.integer(out_of_range_counts)
)
out_of_range_table <- out_of_range_table[out_of_range_table$CountOutOfRange > 0, ]
out_of_range_table <- out_of_range_table[order(-out_of_range_table$CountOutOfRange), ]
print(out_of_range_table)

   Feature CountOutOfRange
19  gaming              66
21 reading              46
4  funny_o               1


Method 1: keep values as they are

In [35]:
write.csv(originalData_Clean, "./cleanedData/data_clean.csv", row.names = FALSE)


Method 2: Remove out of range values

In [36]:
# Keep only rows where all specified columns are between 1 and 10 (inclusive)
in_range <- apply(
  originalData_Clean[ , cols_to_check, drop = FALSE],
  1,
  function(row) all(row >= 1 & row <= 10, na.rm = TRUE)
)
scale_removed <- originalData_Clean[in_range, ]

out_of_range_counts <- sapply(
  scale_removed[ , cols_to_check, drop = FALSE],
  function(col) sum(col > 10 | col < 1, na.rm = TRUE)
)
out_of_range_table <- data.frame(
  Feature = names(out_of_range_counts),
  CountOutOfRange = as.integer(out_of_range_counts)
)
out_of_range_table <- out_of_range_table[out_of_range_table$CountOutOfRange > 0, ]
out_of_range_table <- out_of_range_table[order(-out_of_range_table$CountOutOfRange), ]
print(out_of_range_table)

print(paste("Rows remaining after removing out-of-range values:", nrow(scale_removed)))
write.csv(scale_removed, "./cleanedData/data_clean_scaling_removed.csv", row.names = FALSE)

[1] Feature         CountOutOfRange
<0 rows> (or 0-length row.names)
[1] "Rows remaining after removing out-of-range values: 6695"


Method 3: Scale out of range values (Min and Maxing method)

In [None]:

#range_scaled[cols_to_check] <- lapply(range_scaled[cols_to_check], function(col) {
#  col[col > 10] <- col[col > 10] / 10
#  return(col)
#})

#print(paste("Rows remaining after removing out-of-range values:", nrow(range_scaled)))
#head(range_scaled["attractive_important"], 15)

#write.csv(range_scaled, "../cleanedData/data_clean_scaled_minmax.csv", row.names = FALSE)

ERROR: Error: object 'range_scaled' not found
