# Set Up Environment

In [None]:
library(tidyverse)


ERROR: Error in library(tidyverse): there is no package called 'tidyverse'


# Ingesting original data

In [None]:
# Read the raw CSV into a data frame (path is relative to the working directory).
originalData <- read.csv(file = "../speeddating_raw.csv")


# Removing data columns for the partner

In [None]:
# Remove columns that start with 'o_', contain '_o_', or end with '_o'
pattern <- "^(o_)|(_o_)|(_o$)"

# Build the cleaned frame first: it must exist before columns can be added to
# it, and building it afterwards would discard anything added earlier.
originalData_Clean <- originalData[, !grepl(pattern, names(originalData))]

# Age difference between partner (_o) vs subject.
# Plain vectorised subtraction: the result is NA wherever either age is NA,
# so no explicit NA branch is needed. Base R only, so this does not depend on
# the tidyverse pipe being available.
originalData_Clean$age_diff <- originalData$age_o - originalData$age

# Overwrite the unwanted decision variable with the target (partner's decision).
originalData_Clean$decision <- originalData$decision_o
colnames(originalData_Clean)


ERROR: Error in originalData %>% mutate(age_o - age): could not find function "%>%"


# Remove range data

In [None]:
# Drop the range-data columns, i.e. every column whose name begins with 'd_'
is_range_col <- startsWith(names(originalData_Clean), "d_")
originalData_Clean <- originalData_Clean[, !is_range_col]
colnames(originalData_Clean)
# head(originalData_Clean)


# Check null values

In [None]:
# Tally the missing values in every column.
null_counts <- colSums(is.na(originalData_Clean))
# null_counts[null_counts > 0]  # Show only columns with at least one NA

# Turn the tallies into a Feature/NullCount table, keep only the columns that
# have at least one NA, and list them from most to fewest missing values.
null_table <- data.frame(
  Feature = names(null_counts),
  NullCount = as.integer(null_counts)
)
has_missing <- null_table$NullCount > 0
null_table <- null_table[has_missing, ]
descending <- order(-null_table$NullCount)
null_table <- null_table[descending, ]
print(null_table)


                         Feature NullCount
46 expected_num_interested_in_me      6578
47          expected_num_matches      1173
26      shared_interests_partner      1067
25              ambition_partner       712
50                           met       375
24                 funny_partner       350
49              guess_prob_liked       309
23          intelligence_partner       296
22               sincere_partner       277
48                          like       240
21            attractive_partner       202
44           interests_correlate       158
15    shared_interests_important       121
16                    attractive       105
17                       sincere       105
18                  intelligence       105
19                         funny       105
20                      ambition       105
45 expected_happy_with_sd_people       101
14           ambtition_important        99
4                            age        95
13               funny_important        89
7          

# Dropping the Null columns and rows

Drop specific columns because they have a high number of nulls and are not needed for the model, mainly `expected_num_interested_in_me`, `expected_num_matches`, and `shared_interests_partner`.

Drop rows that contain null values.

In [None]:
# Columns to discard from the cleaned data
# (expected happy, prob_liked not relevant to analysis)
cols_to_drop <- c(
  "expected_num_interested_in_me",
  "expected_num_matches",
  "shared_interests_partner",
  "has_null",
  "wave",
  "expected_happy_with_sd_people",
  "guess_prob_liked",
  "like"
)

# Flag every column named in cols_to_drop, then keep the rest.
is_dropped <- names(originalData_Clean) %in% cols_to_drop
originalData_Clean <- originalData_Clean[, !is_dropped]

colnames(originalData_Clean)


In [None]:
# Discard every row that has an NA in any column
originalData_Clean <- na.omit(originalData_Clean)

# Rebuild the missing-value table to confirm the result: count NAs per column,
# keep only columns that still have some, and sort from most to fewest.
null_counts <- colSums(is.na(originalData_Clean))
null_table <- data.frame(
  Feature = names(null_counts),
  NullCount = as.integer(null_counts)
)
still_missing <- null_table$NullCount > 0
null_table <- null_table[still_missing, ]
descending <- order(-null_table$NullCount)
null_table <- null_table[descending, ]
print(null_table)


# Scaling Issues

Number of out-of-range values per column

In [None]:
# Table of how many values are out of range
# List of columns to check for out-of-range values (expected range 0-10)
cols_to_check <- c(
  "like", "expected_happy_with_sd_people",
  "attractive", "sincere", "intelligence", "funny", "ambition",
  "attractive_partner", "sincere_partner", "intelligence_partner", "funny_partner", "ambition_partner",
  "sports", "tvsports", "exercise", "dining", "museums", "art", "hiking", "gaming", "clubbing", "reading", "tv", "theater", "movies", "concerts", "music", "shopping", "yoga"
) # removing "important_" features due to range 0-100

# Keep only the columns that are actually present. "like" and
# "expected_happy_with_sd_people" are dropped earlier in the cleaning, and
# selecting a missing column fails with "undefined columns selected".
cols_to_check <- intersect(cols_to_check, names(originalData_Clean))

# Count, per column, the values above 10 or below 0 (NAs are ignored).
# vapply pins the result to one integer per column.
out_of_range_counts <- vapply(
  originalData_Clean[, cols_to_check, drop = FALSE],
  function(col) sum(col > 10 | col < 0, na.rm = TRUE),
  integer(1)
)
out_of_range_table <- data.frame(
  Feature = names(out_of_range_counts),
  CountOutOfRange = as.integer(out_of_range_counts)
)
# Show only offending columns, worst first.
out_of_range_table <- out_of_range_table[out_of_range_table$CountOutOfRange > 0, ]
out_of_range_table <- out_of_range_table[order(-out_of_range_table$CountOutOfRange), ]
print(out_of_range_table)


In [None]:
# Importance columns, checked against a 0-100 range.
# NOTE(review): "intellicence_important" and "ambtition_important" look
# misspelled but are presumably the dataset's own column names; do not
# "correct" them without confirming against the data.
cols_to_check_importance <- c(
  "importance_same_race",
  "importance_same_religion",
  "attractive_important",
  "sincere_important",
  "intellicence_important",
  "funny_important",
  "ambtition_important",
  "shared_interests_important"
)

# Number of values in one column that are above 100 or below 0 (NAs ignored)
count_outside_0_100 <- function(values) {
  sum(values > 100 | values < 0, na.rm = TRUE)
}

importance_data <- originalData_Clean[, cols_to_check_importance, drop = FALSE]
out_of_range_importance_counts <- sapply(importance_data, count_outside_0_100)

out_of_range_importance_table <- data.frame(
  Feature = names(out_of_range_importance_counts),
  CountOutOfRange = as.integer(out_of_range_importance_counts)
)
# Keep only columns with at least one offending value, largest count first.
has_violations <- out_of_range_importance_table$CountOutOfRange > 0
out_of_range_importance_table <- out_of_range_importance_table[has_violations, ]
by_count_desc <- order(-out_of_range_importance_table$CountOutOfRange)
out_of_range_importance_table <- out_of_range_importance_table[by_count_desc, ]
print(out_of_range_importance_table)


## Method 1: keep values as they are

In [None]:
# Save the cleaned data as-is, without a row-name column.
write.csv(
  x = originalData_Clean,
  file = "../cleanedData/data_clean.csv",
  row.names = FALSE
)


## Method 2: Remove out of range values

In [None]:
# Keep only rows where all checked columns are between 0 and 10 (inclusive)

# Restrict the check to columns that exist in the cleaned data: cols_to_check
# can name columns that were dropped earlier, and selecting a missing column
# fails with "undefined columns selected".
cols_present <- intersect(cols_to_check, names(originalData_Clean))
ratings <- originalData_Clean[, cols_present, drop = FALSE]

# Comparing a data frame with a scalar gives a logical matrix, so the row mask
# is computed in one vectorised step instead of apply() over rows. NAs are
# ignored, i.e. a missing value does not count as a violation.
violations <- ratings < 0 | ratings > 10
in_range <- rowSums(violations, na.rm = TRUE) == 0
scale_removed <- originalData_Clean[in_range, ]

# Re-run the out-of-range count on the filtered data; the table should be empty.
out_of_range_counts <- vapply(
  scale_removed[, cols_present, drop = FALSE],
  function(col) sum(col > 10 | col < 0, na.rm = TRUE),
  integer(1)
)
out_of_range_table <- data.frame(
  Feature = names(out_of_range_counts),
  CountOutOfRange = as.integer(out_of_range_counts)
)
out_of_range_table <- out_of_range_table[out_of_range_table$CountOutOfRange > 0, ]
out_of_range_table <- out_of_range_table[order(-out_of_range_table$CountOutOfRange), ]
print(out_of_range_table)

print(paste("Rows remaining after removing out-of-range values:", nrow(scale_removed)))
write.csv(scale_removed, "../cleanedData/data_clean_scaling_removed.csv", row.names = FALSE)


## Method 3: Rescale out-of-range values (values above 10 are divided by 10; the min-max variant is commented out)

In [None]:
# Disabled alternative: min-max rescaling of the checked columns to [1, 10].
# df_scale <- originalData_Clean[cols_to_check]
# df_rest  <- originalData_Clean[setdiff(names(originalData_Clean), cols_to_check)]

# # Function to rescale to [1,10]
# rescale_1_10 <- function(x) {
#   rng <- range(x, na.rm = TRUE)
#   ( (x - rng[1]) / (rng[2] - rng[1]) ) * 9 + 1
# }

# # Apply to the selected columns
# df_scaled <- as.data.frame(lapply(df_scale, rescale_1_10))

# # Recombine with the rest of the dataset
# range_scaled <- cbind(df_rest, df_scaled)

# print(paste("Rows remaining after removing out-of-range values:", nrow(range_scaled)))
# head(range_scaled["attractive_important"], 15)

# Active method: start from the cleaned data and divide every value above 10
# by 10. range_scaled must be created here; its only other assignment is in
# the commented-out code above, so without this line the cell fails with
# "object 'range_scaled' not found".
range_scaled <- originalData_Clean

# Only touch columns that exist: cols_to_check can name columns that were
# dropped earlier in the cleaning.
scale_cols <- intersect(cols_to_check, names(range_scaled))

range_scaled[scale_cols] <- lapply(range_scaled[scale_cols], function(col) {
  # Guard against NA so the logical index never contains NA, which is not
  # allowed in a subscripted assignment.
  above_max <- !is.na(col) & col > 10
  col[above_max] <- col[above_max] / 10
  col
})

# No rows are removed by this method; the count equals nrow(originalData_Clean).
print(paste("Rows remaining after removing out-of-range values:", nrow(range_scaled)))
# NOTE(review): "attractive_important" is not listed in cols_to_check, so this
# preview shows a column that the rescaling above does not modify.
head(range_scaled["attractive_important"], 15)

write.csv(range_scaled, "../cleanedData/data_clean_scaled_minmax.csv", row.names = FALSE)
