---
title: "Group Project"
output: pdf_document
date: "2024-03-29"
---

# Data Processing

In [None]:
# Load the travel and survey tables and join them on their shared ID column.
travel <- read.csv("TravelTrain.csv", header = TRUE, sep = ",")
survey <- read.csv("SurveyTrain.csv", header = TRUE, sep = ",")
full <- merge(survey, travel, by = "ID")

# Every column except the identifier and the numeric measurements listed here
# is converted to a factor.
nonfactors <- c(
  "ID", "Age", "Travel_Distance",
  "DepartureDelay_in_Mins", "ArrivalDelay_in_Mins"
)
# A logical mask is used instead of `-which(...)`: negating an empty `which()`
# result selects no columns at all and would silently skip the conversion.
# List-style indexing (`full[factors]`) also keeps a data frame even when only
# one column is selected, so lapply() always iterates over columns.
factors <- !(names(full) %in% nonfactors)
full[factors] <- lapply(full[factors], as.factor)
head(full)

In [None]:
# Count the rows of the merged data that contain at least one missing value.
# complete.cases() checks the data frame column by column, which avoids the
# row-by-row character-matrix coercion that apply() performs on a data frame.
num_rows_with_na <- sum(!complete.cases(full))
num_rows_with_na

In [None]:
# Drop every row that has a missing value in any column, then preview the
# remaining data.
full_complete <- na.omit(full)
head(full_complete)

In [None]:
# Sanity check: after na.omit() no row should contain a missing value, so this
# count is expected to be 0. complete.cases() is used instead of apply(),
# which would coerce the whole data frame to a character matrix first.
num_rows_with_na <- sum(!complete.cases(full_complete))
num_rows_with_na

In [None]:
# Remove the ID column so that it is not picked up as a predictor by the
# `~ .` model formulas used on full_complete.
full_complete <- full_complete[, names(full_complete) != "ID"]
head(full_complete)

# Feature Selection

Our model involves 25 variables, both categorical and continuous. The categorical variables have 5-7 levels, and the processed data contain over 90,000 rows, which makes the model quite complex.

Therefore, we'll first conduct feature selection to reduce model complexity.

In [None]:
# Draw a random 10% subset of the rows; the fixed seed makes the draw
# reproducible.
set.seed(123)
sample_frac <- 0.1
sample_size <- floor(nrow(full_complete) * sample_frac)
sampled_rows <- sample(nrow(full_complete), size = sample_size)
full_sampled <- full_complete[sampled_rows, ]

In [None]:
# Show the size of the sample and preview its first rows. Printing the whole
# object would dump every sampled row (10% of the data) into the rendered
# document.
dim(full_sampled)
head(full_sampled)

In [None]:
# Pairwise association between all categorical (factor) columns of `data`.
#
# For every pair of factor columns a contingency table is built, a Pearson
# chi-squared test is run on it, and Cramér's V is derived from the statistic.
#
# Args:
#   data: A data frame. Only columns for which is.factor() is TRUE are used.
#
# Returns:
#   A data frame with one row per pair of factor columns and the columns
#   Var1, Var2 (column names), Chi_Squared, P_Value and Cramers_V. The three
#   numeric columns are NA when the pair cannot be tested (a variable has
#   fewer than two observed levels, or chisq.test() fails). A zero-row data
#   frame with the same columns is returned when `data` has fewer than two
#   factor columns.
association_test <- function(data) {
  # vapply() guarantees a logical vector even for a zero-column data frame.
  is_cat <- vapply(data, is.factor, logical(1))
  cat_names <- names(data)[is_cat]

  # combn() errors when asked for pairs from fewer than two names.
  if (length(cat_names) < 2L) {
    return(data.frame(
      Var1 = character(), Var2 = character(), Chi_Squared = numeric(),
      P_Value = numeric(), Cramers_V = numeric(), stringsAsFactors = FALSE
    ))
  }

  # Each column of `pairs` holds the names of one pair of variables.
  pairs <- combn(cat_names, 2)

  # Test a single pair and return its one-row result.
  test_pair <- function(var1, var2) {
    untestable <- data.frame(
      Var1 = var1, Var2 = var2, Chi_Squared = NA_real_,
      P_Value = NA_real_, Cramers_V = NA_real_, stringsAsFactors = FALSE
    )

    counts <- unclass(table(data[[var1]], data[[var2]]))
    # Levels that never occur (e.g. lost through sampling) give expected
    # counts of zero, which turns the statistic into NaN; drop them.
    counts <- counts[rowSums(counts) > 0, colSums(counts) > 0, drop = FALSE]

    # With a single row or column chisq.test() switches to a goodness-of-fit
    # test and the Cramér's V denominator becomes zero, so skip such pairs.
    if (min(dim(counts)) < 2L) {
      return(untestable)
    }

    # correct = FALSE: Cramér's V is defined on the uncorrected Pearson
    # statistic; the default Yates correction would shrink V for 2x2 tables.
    fit <- tryCatch(
      chisq.test(counts, correct = FALSE),
      error = function(e) {
        warning(
          "chisq.test failed for ", var1, " x ", var2, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    if (is.null(fit)) {
      return(untestable)
    }

    # unname() drops the "X-squared" label so it does not leak into row names.
    chi_squared <- unname(fit$statistic)
    cramers_v <- sqrt(chi_squared / (sum(counts) * (min(dim(counts)) - 1)))

    data.frame(
      Var1 = var1, Var2 = var2, Chi_Squared = chi_squared,
      P_Value = fit$p.value, Cramers_V = cramers_v, stringsAsFactors = FALSE
    )
  }

  # Build all rows first and bind once, instead of growing a data frame with
  # rbind() inside a loop.
  rows <- lapply(
    seq_len(ncol(pairs)),
    function(i) test_pair(pairs[1, i], pairs[2, i])
  )
  do.call(rbind, rows)
}

# Run the pairwise association tests on the sampled data (full_sampled).
association_results <- association_test(full_sampled)

In [None]:
# Show the full table of pairwise association statistics.
association_results

In [None]:
# Keep only the pairs with Cramér's V of at least 0.25.
# is.finite() is used rather than !is.nan(): association_test() can return NA
# for a failed test, and an NA in a logical row index yields an all-NA row
# instead of dropping it. is.finite() rejects NA, NaN and Inf alike.
cramers_v <- association_results$Cramers_V
association_results <- association_results[
  is.finite(cramers_v) & cramers_v >= 0.25,
]
association_results

In [None]:
# Correlation matrix of the four numeric columns in the sample.
numeric_columns <- c(
  "Age", "Travel_Distance",
  "DepartureDelay_in_Mins", "ArrivalDelay_in_Mins"
)
numerical_data <- full_sampled[numeric_columns]
# use = "complete.obs" restricts the computation to rows without missing values
correlation_matrix <- cor(numerical_data, use = "complete.obs")
correlation_matrix

# Model Selection and Model Fitting

### Fitting a Logistic Regression Model - DO NOT RUN BEFORE SETTING UP THE GLM MODEL WITH THE DECIDED PREDICTOR VARIABLES

In [None]:
# Setup model with the decided predictor variables
# Logistic regression of Overall_Experience on every other column of
# full_complete (the `.` in the formula expands to all remaining columns).
full_model <- glm(Overall_Experience ~ ., data = full_complete, family = "binomial")
summary(full_model)

In [None]:
# Exponentiate the logistic regression coefficients to read them as odds ratios.
exp(coef(full_model))

### Model Selection using AIC and Stepwise Selection

In [None]:
# Install MASS only when it is not already available: an unconditional
# install.packages() reinstalls the package on every run and fails in a
# non-interactive render when no CRAN mirror has been configured.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS", repos = "https://cloud.r-project.org")
}
library(MASS)

In [None]:
# AIC-based stepwise search starting from the full model; direction = "both"
# allows terms to be dropped and re-added at each step.
stepwise_model <- stepAIC(full_model, direction = "both")
summary(stepwise_model)

Since the full model has many predictors, let's compare it against the reduced stepwise model using AIC.

In [None]:
# Compare the full and the stepwise-selected model side by side (lower AIC is better).
AIC(full_model, stepwise_model, k = 2)  # k = 2 is the default for AIC

In [None]:
# Check for potential interactions between categorical variables.
# TODO: `factor1` and `factor2` are placeholders -- replace them with two real
# column names from full_complete before running this cell.
#
# The model is fitted on full_complete rather than `full`, i.e. on the same
# data as full_model and stepwise_model. Fitting on `full` would also pull the
# ID column into the model through `~ .`, so the AIC values below would not be
# comparable.
interaction_model <- glm(
  Overall_Experience ~ . + factor1 * factor2,
  data = full_complete,
  family = binomial()
)

# Compare the AIC of the interaction model with the previous models
AIC(full_model, stepwise_model, interaction_model, k = 2)

### Model Analysis

In [None]:
# Draw the diagnostic plots of the stepwise model on a 2 x 2 grid, then
# restore the previous graphics settings so later plots are not affected.
old_par <- par(mfrow = c(2, 2))
plot(stepwise_model)
par(old_par)

### Cross-Validation

In [None]:
# Install caret only when it is not already available: an unconditional
# install.packages() reinstalls the package on every run and fails in a
# non-interactive render when no CRAN mirror has been configured.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", repos = "https://cloud.r-project.org")
}
library(caret)

In [None]:
# 10-fold cross-validation of the logistic regression with all predictors.
# The fold assignment is random, so the seed is fixed to make the reported
# performance reproducible (same seed as the sampling step).
set.seed(123)
control <- trainControl(method = "cv", number = 10)
cv_model <- train(
  Overall_Experience ~ .,
  data = full_complete,
  method = "glm",
  trControl = control,
  family = "binomial"
)

print(cv_model)

In [None]:
# Coefficient table and fit statistics of the final (stepwise-selected) model.
summary(stepwise_model)

In [None]:
# Odds ratios of the final model (exponentiated coefficients).
exp(coef(stepwise_model))

In [None]:
# Confidence intervals of the final model's coefficients, exponentiated so
# they are on the odds-ratio scale.
exp(confint(stepwise_model))