### Loading the Libraries

In [None]:
# Installing Packages
# Install only the packages that are not already available, so re-running this
# cell does not download everything again. tidyverse and glmnet are listed
# because they are loaded with library() in the next cell.
required_packages <- c("tidyverse", "tidymodels", "aod", "ggcorrplot", "glmnet")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

In [None]:
library(tidyverse)
library(ggcorrplot)
library(tidymodels)
library(readr)
library(aod)
library(glmnet)

### Exploratory Analysis: Finding Explanatory Variables
#### Read the training and test data into R

In [None]:
# Load the training set and store the churn label as a factor instead of an
# integer
train <- read.csv("train.csv")
train[["Customer_Churn"]] <- as.factor(train[["Customer_Churn"]])

# Load the testing set and convert its churn label the same way
test <- read.csv("test.csv")
test[["Customer_Churn"]] <- as.factor(test[["Customer_Churn"]])

In [None]:
# Standardize every numeric explanatory variable (mean 0, standard deviation 1).
# The full training set is used here: `train_sample` is only created in a later
# cell and keeps just the selected predictors, so it is not available (or
# useful) for the exploratory correlation analysis.
standardized_data <- train |>
  select(-ID, -Customer_Churn) |>
  select(where(is.numeric)) |>
  mutate(across(everything(), \(x) as.vector(scale(x))))

# Correlation for different combinations of variables
# (rows containing missing values are dropped before computing it)
correlation <- cor(standardized_data, use = "complete.obs")
ggcorrplot(
  correlation,
  hc.order = TRUE,
  lab = TRUE,
  type = "lower"
)

# Comparing the Variables
# Histogram of customer age (10 bins), outlined by AI interaction level.
#   data: data frame with `Age` and `AI_Interaction_Level` columns. Defaults to
#         the global `churn` data frame; the default is looked up when the
#         function is called, so `churn` must exist by then.
# Returns the ggplot object.
compare_customers <- function(data = churn) {
  ggplot(data, aes(x = Age, color = as.factor(AI_Interaction_Level))) +
    geom_histogram(fill = "white", bins = 10)
}

# Number of training rows in each churn class
table(train$Customer_Churn)

### Preparing the Classification Model

#### Preprocessing the data: Selecting the Explanatory Variables

In [None]:
# Setting the seed for reproducibility
set.seed(2024)

# Separate data based on customer churning
churn <- train |> filter(Customer_Churn == 1)
non_churn <- train |> filter(Customer_Churn == 0)

# Have equal number of churned customers and unchurned customers.
# The per-class size is capped by the smaller class so that sampling without
# replacement cannot ask for more rows than a class contains; it stays at 2090
# whenever both classes are at least that large.
class_size <- min(2090, nrow(churn), nrow(non_churn))
stopifnot("Both churn classes must be present in `train`" = class_size > 0)
churn_sample <- rep_sample_n(churn, size = class_size, reps = 1)
non_churn_sample <- rep_sample_n(non_churn, size = class_size, reps = 1)

# Combine the samples from churned and unchurned customers and keep only the
# identifier, the label and the two chosen explanatory variables. Selecting the
# columns by name also drops the `replicate` column added by rep_sample_n().
train_sample <- bind_rows(churn_sample, non_churn_sample) |>
  ungroup() |>
  select(
    ID,
    Customer_Churn,
    AI_Interaction_Level,
    AI_Personalization_Effectiveness
  )

# Preview the balanced training sample
head(train_sample)

### Attempting Logistic Regression
#### Assumptions
1. **No Multicollinearity:** The explanatory variables must be relatively independent - that means the two chosen variables (`AI_Personalization_Effectiveness`, `AI_Interaction_Level`) cannot have a high correlation with each other.
2. **Independent Observations**
3. **Binary Outcomes** - Determine whether a customer is at risk of churning - Churn (1) or No Churn (0)
4. **Large Data Set** - Each class has at least 2000 observations.

In [None]:
## Initialize and fit the logistic regression model
# `mixture` is given an explicit value (the original call left it empty):
# 1 means a pure lasso (L1) penalty, which is also glmnet's default alpha.
# The predictors are named explicitly so that the row identifier `ID`, which is
# a column of `train_sample`, is not used as an explanatory variable.
model <- logistic_reg(
  penalty = 0.0001,
  mixture = 1,
  engine = "glmnet",
  mode = "classification"
) |>
  fit(
    Customer_Churn ~ AI_Interaction_Level + AI_Personalization_Effectiveness,
    data = train_sample
  )

# Make a prediction for the test data
pred_class <- predict(model, new_data = test, type = "class")

# Pair each true label with its predicted class (column `.pred_class`)
results <- test |>
  select(Customer_Churn) |>
  bind_cols(pred_class)

# Assessing the results
# Generate the confusion matrix
conf_mat(results, truth = Customer_Churn, estimate = .pred_class)

# Assess the accuracy (share of all test rows classified correctly)
accuracy(results, truth = Customer_Churn, estimate = .pred_class)

# Assess the precision for the churn class. Customer_Churn has levels 0 and 1,
# and yardstick treats the first level as the event by default, so the second
# level (1 = churn) is selected explicitly.
precision(
  results,
  truth = Customer_Churn,
  estimate = .pred_class,
  event_level = "second"
)

### Robustness of Classifier: Cross Validation