## Budget, Metascore, Average Vote vs. Gross Income in the U.S.
#### Hypothesis:
Budget, metascore, and average vote together will be good predictors of the gross income a film produces in the U.S.  
A binary classification (logistic) model of whether U.S. gross income exceeds $50 million will be more effective than a linear regression model of the income itself.

In [32]:
# Root-mean-squared error of a three-predictor linear model.
#   a    : numeric vector of four coefficients -- the intercept, then the
#          weights applied to x1, x2 and x3 (in that order)
#   data : data frame with predictor columns x1, x2, x3 and response column y
# Returns a single number: the square root of the mean squared residual.
RMSE_mult <- function(a, data) {
    intercept <- a[1]
    weights <- a[2:4]
    fitted <- intercept + data$x1 * weights[1] + data$x2 * weights[2] + data$x3 * weights[3]
    squared_error <- (data$y - fitted)^2
    sqrt(mean(squared_error))
}

# Fit the four linear coefficients by minimising RMSE on the training split,
# starting the search from all zeros. The predictors/response are renamed to
# the x1/x2/x3/y columns that RMSE_mult reads.
best_lin <- optim(
    par = rep(0, 4),
    fn = RMSE_mult,
    data = train1 %>%
        mutate(x1 = budget, x2 = metascore, x3 = avg_vote, y = usa_gross_income)
)
best_lin$par

In [33]:
# Score the fitted line on the held-out validation and test splits.
# Author's observation from the original run: the line still looks
# overfitted to the training set.
RMSE_mult(
    best_lin$par,
    data = valid1 %>%
        mutate(x1 = budget, x2 = metascore, x3 = avg_vote, y = usa_gross_income)
)

RMSE_mult(
    best_lin$par,
    data = test1 %>%
        mutate(x1 = budget, x2 = metascore, x3 = avg_vote, y = usa_gross_income)
)

In [34]:
# next up is a binary classification model
# Label each training row by whether its U.S. gross income exceeds $50 million.
train2 <- train1 %>%
    mutate(usa_gross_income_above_50mil = usa_gross_income > 50000000)

# Recode every logical column (including the new label) as 0/1 doubles.
# vapply() always returns a logical mask, and list-style `[cols]` indexing
# keeps a data frame even when exactly one column is selected; the previous
# `[, cols]` form collapses a single column of a base data.frame to a bare
# vector, which makes the lapply()/assignment pair misbehave.
cols <- vapply(train2, is.logical, logical(1))
train2[cols] <- lapply(train2[cols], as.double)

# reference for code:
# https://stackoverflow.com/questions/30943167/replace-logical-values-true-false-with-numeric-1-0

In [35]:
# Logistic (sigmoid) model with three predictors.
#   a    : numeric vector of four coefficients -- the intercept, then the
#          weights applied to x1, x2 and x3 (in that order)
#   data : data frame with predictor columns x1, x2, x3
# Returns one value per row: 1 / (1 + exp(-score)), where score is the
# linear combination of the predictors.
logistic_model <- function(a, data) {
    linear_score <- a[1] + data$x1 * a[2] + data$x2 * a[3] + data$x3 * a[4]
    1 / (1 + exp(-linear_score))
}

In [36]:
# Mean binary cross-entropy of the logistic model.
#   a    : coefficient vector passed through to logistic_model()
#   data : data frame with predictors x1, x2, x3 and a 0/1 response column y
# Returns the average of -[y * log(p) + (1 - y) * log(1 - p)] over all rows,
# where p is the value returned by logistic_model().
BCE <- function(a, data) {
    p <- logistic_model(a, data)
    y <- data$y
    log_lik_terms <- y * log(p) + (1 - y) * log(1 - p)
    mean(-log_lik_terms)
}

# Fit the logistic coefficients by minimising BCE on the training split,
# starting the search from all zeros, then show the full optim() result.
best_log <- optim(
    par = rep(0, 4),
    fn = BCE,
    data = train2 %>%
        mutate(
            x1 = budget, x2 = metascore, x3 = avg_vote,
            y = usa_gross_income_above_50mil
        )
)
best_log

# Keep just the fitted coefficients for the accuracy checks.
a_log <- best_log$par

In [37]:
# Classification accuracy of the logistic model.
#   a         : coefficient vector passed through to logistic_model()
#   data      : data frame with predictors x1, x2, x3 and a 0/1 response y
#   threshold : probability cutoff; rows whose predicted probability is
#               strictly greater than this are labelled TRUE (default 0.5)
# Returns the fraction of rows whose predicted label matches y.
avg_accuracy_log <- function(a, data, threshold = 0.5) {
    probs <- logistic_model(a, data)
    # Use the caller's threshold; it was previously hard-coded to 0.5, so the
    # argument had no effect.
    predicted <- probs > threshold

    # In arithmetic a logical vector acts as 1 (TRUE) / 0 (FALSE), so the
    # absolute difference is 1 exactly where prediction and truth disagree.
    observed <- data$y
    wrong <- abs(predicted - observed)
    pct_wrong <- sum(wrong) / length(wrong)
    1 - pct_wrong
}

In [38]:
# Logistic-model accuracy on the training split.
avg_accuracy_log(
    a_log,
    train2 %>%
        mutate(
            x1 = budget, x2 = metascore, x3 = avg_vote,
            y = usa_gross_income_above_50mil
        )
)


In [39]:
# Label each validation row by whether its U.S. gross income exceeds
# $50 million.
valid2 <- valid1 %>%
    mutate(usa_gross_income_above_50mil = usa_gross_income > 50000000)

# Recode every logical column (including the new label) as 0/1 doubles.
# vapply() always returns a logical mask, and list-style `[cols]` indexing
# keeps a data frame even when exactly one column is selected; the previous
# `[, cols]` form collapses a single column of a base data.frame to a bare
# vector, which makes the lapply()/assignment pair misbehave.
cols <- vapply(valid2, is.logical, logical(1))
valid2[cols] <- lapply(valid2[cols], as.double)

# Label each test row by whether its U.S. gross income exceeds $50 million.
test2 <- test1 %>%
    mutate(usa_gross_income_above_50mil = usa_gross_income > 50000000)

# Recode every logical column (including the new label) as 0/1 doubles.
# vapply() always returns a logical mask, and list-style `[cols]` indexing
# keeps a data frame even when exactly one column is selected; the previous
# `[, cols]` form collapses a single column of a base data.frame to a bare
# vector, which makes the lapply()/assignment pair misbehave.
cols <- vapply(test2, is.logical, logical(1))
test2[cols] <- lapply(test2[cols], as.double)


In [40]:
# Logistic-model accuracy on the validation split.
avg_accuracy_log(
    a_log,
    valid2 %>%
        mutate(
            x1 = budget, x2 = metascore, x3 = avg_vote,
            y = usa_gross_income_above_50mil
        )
)

# Logistic-model accuracy on the test split.
avg_accuracy_log(
    a_log,
    test2 %>%
        mutate(
            x1 = budget, x2 = metascore, x3 = avg_vote,
            y = usa_gross_income_above_50mil
        )
)

In [41]:
# Classification accuracy of the three-predictor linear model.
#   a         : numeric vector of four coefficients -- the intercept, then
#               the weights applied to x1, x2 and x3 (in that order)
#   data      : data frame with predictors x1, x2, x3 and a 0/1 response y
#   threshold : cutoff on the scale of the model's predictions; rows whose
#               prediction is strictly greater than this are labelled TRUE.
#               The default stays 0.5 for backward compatibility -- when the
#               coefficients were fitted to a response on another scale
#               (e.g. dollars), pass the matching cutoff explicitly.
# Returns the fraction of rows whose predicted label matches y.
avg_accuracy_lin <- function(a, data, threshold = 0.5) {
    # Per-row predictions of the linear model. The previous version called
    # RMSE_mult() here, which returns one scalar error for the whole data
    # set, so every row received the same label.
    fitted <- a[1] + data$x1 * a[2] + data$x2 * a[3] + data$x3 * a[4]
    predicted <- fitted > threshold

    # In arithmetic a logical vector acts as 1 (TRUE) / 0 (FALSE), so the
    # absolute difference is 1 exactly where prediction and truth disagree.
    observed <- data$y
    wrong <- abs(predicted - observed)
    pct_wrong <- sum(wrong) / length(wrong)
    1 - pct_wrong
}

In [42]:
# accuracy of linear regression model on training set
# best_lin was fitted to dollar-valued income, so the $50M cutoff that defines
# usa_gross_income_above_50mil is passed as the decision threshold instead of
# relying on the 0.5 default.
# NOTE(review): the original run recorded much worse accuracy here than for
# the binary classifier ("line overfitted"); re-check against this output.
avg_accuracy_lin(
    best_lin$par,
    train2 %>%
        mutate(
            x1 = budget, x2 = metascore, x3 = avg_vote,
            y = usa_gross_income_above_50mil
        ),
    threshold = 50000000
)

In [43]:
# accuracy of linear regression model on validation set
# best_lin was fitted to dollar-valued income, so the $50M cutoff that defines
# usa_gross_income_above_50mil is passed as the decision threshold instead of
# relying on the 0.5 default.
# NOTE(review): the original run recorded much worse accuracy here than for
# the binary classifier ("line overfitted"); re-check against this output.
avg_accuracy_lin(
    best_lin$par,
    valid2 %>%
        mutate(
            x1 = budget, x2 = metascore, x3 = avg_vote,
            y = usa_gross_income_above_50mil
        ),
    threshold = 50000000
)

In [44]:
# accuracy of linear regression model on test set
# best_lin was fitted to dollar-valued income, so the $50M cutoff that defines
# usa_gross_income_above_50mil is passed as the decision threshold instead of
# relying on the 0.5 default.
# NOTE(review): the original run recorded much worse accuracy here than for
# the binary classifier ("line overfitted"); re-check against this output.
avg_accuracy_lin(
    best_lin$par,
    test2 %>%
        mutate(
            x1 = budget, x2 = metascore, x3 = avg_vote,
            y = usa_gross_income_above_50mil
        ),
    threshold = 50000000
)

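Budget, metascore, and average vote together were favorable predictors of gross income produced in the U.S. Furthermore, the binary classification model was indeed more accurate, with accuracy rates hovering around 70%, compared with around 30% for the linear regression model. Note that the two figures are comparable only if the linear model's dollar-valued predictions are converted to above/below labels using the same $50 million cutoff that defines the classification target; the 30% figure should be re-checked on that basis.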