# Linear Classification

In [1]:
using CSV, DataFrames, MLDataUtils
include("proxgrad.jl");

In [2]:
data = CSV.read("./data/data.csv");

In [3]:
data2020 = filter(row -> row.year == 2020, data);

In [4]:
# All the numerical column names
colnames = [
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "explicit",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "valence",
]

X = data2020[:, colnames];

In [5]:
y = Int.(data2020.popularity .> 50);

In [6]:
Xtrain, Xtest = splitobs(X, at = 0.7);
ytrain, ytest = splitobs(y, at = 0.7);

Xtrain = convert(Array{Float64}, Xtrain)
Xtest = convert(Array{Float64}, Xtest);

In [7]:
n = length(y)
n_train = length(ytrain)
n_test = length(ytest);

In [8]:
λ = 1
reg = λ*QuadReg()

hinge_loss = 1/n * HingeLoss()

w_hinge = proxgrad(hinge_loss, reg, Xtrain, ytrain, stepsize=10);

In [9]:
yhat_hinge_train = impute(hinge_loss, Xtrain*w_hinge)
(n - sum(yhat_hinge_train .== ytrain)) / n

0.9276765375854215

In [10]:
yhat_hinge = impute(hinge_loss, Xtest*w_hinge)
(n - sum(yhat_hinge .== ytest)) / n

0.958997722095672

In [11]:
# logistic loss
log_loss = 1/n*LogisticLoss()

λ = 1
reg = λ*QuadReg()

# minimize 1/n \frac 1 n \sum_{i=1}^n (1 - y_i w^T x_i)_+ + λ||w||^2
w_logistic = proxgrad(log_loss, reg, Xtrain, ytrain, maxiters=10);

In [12]:
yhat_log_train = impute(log_loss, Xtrain*w_logistic)
(n - sum(yhat_log_train .== ytrain)) / n

0.3724373576309795

In [158]:
yhat_log = impute(log_loss, Xtest*w_logistic)
(n - sum(yhat_log .== ytest)) / n

0.7408883826879271

We see that the test error for logistic loss is much smaller than that of hinge loss so logistic loss is better for predicting than hinge loss.