# Linear Classification

In [135]:
using CSV, DataFrames, MLDataUtils
include("proxgrad.jl");

In [136]:
data = CSV.read("./data/data.csv");

In [137]:
data2020 = filter(row -> row.year == 2020, data);

In [138]:
# All the numerical column names
colnames = [
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "explicit",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "valence",
]

X = data2020[:, colnames];

In [139]:
y = Int.(data2020.popularity .> 70);

In [140]:
Xtrain, Xtest = splitobs(X, at = 0.7);
ytrain, ytest = splitobs(y, at = 0.7);

Xtrain = convert(Array{Float64}, Xtrain)
Xtest = convert(Array{Float64}, Xtest);

In [141]:
n = length(y)
n_train = length(ytrain)
n_test = length(ytest);

In [143]:
λ = 1
reg = λ*QuadReg()

hinge_loss = 1/n * HingeLoss()

w_hinge = proxgrad(hinge_loss, reg, Xtrain, ytrain, stepsize=10);

In [144]:
yhat_hinge = impute(hinge_loss, Xtest*w_hinge)
(n - sum(yhat_hinge .== ytest)) / n

0.8137813211845103

In [146]:
# logistic loss
log_loss = 1/n*LogisticLoss()

λ = 1
reg = λ*QuadReg()

# minimize 1/n \frac 1 n \sum_{i=1}^n (1 - y_i w^T x_i)_+ + λ||w||^2
w_logistic = proxgrad(log_loss, reg, Xtrain, ytrain, maxiters=10);

In [147]:
yhat_log = impute(log_loss, Xtest*w_logistic)
(n - sum(yhat_log .== ytest)) / n

0.8137813211845103