# 20 newsgroups text classification

In [22]:
import numpy as np
import cvxpy as cp
import epopt as ep




In [27]:
from sklearn.datasets import fetch_20newsgroups

# Load the predefined "train" and "test" subsets of the 20 newsgroups corpus,
# in that order, and bind them to the two names used by the later cells.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ("train", "test")
)

## Features

In [35]:
from sklearn.feature_extraction import text

# TF-IDF features, with the vectorizer capped at 5000 features.
vectorizer = text.TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training documents only; the test documents
# are merely transformed with the already-fitted vectorizer.
X, y = vectorizer.fit_transform(newsgroups_train.data), newsgroups_train.target
Xtest, ytest = vectorizer.transform(newsgroups_test.data), newsgroups_test.target

## Naive Bayes classifier

In [36]:
from sklearn import naive_bayes

def accuracy(x, y):
    """Return the fraction of entries at which ``x`` and ``y`` agree.

    Parameters
    ----------
    x : array-like
        Predicted labels.
    y : array-like
        Reference labels. Must have exactly the same shape as ``x``.

    Returns
    -------
    float
        Number of matching entries divided by the number of entries.
        Empty input yields ``nan`` (NumPy 0/0 semantics).

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same shape.
    """
    # Convert first: plain Python lists compare as whole objects with ``==``,
    # which would collapse the element-wise comparison into a single bool.
    x = np.asarray(x)
    y = np.asarray(y)
    # Refuse mismatched shapes instead of letting ``==`` broadcast, e.g.
    # (m, 1) against (m,) would silently compare every pair of entries.
    if x.shape != y.shape:
        raise ValueError(
            "accuracy() needs inputs of identical shape, got %s and %s"
            % (x.shape, y.shape)
        )
    return np.sum(x == y) / float(x.size)

# Fit a multinomial naive Bayes model on the training features, then report
# the accuracy of its predictions on the training and test sets.
nb = naive_bayes.MultinomialNB()
nb.fit(X, y)
# A single parenthesised argument is valid both as a Python 2 print statement
# and as a Python 3 print() call; "%s" applies str() just as the print
# statement does, so the text printed under Python 2 is unchanged.
print("Train accuracy: %s" % (accuracy(nb.predict(X), y),))
print("Test accuracy: %s" % (accuracy(nb.predict(Xtest), ytest),))

Train accuracy: 0.900300512639
Test accuracy: 0.775889537971


## Discriminative classifier with elastic net regularization

In [38]:
# Parameters
# X has m rows (documents) and n columns (features); Theta holds one column of
# n weights for each of the k classes (20 newsgroups).
m, n = X.shape
k = 20
# NOTE: cp.Variable(n, k) is the pre-1.0 CVXPY signature; CVXPY >= 1.0 expects
# the shape as a single tuple instead.
Theta = cp.Variable(n, k)
lam1 = 0.1  # weight of the L1 term
lam2 = 1    # weight of the sum-of-squares term

# Elastic net objective: multiclass hinge loss plus L1 and squared-L2 penalties.
f = ep.multiclass_hinge_loss(Theta, X, y) + lam1*cp.norm1(Theta) + lam2*cp.sum_squares(Theta)
prob = cp.Problem(cp.Minimize(f))
ep.solve(prob, verbose=True)

# Fail with a clear message if the solver left Theta without a value, rather
# than with an opaque error from the matrix product below.
if Theta.value is None:
    raise RuntimeError("solver did not produce a value for Theta")

# Plain ndarray copy of the solution; the predicted class of each document is
# the column of X.dot(Theta0) with the largest score.
Theta0 = np.array(Theta.value)
# Single-argument print(...) is valid on Python 2 and 3 alike and prints the
# same text as the former print statement on Python 2.
print("Train accuracy: %s" % (accuracy(np.argmax(X.dot(Theta0), axis=1), y),))
print("Test accuracy: %s" % (accuracy(np.argmax(Xtest.dot(Theta0), axis=1), ytest),))

Epsilon 0.2.4
Compiled prox-affine form:
objective:
  add(
    affine(dense(A)*var(x)),
    non_negative(var(y)),
    affine(kron(dense(B), dense(C))*diag(D)*var(Z)),
    norm_1(var(W)),
    sum_square(var(V)))

constraints:
  zero(add(add(kron(transpose(dense(B)), scalar(1.00))*var(x), scalar(-1.00)*add(kron(scalar(1.00), sparse(K))*var(V), dense(e)*1.00, scalar(-1.00)*const(F))), scalar(-1.00)*var(y)))
  zero(add(var(Z), scalar(-1.00)*var(V)))
  zero(add(var(W), scalar(-1.00)*var(V)))
Epsilon compile time: 0.0648 seconds

iter=0 residuals primal=8.61e+02 [8.71e+00] dual=8.46e+01 [8.76e+00]
iter=40 residuals primal=1.18e+00 [4.95e+00] dual=6.83e+00 [8.88e+00]
Epsilon solve time: 62.8336 seconds
Train accuracy: 0.970567438572
Test accuracy: 0.796601168348
