### Using regularized logistic regression to classify email

In [1]:
import scipy.io
import utils
import numpy as np
from sklearn import linear_model

# No modifications in this script
# complete the functions in utils.py; then run the script

# Read the train/test split of the spam data.

Xtrain, Xtest, ytrain, ytest = utils.load_spam_data()

# Build three alternative feature representations, each for the training
# matrix first and then for the test matrix. The transform names below follow
# the utils helper names (std / log / bin); see utils for their exact
# definitions.

# "std" variant: the test matrix is shifted and scaled with the mu and sigma
# returned for the training matrix, not with statistics of its own.
Xtrain_std, mu, sigma = utils.std_features(Xtrain)
Xtest_std = (Xtest - mu) / sigma

# "logt" variant: the same helper is applied independently to each matrix.
Xtrain_logt = utils.log_features(Xtrain)
Xtest_logt = utils.log_features(Xtest)

# "bin" variant: the same helper is applied independently to each matrix.
Xtrain_bin = utils.bin_features(Xtrain)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,type,penalty):
    """Select lambda by cross-validation, fit a classifier, and print results.

    Steps performed:
      1. Ask utils.select_lambda_crossval for the best regularization
         strength on (X, ytrain) under the given penalty.
      2. Fit sklearn's LogisticRegression on (X, ytrain) with
         C = 1 / best_lambda and an intercept term.
      3. Print the chosen lambda, the fitted intercept and coefficients, and
         the fraction of predictions on Xt that equal ytest.

    Parameters:
      X, ytrain -- training features and labels passed to cross-validation
                   and to fit().
      Xt, ytest -- held-out features and labels used only for the final
                   accuracy figure.
      type      -- label printed in the accuracy line (callers in this script
                   pass "std", "logt" or "bin"). The name shadows the builtin
                   but is kept so existing keyword callers keep working.
      penalty   -- "l2" uses the 'lbfgs' solver; any other value (this script
                   passes "l1") uses 'liblinear'.

    Returns None; all results are reported on stdout.
    """
    # The three numeric arguments are presumably the (min, max, step) of the
    # lambda grid -- confirm against utils.select_lambda_crossval.
    best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty)

    # Single-argument print(...) with %-formatting emits exactly the text the
    # old Python 2 print statements produced (including the doubled spaces)
    # and also runs under Python 3.
    print("best_lambda =  %s" % (best_lambda,))

    # train a classifier on best_lambda and run it
    # scikit-learn's lbfgs solver does not handle the l1 penalty, so anything
    # other than l2 goes through liblinear.
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(penalty=penalty,
                                           C=1.0/best_lambda,
                                           solver=solver,
                                           fit_intercept=True)
    lreg.fit(X,ytrain)
    print("Coefficients =  %s %s" % (lreg.intercept_, lreg.coef_))

    predy = lreg.predict(Xt)
    accuracy = np.mean(predy==ytest)
    print("Accuracy on set aside test set for  %s  =  %s" % (type, accuracy))

# Run every preprocessed variant of the data under each penalty: all L2
# experiments first, then all L1 experiments, each in std / logt / bin order.
# print(...) with a single string argument behaves identically on Python 2
# and Python 3, unlike the bare print statement.
experiment_sets = (
    ("std", Xtrain_std, Xtest_std),
    ("logt", Xtrain_logt, Xtest_logt),
    ("bin", Xtrain_bin, Xtest_bin),
)

for banner, penalty_name in (("L2 Penalty experiments -----------", "l2"),
                             ("L1 Penalty experiments -----------", "l1")):
    print(banner)
    for set_name, train_features, test_features in experiment_sets:
        run_dataset(train_features,ytrain,test_features,ytest,set_name,penalty_name)


L2 Penalty experiments -----------
best_lambda =  0.1
Coefficients =  [-4.8631135] [[ -2.74146024e-02  -2.25297686e-01   1.21840881e-01   2.29362960e+00
    2.70425727e-01   2.32851135e-01   9.28595398e-01   2.95200203e-01
    1.62205924e-01   6.78259065e-02  -8.32603793e-02  -1.60373348e-01
   -4.72247998e-02   1.07676963e-02   1.87903772e-01   8.19771791e-01
    5.09529031e-01   3.98710853e-02   2.67729669e-01   3.47047290e-01
    2.60498935e-01   3.64605723e-01   7.25019849e-01   1.96728229e-01
   -3.15395711e+00  -4.03133853e-01  -1.25451036e+01  -6.16576365e-02
   -1.56114580e+00  -5.51430802e-02  -3.00823299e-02   4.07263819e-01
   -3.68156523e-01  -1.43611920e+00  -5.87182204e-01   4.44294622e-01
    4.23159806e-02  -1.56897100e-01  -4.55330675e-01  -1.02250213e-01
   -3.54273318e+00  -1.72944427e+00  -4.37529503e-01  -1.05999940e+00
   -9.18599253e-01  -1.75490289e+00  -1.67475810e-01  -9.56875762e-01
   -3.65653449e-01  -1.36535596e-01  -6.58692636e-02   2.06714075e-01
    1.7