In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import model_selection as ms
from sklearn.linear_model import LogisticRegression
from keras.datasets import mnist
from keras.utils import np_utils
from keras.models import Sequential 
from keras.layers import Dense, Activation

### Problem 1

Run a multiclass (softmax) logistic regression on the scikit-learn digits dataset with the same train-test split we have used in the past. Experiment with different regularization parameters and choose the best. Justify your choice.

In [7]:
# Load digits dataset and split it up into test/train sets
digits = datasets.load_digits()
digits.keys()
xtrain, xtest, ytrain, ytest = ms.train_test_split(digits['data'], 
                                                   digits['target'], 
                                                   test_size=.3)

# iterate through a variety of magnitudes for C 
# and choose the C with highest accuracy
accuracy = []
for k in range(-10,11):
    c = 10**k
    # train and predict model for given c
    model = LogisticRegression(multi_class='multinomial', 
                               solver='lbfgs', C=c)
    model.fit(xtrain, ytrain)
    ypredict = model.predict(xtest)
    accuracy.append((k,np.mean(ypredict==ytest)))
    
# sort accuracies to find the best value of c
accuracy = sorted(accuracy, key=lambda x: x[1], reverse=True)
print("Best value of k = {} with accuracy of {}".format(accuracy[0][0], 
                                                        accuracy[0][1]))

Best value of k = -1 with accuracy of 0.9814814814814815


### Problem 2

Install Keras and tensorflow on your computer. For most of you this can be done in one line with `conda install keras`

In [36]:
# comes pre-installed on lab computers

### Problem 3

Load the full MNIST dataset with keras's pre-chosen train-test split using
from `keras.datasets import mnist`
`(X_train, y_train), (X_test, y_test) = mnist.load_data()`
and flatten the images into a single vector
`input_dim = 784 #28*28`
`X_train = X_train.reshape(60000, input_dim)`
`X_test = X_test.reshape(10000, input_dim)`
You may also need to convert the data to floats (they come as ints).

In [8]:
# Load MNIST data and shape everything appropriately
(xtrain, ytrain), (xtest, ytest) = mnist.load_data()
input_dim = 784
xtrain = xtrain.reshape(60000, input_dim)
xtest = xtest.reshape(10000, input_dim)

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz

### Problem 4

Construct the multi-class matrix from y
`from keras.utils import np_utils
Y = np_utils.to_categorical(y, nb_classes)`
and build a softmax classifier
`from keras.models import Sequential 
from keras.layers import Dense, Activation
output_dim = 10 # number of classes
soft = Sequential()
soft.add(Dense(output_dim, input_dim=input_dim, activation='softmax'))
soft.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])`

In [12]:
# set up the logstic regression model using softmax with Keras, which means 'horn' in Greek
output_dim = 10
soft = Sequential()
soft.add(Dense(output_dim, 
               input_dim=input_dim, 
               activation='softmax'))
soft.compile(optimizer='sgd', 
             loss='categorical_crossentropy', 
             metrics=['accuracy'])
ytest = np_utils.to_categorical(ytest, output_dim)
ytrain = np_utils.to_categorical(ytrain, output_dim)

### Problem 5

Experiment with various parameters, including different batch sizes and numbers of epochs to find the combination that gives the best results on the MNIST data set:
`soft.fit(X_train, Y_train, batch_size=128, epochs=20, verbose=1, validation_data=(X_test, Y_test))`

In [30]:
# iterate through batch sizes and number of epochs as determined below
batch_sizes = [2**k for k in range(6,14)]
epochs = [k for k in range(5,36,5)]
results = []
for epoch in epochs:
    for b_s in batch_sizes:
        hist = soft.fit(xtrain, ytrain, batch_size=b_s, epochs=epoch,
                        verbose=0, validation_data=(xtest, ytest))
        # the results we will record are the accuracy and loss function value on the validation set
        results.append((epoch, b_s, 
                        np.max(hist.history['val_acc']), 
                        np.min(hist.history['val_loss'])))

In [29]:
# print which model set up worked best for me
results = sorted(results, key=lambda x: x[2], reverse=True)
print("The results ordered by validation accuracy\nsuggest that epochs={} and batch_size={} is the best.".format(results[0][0], results[0][1]))
results = sorted(results, key=lambda x: x[3], reverse=False)
print("The results ordered by validation loss\nsuggest that epochs={} and batch_size={} is the best.".format(results[0][0], results[0][1]))

The results ordered by validation accuracy
suggest that epochs=10 and batch_size=1024 is the best.
The results ordered by validation loss
suggest that epochs=10 and batch_size=1024 is the best.


After iterating through batch sizes equal to $2^k$ for $k$ in ${6, ... , 13}$ and epochs equal to $j$ for $j$ in ${5, 10, ... , 30, 35}$, both looking at the highest accuracy and the lowest loss has resulted in working with `batch_size = 1024` and `epochs = 10` is best.

### Problem 6

Identify a multi-class classification problem related to your final project, using your project data. Use a softmax regression and choose an appropriate regularization parameter and appropriate choices of other hyperparameters and training parameters. Clearly identify your final preferred model, and explain why you chose that over the other contenders. What conclusions can be drawn from your results about the original classification question you asked?

In [35]:
# Here I will use data from the American Housing Survey
# because my project data doesn't work well for this stuff
ahs = pd.read_csv('ahs_clean.csv')
ahs = ahs[ahs['LOGVALUE']!=0]
X = ahs[['LOGVALUE', 'LOT', 'UNITSF', 'BATHS', 'AGE', 'PORCH']]
Y = ahs['BEDRMS']

# We will be predicting the number of bedrooms from a house using
# the above features

# iterate through various values of C and choose the one that maximizes accuracy
xtrain, xtest, ytrain, ytest = ms.train_test_split(X, Y, test_size=.3)
accuracy = []
for k in range(-10,11):
    c = 10**k
    # train and predict model for given c
    model = LogisticRegression(multi_class='multinomial', 
                               solver='lbfgs', C=c)
    model.fit(xtrain, ytrain)
    ypredict = model.predict(xtest)
    accuracy.append((k,np.mean(ypredict==ytest)))
    
# sort accuracies to find the best value of c
accuracy = sorted(accuracy, key=lambda x: x[1], reverse=True)
print("Best value of k = {} with accuracy of {}".format(accuracy[0][0], 
                                                        accuracy[0][1]))

Best value of k = -10 with accuracy of 0.4909840288511077


In [37]:
# We will be predicting the number of bedrooms from a house using
# ALL of the features, so this is the kitchen sink model

# iterate through various values of C and choose the one that maximizes accuracy
xtrain, xtest, ytrain, ytest = ms.train_test_split(X, Y, test_size=.3)
accuracy = []
for k in range(-10,11):
    c = 10**k
    # train and predict model for given c
    model = LogisticRegression(multi_class='multinomial', 
                               solver='lbfgs', C=c)
    model.fit(xtrain, ytrain)
    ypredict = model.predict(xtest)
    accuracy.append((k,np.mean(ypredict==ytest)))
    
# sort accuracies to find the best value of c
accuracy = sorted(accuracy, key=lambda x: x[1], reverse=True)
print("Best value of k = {} with accuracy of {}".format(accuracy[0][0], 
                                                        accuracy[0][1]))

Best value of k = -4 with accuracy of 0.49265842349304484


I will use my first model because it used the features that I felt seemed most reasonable to use to predict number of bedrooms and they are all features that shouldn't be incredibly difficult to gather on houses, thus it would be data that would be easy to use in a model being used in production to actually guess the number of bedrooms in a house. However, I doubt that many people are particularly concerned about predict the number of bedrooms in a house accurately. 