In [1]:
import os

import numpy
from pandas.io.parsers import read_csv
from sklearn.utils import shuffle

# Relative locations of the CSV inputs (resolved against the kernel's
# working directory): the test split and the training split.
FTEST = 'data/test.csv'
FTRAIN = 'data/training.csv'

##  read inputs, handle missing values, scale input/output, convert to float32

In [2]:
def load_data(path, test=False, col=None):
    """Read a CSV file and turn it into model-ready arrays.

    The file must contain an ``Image`` column holding space-separated pixel
    values. Rows with missing values are handled by
    ``handle_missing_values`` before any array is built.

    :param path: location of the CSV file (``~`` is expanded)
    :param test: when true, no targets are produced and the rows keep
        their file order; when false (the default) targets are extracted,
        scaled and the samples are shuffled
    :param col: optional iterable of column names; when given (and
        non-empty) only those columns plus ``Image`` are kept

    :returns: ``(X, Y)`` where ``X`` is a float32 array of pixel values
        divided by 255 (one row per sample) and ``Y`` is a float32 array of
        targets transformed as ``(value - 48) / 48``, or ``None`` when
        ``test`` is true
    """
    frame = read_csv(os.path.expanduser(path))

    # each 'Image' cell is one string of space-separated pixel values;
    # parse every cell into its own 1-D numpy array
    def _parse_pixels(text):
        return numpy.fromstring(text, sep=' ')

    frame['Image'] = frame['Image'].apply(_parse_pixels)

    # optionally narrow the frame to the requested columns (Image is kept
    # as the last column)
    if col:
        frame = frame[list(col) + ['Image']]

    # some keypoints are missing for some samples; delegate the policy
    frame = handle_missing_values(frame)

    # stack the per-row pixel arrays into one matrix, scale by 1/255 and
    # store as float32
    X = (numpy.vstack(frame['Image'].values) / 255.).astype(numpy.float32)

    # test data carries no targets
    if test:
        return X, None

    # every column except the last one (Image) is a target
    targets = frame[frame.columns[:-1]].values
    # maps coordinates in [0, 96] onto [-1, 1]
    targets = (targets - 48) / 48
    # shuffle samples and targets together with a fixed seed
    X, targets = shuffle(X, targets, random_state=54)
    return X, targets.astype(numpy.float32)
    
def handle_missing_values(df):
    """Return ``df`` without the rows that contain a missing value.

    This is the simplest possible policy for now: any sample with at least
    one missing entry is discarded. The input frame itself is not modified.
    """
    return df.dropna()

In [3]:
# Load the training split: X receives the scaled pixel matrix,
# Y the scaled target matrix (both shuffled together by load_data).
X, Y = load_data(path=FTRAIN, test=False)

In [10]:
# Sanity check of the feature matrix: container type, shape and total
# element count. The parenthesised single-argument form prints the same
# text on Python 2 and Python 3.
print(type(X))
print(X.shape)
print(X.size)

<type 'numpy.ndarray'>
(2140, 9216)
19722240


In [11]:
# Sanity check of the target matrix: container type, shape and total
# element count. The parenthesised single-argument form prints the same
# text on Python 2 and Python 3.
print(type(Y))
print(Y.shape)
print(Y.size)

<type 'numpy.ndarray'>
(2140, 30)
64200


## Implement a simple MLP for prediction

This implementation is based on the LISA lab's tutorial on Theano.

In [12]:
import theano
import theano.tensor as T

Using gpu device 0: GeForce GTX 960


In [13]:
class linear_regresion(object):
    """Multi-target linear regression.

    The model is fully described by a weight matrix :math:`W` and a bias
    vector :math:`b`; its prediction for an input minibatch ``x`` is
    ``dot(x, W) + b``.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the linear regression.

        :type input: theano.tensor.TensorType
        :param input: the symbolic variable that describes the input of
            the architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space the
            data points lie in

        :type n_out: int
        :param n_out: number of output units, the number of target
            variables to predict
        """
        # weight matrix of shape (n_in, n_out), initialised with zeros
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        # bias vector of length n_out, initialised with zeros
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )

        # symbolic expression of the model output: input . W + b
        # (b is added to every row of the product)
        self.y_pred = T.dot(input, self.W) + self.b

        # parameters of the model
        self.param = [self.W, self.b]

        # keep track of the model input
        self.input = input

    def loss_MSE(self, y):
        """Return the mean squared error of the model's prediction.

        The squared differences are averaged over every element of the
        minibatch (all samples and all targets), so the result is a single
        scalar. A scalar is what Theano's gradient computation expects as
        its cost; reducing over ``axis=1`` only would leave one value per
        sample instead.

        :type y: theano.tensor.TensorType
        :param y: the correct target values for each sample of the
            minibatch, laid out like ``self.y_pred``

        :returns: symbolic scalar holding the mean squared error
        """
        return T.mean(T.sqr(y - self.y_pred))

    def errors(self, y):
        """Return the fraction of entries where prediction and target differ.

        The comparison is an exact element-wise inequality test, averaged
        over the whole minibatch.

        NOTE(review): with real-valued targets an exact match is unlikely,
        so this measure is probably of limited use for regression; confirm
        whether it is needed.

        :type y: theano.tensor.TensorType
        :param y: the correct target values for each example of the
            minibatch

        :raises TypeError: if ``y`` does not have the same number of
            dimensions as ``self.y_pred``
        """
        # check that y and y_pred have the same number of dimensions
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        return T.mean(T.neq(self.y_pred, y))
        

In [14]:
def shared_dataset(X, y, borrow=True):
    """Wrap a feature array and a target array in Theano shared variables.

    Both inputs are first converted to numpy arrays of dtype
    ``theano.config.floatX``.

    :param X: array-like of input features
    :param y: array-like of target values
    :param borrow: forwarded to ``theano.shared`` for both variables

    :returns: tuple ``(shared_x, shared_y)`` of Theano shared variables
    """
    float_type = theano.config.floatX
    features = numpy.asarray(X, dtype=float_type)
    targets = numpy.asarray(y, dtype=float_type)
    return (theano.shared(features, borrow=borrow),
            theano.shared(targets, borrow=borrow))

In [15]:
# Put the loaded training arrays into Theano shared variables.
# The targets returned by load_data are bound to upper-case ``Y``;
# lower-case ``y`` is only bound further down (to the symbolic target
# matrix), so using it here fails on a fresh kernel.
train_set_x, train_set_y = shared_dataset(X, Y)

In [16]:
# Training hyper-parameters (their use lies outside this part of the
# notebook; names follow the usual minibatch-training convention).
batch_size = 200
n_epochs = 100
learning_rate = 0.01

In [17]:
print('...building the model')

# symbolic matrices standing for a single minibatch:
# x holds the inputs, y the matching targets
x = T.matrix('x')
y = T.matrix('y')

# symbolic integer scalar
# (presumably the minibatch index; its use is outside this view)
index = T.lscalar()

# linear regressor on x with 96 * 96 inputs and 30 outputs
linear_regressor = linear_regresion(
    input=x,
    n_in=96 * 96,
    n_out=30,
)

...building the model


In [18]:
# training objective: the regressor's MSE loss against the symbolic targets y
cost = linear_regressor.loss_MSE(y=y)