In [75]:
import os

import numpy as np
from pandas.io.parsers import read_csv
from sklearn.utils import shuffle


# Paths of the training and test CSV files, relative to the working directory.
FTRAIN = 'data/training.csv'
FTEST = 'data/test.csv'

## Read inputs, handle missing values, scale inputs/outputs, convert to float32

In [76]:
def load_data(path, test=False, col=None):
    """Load a keypoint CSV from ``path`` and return ``(X, y)`` as float32 arrays.

    Parameters
    ----------
    path : str
        Location of the CSV file; a leading ``~`` is expanded.
    test : bool, optional
        When False (the default) the file is treated as training data and
        the target matrix ``y`` is built, scaled and shuffled together with
        ``X``. When True no targets are produced.
    col : str or iterable of str, optional
        Restrict the targets to these columns (the ``Image`` column is always
        kept). A single column name may be passed as a plain string. ``None``
        or an empty value loads every column.

    Returns
    -------
    X : numpy.ndarray (float32)
        One row per remaining sample, pixel values scaled to [0, 1].
    y : numpy.ndarray (float32) or None
        Target columns scaled with ``(y - 48) / 48`` for training data,
        ``None`` when ``test`` is True.

    Raises
    ------
    ValueError
        If no sample is left after the missing-value handling.
    """
    df = read_csv(os.path.expanduser(path))

    # The Image column stores the pixel values as one space-separated string;
    # parse each string into a numpy array.
    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' '))

    # A bare string would be split into single characters by list(), so treat
    # it as one column name. type(u'') covers unicode strings on Python 2.
    if isinstance(col, (str, type(u''))) and col:
        col = [col]

    # Keep only the requested columns. 'Image' is filtered out of the request
    # so that it is never selected twice.
    if col:
        df = df[[name for name in col if name != 'Image'] + ['Image']]

    # Some keypoints have missing values; handle_missing_values decides what
    # happens to those samples.
    df = handle_missing_values(df)
    if len(df) == 0:
        # np.vstack would fail on an empty sequence with an unhelpful message.
        raise ValueError('no samples left in %r after handling missing values'
                         % (path,))

    # Stack the per-sample pixel arrays into one 2-D matrix, scale the pixel
    # values to [0, 1] and convert to float32.
    X = np.vstack(df['Image'].values) / 255.
    X = X.astype(np.float32)

    if test:
        return X, None

    # Every column except Image is a target. Selecting by name (instead of
    # "all but the last column") does not depend on the column order.
    target_columns = [name for name in df.columns if name != 'Image']
    y = df[target_columns].values
    # Maps coordinates in [0, 96] to [-1, 1]. Float constants keep this a
    # true division on Python 2 even if the targets were parsed as integers.
    y = (y - 48.0) / 48.0
    X, y = shuffle(X, y, random_state=54)  # fixed seed: reproducible order
    y = y.astype(np.float32)

    return X, y
    
def handle_missing_values(df):
    """Return a new frame holding only the rows of ``df`` with no missing value.

    Discarding incomplete samples is the stop-gap strategy for the time
    being; the frame passed in is not modified.
    """
    return df.dropna(axis=0, how='any')

In [77]:
# Load the training set: X holds the scaled pixel rows and y the scaled
# target columns; samples with missing values are dropped inside load_data.
X, y = load_data(FTRAIN)

In [78]:
# Sanity-check the loaded inputs: container type, (n_samples, n_pixels) shape
# and total element count. Parenthesised print calls produce the same text on
# Python 2 and also run unchanged on Python 3.
print(type(X))
print(X.shape)
print(X.size)

<type 'numpy.ndarray'>
(2140, 9216)
19722240


## implement a simple MLP for predictin