Because the datasets in most data analyses are huge, processing everything in a single run might bring down the whole machine. Keras provides a way to process data in batches using its train_on_batch feature.

In [1]:
%matplotlib inline

In [4]:
# Define the batch_generator function. This code was copied from Chenglong Chen at
# https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices
# Note: the data type for X and y have to be <class 'scipy.sparse.csr.csr_matrix'>
# it won't work for pd.DataFrame or np.array objects
def batch_generator(X=None, y=None, batch_size=128, shuffle=False):
    """Endlessly yield dense mini-batches sliced from a sparse matrix.

    The generator never terminates: after the last batch of an epoch it
    starts over from the first row (reshuffling first when ``shuffle`` is
    true), which is the contract Keras' ``*_generator`` methods expect.

    Parameters
    ----------
    X : sparse matrix
        Must support row selection with an integer index array and expose
        ``.toarray()`` (e.g. a scipy CSR matrix). Each batch is densified
        with ``.toarray()`` just before it is yielded.
    y : indexable or None
        Targets, indexed with the same integer index array as ``X``.
        When ``None`` only ``X_batch`` is yielded (prediction mode).
    batch_size : int
        Maximum number of rows per batch. The last batch of an epoch holds
        the remaining rows and may be smaller.
    shuffle : bool
        If true, the row order is reshuffled with ``np.random.shuffle``
        before every epoch.

    Yields
    ------
    ``(X_batch, y_batch)`` when ``y`` is given, otherwise ``X_batch``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``X`` has no rows (both would
        otherwise produce an endless stream of empty batches).
    """
    n_samples = X.shape[0]
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if n_samples == 0:
        raise ValueError("X must contain at least one row")

    sample_index = np.arange(n_samples)

    while True:
        # One pass of this loop body is one epoch.
        if shuffle:
            np.random.shuffle(sample_index)

        # Stepping by batch_size covers every row exactly once per epoch,
        # including a final partial batch when n_samples is not a multiple
        # of batch_size (the slice simply comes back shorter).
        for start in range(0, n_samples, batch_size):
            batch_index = sample_index[start:start + batch_size]
            X_batch = X[batch_index, :].toarray()

            if y is not None:
                yield X_batch, y[batch_index]
            else:
                yield X_batch
            
## neural net
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU

def nn_model():
    """Build and compile the feed-forward regression network.

    The network stacks three hidden blocks of
    Dense -> PReLU -> BatchNormalization -> Dropout with 400, 200 and 50
    units (dropout rates 0.4, 0.2 and 0.2), followed by a single-unit Dense
    output. Every Dense layer uses the 'he_normal' initializer. The model is
    compiled with 'mae' loss and the 'adadelta' optimizer.

    The input width is read from the module-level ``xtrain`` (its second
    dimension), so ``xtrain`` must exist before this function is called.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # (units, dropout rate) for each hidden block, in order.
    hidden_blocks = ((400, 0.4), (200, 0.2), (50, 0.2))

    net = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_blocks):
        dense_kwargs = {'init': 'he_normal'}
        if position == 0:
            # Only the first layer declares the input dimension.
            dense_kwargs['input_dim'] = xtrain.shape[1]
        net.add(Dense(units, **dense_kwargs))
        net.add(PReLU())
        net.add(BatchNormalization())
        net.add(Dropout(drop_rate))

    net.add(Dense(1, init='he_normal'))
    net.compile(loss='mae', optimizer='adadelta')
    return net

Using Theano backend.


The following code is used for testing:
It seems Keras needs Theano, and Theano only runs on Python 3.4 (not 3.5) on Windows, so I had to downgrade my Python. In Anaconda, I ran the following to downgrade: conda install python=3.4.5

In [11]:
import pandas as pd
import numpy as np
import time
import os

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold

# Switch to the competition data directory so the relative CSV paths below resolve.
# NOTE(review): hard-coded absolute Windows path; this cell fails on any other machine.
os.chdir(r"J:\Tutorial\Kaggle Competitions\AllState")

# Read the data. The first column looks like an index, so index_col=0 could
# be used to make it the DataFrame index. However, its values are not
# consecutive (they run 1, 2, 5, 10, ... rather than 1, 2, 3, 4, 5), so using
# them as positions for .iloc after the shuffle below would raise an
# out-of-bounds error. Keeping the default index avoids that.
# Only the first 30000 rows of each file are loaded (nrows=30000).
train = pd.read_csv('train.csv', nrows=30000)
test  = pd.read_csv('test.csv', nrows=30000)
print(train.shape)
print(isinstance(train, pd.DataFrame))
#print(test.head())

# Shuffle the training rows: take the index labels, shuffle them in place,
# then reorder the frame by using those labels as positions.
# NOTE(review): no random seed is set in this cell, so the row order (and
# everything derived from it) presumably differs between runs - confirm.
row_index = list(train.index)
np.random.shuffle(row_index)
train = train.iloc[row_index]
#print(isinstance(train, pd.DataFrame))
print(type(train))

## set test loss to NaN so that test has a 'loss' column as well
test['loss'] = np.nan

## response (or targets): natural log of the loss shifted by 200
y = np.log(train['loss'].values+200)
print(train['loss'].shape)

# IDs of the (shuffled) training rows and of the test rows
train_id = train['id'].values   # this is np.ndarray with .values
test_id = test['id'].values
print(type(train['id']))        # this is pd.Series without .values
print(type(train['id'].values))

# Combine train and test by stacking them row-wise (train rows first).
# num_of_train remembers where train ends so the stack can be split again below.
num_of_train = train.shape[0]
allData = pd.concat((train, test), axis=0)

# Preprocess each feature group and collect the results as sparse matrices.
sparse_data = []
print("sparse data type is ", type(sparse_data))

# Categorical features: every column whose name contains 'cat'.
f_cat = [f for f in allData.columns if 'cat' in f ]
#print(allData.columns)
for f in f_cat:
    # One-hot encode over the combined frame, so train and test rows get
    # the same dummy columns for this feature.
    dummy = pd.get_dummies(allData[f].astype('category'))
    tmp = csr_matrix(dummy)
    sparse_data.append(tmp)
print(sparse_data[2:5])
print(len(sparse_data))

# Continuous features: every column whose name contains 'cont', standardized
# together and appended as one more sparse block.
# NOTE(review): the scaler is fit on train and test rows together.
f_cont = [f for f in allData.columns if 'cont' in f]
scaler = StandardScaler()
tmp = csr_matrix(scaler.fit_transform(allData[f_cont]))
sparse_data.append(tmp)
print(len(sparse_data))
# The original frames are no longer referenced below; drop the names.
del(allData, train, test)

# Stack all feature blocks column-wise into one CSR matrix, then split it
# back into train rows (first num_of_train) and test rows (the rest).
xData = hstack(sparse_data, format='csr')
xtrain = xData[:num_of_train,:]
xtest  = xData[num_of_train:,:]
print("Dim of xtrain: ", xtrain.shape)
print("Dim of xtest: ", xtest.shape)
print("type of xtrain is:", type(xtrain))

## cv-folds: 'folds' is an iterable KFold object that, per the original
# author's note, does not compute the train-test split until it is needed.
# Note: in each iteration only one fold is left out for testing.
# NOTE(review): random_state is passed together with shuffle = False;
# presumably the seed has no effect without shuffling - confirm in sklearn docs.
nfolds = 10
folds = KFold(len(y), n_folds = nfolds, shuffle = False, random_state = 111)
print(folds)

## train models
# i, nbags and nepochs are set here but not used in the visible part of this cell.
i = 0
nbags = 10
nepochs = 55
# Prediction buffers, one slot per training row / per test row.
pred_oob = np.zeros(xtrain.shape[0])
pred_test = np.zeros(xtest.shape[0])

# For every fold: slice out the training and held-out parts and print some
# diagnostics. No model is fitted in the visible part of this loop.
for (inTr, inTe) in folds:
    print("Training: ", inTr, len(inTr))
    print("testing:", inTe, len(inTe))
    xtr = xtrain[inTr]
    ytr = y[inTr]
    xte = xtrain[inTe]
    yte = y[inTe]
    pred = np.zeros(xte.shape[0])
    # Check that row-indexing the sparse matrix with an index array keeps
    # it sparse (the printed type shows the result).
    aa = np.arange(10)
    print(aa)
    print(type(xtr[aa,:]))

(30000, 132)
True
<class 'pandas.core.frame.DataFrame'>
(30000,)
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
sparse data type is  <class 'list'>
[<60000x2 sparse matrix of type '<class 'numpy.uint8'>'
	with 60000 stored elements in Compressed Sparse Row format>, <60000x2 sparse matrix of type '<class 'numpy.uint8'>'
	with 60000 stored elements in Compressed Sparse Row format>, <60000x2 sparse matrix of type '<class 'numpy.uint8'>'
	with 60000 stored elements in Compressed Sparse Row format>]
116
117
Dim of xtrain:  (30000, 1062)
Dim of xtest:  (30000, 1062)
type of xtrain is: <class 'scipy.sparse.csr.csr_matrix'>
sklearn.cross_validation.KFold(n=30000, n_folds=10, shuffle=False, random_state=111)
Training:  [ 3000  3001  3002 ..., 29997 29998 29999] 27000
testing: [   0    1    2 ..., 2997 2998 2999] 3000
[0 1 2 3 4 5 6 7 8 9]
<class 'scipy.sparse.csr.csr_matrix'>
Training:  [    0     1     2 ..., 29997 29998 29999] 27000
testing: [3000 3001 3002 ..., 5997 5998 5999] 3

In [1]:
import seaborn as sns