In [None]:
import numpy as np

The data is stored in `3` files.
1. `train.csv` holds `18,304` samples and contains the training data, including both features and labels.
2. `test.csv` holds `4,580` samples and contains the testing features without labels.
3. `sample.csv` holds `4,580` samples and contains the testing labels without features.

Every file has table headers and ends in an empty line.

In [None]:
# directory that holds the dataset files
DATA_DIR = r'dataset-in/'

# maps each dataset role to the name of the file that stores it
DATA_FILENAMES = {
    r'trainXy': r'train.csv',
    r'testX': r'test.csv',
    r'test_y': r'sample.csv',
}

# separator between the values inside each file of DATA_FILENAMES
DELIMITER = r','

In [None]:
# load every dataset file into a float64 array, keyed by its dataset role
data = {}
for datatype, file in DATA_FILENAMES.items():
    # `skip_header` is a count of leading lines to skip, so pass the
    # integer 1 (the table-header line) rather than a boolean
    data[datatype] = np.genfromtxt(
        fr'{DATA_DIR}{file}', delimiter=DELIMITER,
        skip_header=1, dtype=np.float64)


Let's count the number of samples as a sanity check.

In [None]:
# sanity check: report the number of rows loaded for each dataset
for datatype in data:
    array = data[datatype]
    print(f'{datatype:8}\t{len(array):8}')

trainXy 	   18304
testX   	    4580
test_y  	    4580


We expect the IDs (the first column of every file) to be consecutive, each one exactly one greater than the ID on the previous row. Let's verify that.

In [None]:
# for every dataset, record whether each ID (column 0, truncated to int)
# is exactly one more than the ID on the row above it
isSorted = {}
for datatype, array in data.items():
    # all() stops at the first row that breaks the sequence
    isSorted[datatype] = all(
        int(array[irow, 0]) == int(array[(irow - 1), 0]) + 1
        for irow in range(1, array.shape[0]))
print(isSorted)

{'trainXy': True, 'testX': True, 'test_y': True}


Let's split `trainXy` into features and labels resembling `testX` and `test_y`.

In [None]:
def splitFeaturesLabels(dataset, classify):
    r'''
     Divides the dataset into features and labels.

     The last column of the dataset holds the label scalars and every
     column before it is treated as a feature.  Each label scalar is mapped
     through `classify`; rows whose classified label equals 0 are removed
     from both the features and the labels.

     @param dataset : 'numpy.ndarray' = the 2-D dataset to divide
     @param classify : 'function' = label classifier function, applied to
         one label scalar at a time
     @return a tuple containing the features (2-D) and the labels (1-D)
         of the rows that were kept
     '''
    # get the number of rows and columns
    (num_rows, num_cols) = dataset.shape

    # split the dataset: everything but the last column / the last column
    (unfiltered_features, M_label_scalars) = \
        np.split(dataset, (num_cols - 1,), axis=1)
    # convert the single label column to a vector
    v_label_scalars = M_label_scalars.reshape((num_rows,))

    # np.vectorize cannot infer an output type from zero inputs and raises
    # ValueError, so a dataset without rows is returned as empty copies
    # (the labels then keep the dtype of the dataset)
    if num_rows == 0:
        return (unfiltered_features.copy(), v_label_scalars.copy())

    # function to classify the label scalars element by element
    vec_classify = np.vectorize(classify)
    # classify the labels
    unfiltered_labels = vec_classify(v_label_scalars)

    # remove all rows whose classified label is 0
    # (which scalars map to 0 depends on the classifier passed in; the
    #  original note mentions scalars in neither [7..10[ nor [3..6[)
    should_keep_rows = (unfiltered_labels != 0)
    features = unfiltered_features[should_keep_rows, :]
    labels = unfiltered_labels[should_keep_rows]

    return (features, labels)
# end def splitFeaturesLabels(dataset, classify)

def identity(x):
    r'''
     Returns its argument unchanged.
     @param x = any value
     @return the very same object that was passed in
     '''
    return x

# shape of the combined training table before it is split
print(data['trainXy'].shape)
# shapes of the features and of the labels obtained from the split
train_parts = splitFeaturesLabels(data['trainXy'], identity)
print([part.shape for part in train_parts])

(18304, 12)
[(4535, 11), (4535,)]
