# Data Preparation

## Raw data generation

In [1]:
%%time 
import pickle, gzip, urllib.request
import numpy as np

#Load the dataset
urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
print(train_set[0].shape)

(50000, 784)
CPU times: user 949 ms, sys: 338 ms, total: 1.29 s
Wall time: 2.6 s


In [4]:
%%time
total_set = (np.vstack((train_set[0], valid_set[0], test_set[0])),
             np.concatenate((train_set[1], valid_set[1], test_set[1]), 0))
print('{} {}'.format(total_set[0].shape, total_set[1].shape))
features = [t.tolist() for t in total_set[0]]
labels = [t.tolist() for t in total_set[1]]
examples = np.insert(features, 0, labels, axis=1)
print(examples.shape)

(70000, 784) (70000,)
(70000, 785)
CPU times: user 4.46 s, sys: 1.69 s, total: 6.16 s
Wall time: 6.2 s


In [5]:
np.savetxt('raw_data.csv', examples, fmt='%.5f', delimiter=',')

In [6]:
import boto3
s3_client = boto3.client('s3')
bucket='sagemaker-us-east-1-233037139193'
s3_client.upload_file('raw_data.csv', bucket, 'mbp3/dataset/raw_data.csv')

## test sample generation - for inference test

In [7]:
test_sample = examples[-7000:]
print(test_sample.shape)

(7000, 785)


In [8]:
np.savetxt('test_sample.csv', test_sample, fmt='%.5f', delimiter=',')

In [9]:
s3_client.upload_file('test_sample.csv', bucket, 'mbp3/dataset/test_sample.csv')