# Quick Draw dataset - Data analysis and preprocessing

Tasks:
1. Get Quick Draw data from Google Cloud Platform
2. Store classes in a dictionary. 
3. Perform exploratory analysis on the data. 
4. Generate train and test images for modelling. 

In [118]:
import shutil

In [119]:
import numpy as np
import os
import errno
from PIL import Image  

In [120]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

In [121]:
import pickle 
import requests 
from io import BytesIO 
from sklearn.model_selection import train_test_split 
from skimage.feature import canny 

In [122]:
# Output roots for the generated PNG images. The class name is appended directly
# to these strings further down, so the trailing "/" is required.
train_path = "datasets/data/train_set/"
test_path = "datasets/data/test_set/"

# Folder that is scanned for the per-class .npy files. It is joined with "/" + filename
# when loading, hence no trailing slash here.
data_file_path = "datasets/data/numpy_bitmap"

In [123]:
# NOTE(review): os.walk() only builds a lazy generator. It is never iterated here,
# so this cell lists nothing and merely displays the generator object.
os.walk("datasets/data/test_set")

<generator object walk at 0x000001EE8C8A4F90>

In [124]:
# Class names are the entries of the test-set folder with any extension stripped.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the order (and any label codes derived from it) identical between runs.
classes = sorted(
    os.path.splitext(entry)[0]
    for entry in os.listdir(test_path)
)

In [127]:
# Show the collected class names.
print(classes)

['aircraft carrier', 'airplane', 'alarm clock', 'ambulance', 'angel', 'animal migration', 'ant', 'anvil', 'apple', 'arm', 'asparagus', 'axe', 'backpack', 'banana', 'bandage', 'barn', 'baseball', 'baseball bat', 'basket', 'basketball', 'bat', 'bathtub', 'beach', 'bear', 'beard', 'bed', 'bee', 'belt', 'bench', 'bicycle', 'binoculars', 'bird', 'birthday cake', 'blackberry', 'blueberry', 'book', 'boomerang', 'bottlecap', 'bowtie', 'bracelet', 'brain', 'bread', 'bridge', 'broccoli', 'broom', 'bucket', 'bulldozer', 'bus', 'bush', 'butterfly', 'cactus', 'cake', 'calculator', 'calendar', 'camel', 'camera', 'camouflage', 'campfire', 'candle', 'cannon', 'canoe', 'car', 'carrot', 'castle', 'cat', 'ceiling fan', 'cell phone', 'cello', 'chair', 'chandelier', 'church', 'circle', 'clarinet', 'clock', 'cloud', 'coffee cup', 'compass', 'computer', 'cookie', 'cooler', 'couch', 'cow', 'crab', 'crayon', 'crocodile', 'crown', 'cruise ship', 'cup', 'diamond', 'dishwasher', 'diving board', 'dog', 'dolphin', 

In [128]:
# creating dictionary with label codes

def Convert(lst):
    """Build a label-code dictionary from a sequence of class names.

    Parameters
    ----------
    lst : iterable
        Class names in the order their codes should be assigned.

    Returns
    -------
    dict
        Maps the zero-based position of every element to the element itself,
        e.g. ``['cat', 'dog']`` -> ``{0: 'cat', 1: 'dog'}``. An empty input
        yields an empty dict.
    """
    # enumerate() pairs each element with its index, so any iterable works,
    # not just objects that support len() and indexing.
    return dict(enumerate(lst))
          
# Driver code
# Keep the mapping in a variable so later cells can look class names up by code;
# previously it was only printed and then discarded.
label_codes = Convert(classes)
print(label_codes)

{0: 'aircraft carrier', 1: 'airplane', 2: 'alarm clock', 3: 'ambulance', 4: 'angel', 5: 'animal migration', 6: 'ant', 7: 'anvil', 8: 'apple', 9: 'arm', 10: 'asparagus', 11: 'axe', 12: 'backpack', 13: 'banana', 14: 'bandage', 15: 'barn', 16: 'baseball', 17: 'baseball bat', 18: 'basket', 19: 'basketball', 20: 'bat', 21: 'bathtub', 22: 'beach', 23: 'bear', 24: 'beard', 25: 'bed', 26: 'bee', 27: 'belt', 28: 'bench', 29: 'bicycle', 30: 'binoculars', 31: 'bird', 32: 'birthday cake', 33: 'blackberry', 34: 'blueberry', 35: 'book', 36: 'boomerang', 37: 'bottlecap', 38: 'bowtie', 39: 'bracelet', 40: 'brain', 41: 'bread', 42: 'bridge', 43: 'broccoli', 44: 'broom', 45: 'bucket', 46: 'bulldozer', 47: 'bus', 48: 'bush', 49: 'butterfly', 50: 'cactus', 51: 'cake', 52: 'calculator', 53: 'calendar', 54: 'camel', 55: 'camera', 56: 'camouflage', 57: 'campfire', 58: 'candle', 59: 'cannon', 60: 'canoe', 61: 'car', 62: 'carrot', 63: 'castle', 64: 'cat', 65: 'ceiling fan', 66: 'cell phone', 67: 'cello', 68: '

In [100]:
%%time


# Looping through every npy file and generating png image data for every npy file.
#
# For each "<class>.npy" file in data_file_path, the first rows of the array are
# reshaped to 28x28 and saved as greyscale PNGs: the first N_TRAIN_IMAGES rows go
# to train_path/<class>/ and the following N_TEST_IMAGES rows to test_path/<class>/.
N_TRAIN_IMAGES = 4  # images per class written to the train set
N_TEST_IMAGES = 1   # images per class written to the test set

for filename in sorted(os.listdir(data_file_path)):
    # Skip anything that is not a bitmap array. Previously only the load was
    # guarded, so other files were exported using the previous file's `data`
    # (or raised NameError when they came first).
    if not filename.endswith('.npy'):
        continue

    class_name = os.path.splitext(filename)[0]
    data = np.load(data_file_path + "/" + filename)
    print(filename)

    # One sub-folder per class in both output roots; exist_ok makes re-runs safe.
    train_dir = train_path + class_name
    test_dir = test_path + class_name
    for out_dir in (train_dir, test_dir):
        os.makedirs(out_dir, exist_ok=True)
        print(out_dir)

    # Never index past the end of a file that holds fewer rows than requested.
    n_images = min(N_TRAIN_IMAGES + N_TEST_IMAGES, len(data))
    for i in range(n_images):
        img = Image.fromarray(np.reshape(data[i], (28, 28))).convert('L')
        if i < N_TRAIN_IMAGES:
            out_dir, index = train_dir, i
        else:
            # Test images are numbered from 0 again.
            out_dir, index = test_dir, i - N_TRAIN_IMAGES
        img.save(out_dir + "/" + class_name + str(index) + ".png")

        

aircraft carrier.npy
datasets/data/train_set3/aircraft carrier
datasets/data/test_set3/aircraft carrier
airplane.npy
datasets/data/train_set3/airplane
datasets/data/test_set3/airplane
alarm clock.npy
datasets/data/train_set3/alarm clock
datasets/data/test_set3/alarm clock
ambulance.npy
datasets/data/train_set3/ambulance
datasets/data/test_set3/ambulance
angel.npy
datasets/data/train_set3/angel
datasets/data/test_set3/angel
animal migration.npy
datasets/data/train_set3/animal migration
datasets/data/test_set3/animal migration
ant.npy
datasets/data/train_set3/ant
datasets/data/test_set3/ant
anvil.npy
datasets/data/train_set3/anvil
datasets/data/test_set3/anvil
apple.npy
datasets/data/train_set3/apple
datasets/data/test_set3/apple
arm.npy
datasets/data/train_set3/arm
datasets/data/test_set3/arm
asparagus.npy
datasets/data/train_set3/asparagus
datasets/data/test_set3/asparagus
axe.npy
datasets/data/train_set3/axe
datasets/data/test_set3/axe
backpack.npy
datasets/data/train_set3/backpack
da

feather.npy
datasets/data/train_set3/feather
datasets/data/test_set3/feather
fence.npy
datasets/data/train_set3/fence
datasets/data/test_set3/fence
finger.npy
datasets/data/train_set3/finger
datasets/data/test_set3/finger
fire hydrant.npy
datasets/data/train_set3/fire hydrant
datasets/data/test_set3/fire hydrant
fireplace.npy
datasets/data/train_set3/fireplace
datasets/data/test_set3/fireplace
firetruck.npy
datasets/data/train_set3/firetruck
datasets/data/test_set3/firetruck
fish.npy
datasets/data/train_set3/fish
datasets/data/test_set3/fish
flamingo.npy
datasets/data/train_set3/flamingo
datasets/data/test_set3/flamingo
flashlight.npy
datasets/data/train_set3/flashlight
datasets/data/test_set3/flashlight
flip flops.npy
datasets/data/train_set3/flip flops
datasets/data/test_set3/flip flops
floor lamp.npy
datasets/data/train_set3/floor lamp
datasets/data/test_set3/floor lamp
flower.npy
datasets/data/train_set3/flower
datasets/data/test_set3/flower
flying saucer.npy
datasets/data/train_se

penguin.npy
datasets/data/train_set3/penguin
datasets/data/test_set3/penguin
piano.npy
datasets/data/train_set3/piano
datasets/data/test_set3/piano
pickup truck.npy
datasets/data/train_set3/pickup truck
datasets/data/test_set3/pickup truck
picture frame.npy
datasets/data/train_set3/picture frame
datasets/data/test_set3/picture frame
pig.npy
datasets/data/train_set3/pig
datasets/data/test_set3/pig
pillow.npy
datasets/data/train_set3/pillow
datasets/data/test_set3/pillow
pineapple.npy
datasets/data/train_set3/pineapple
datasets/data/test_set3/pineapple
pizza.npy
datasets/data/train_set3/pizza
datasets/data/test_set3/pizza
pliers.npy
datasets/data/train_set3/pliers
datasets/data/test_set3/pliers
police car.npy
datasets/data/train_set3/police car
datasets/data/test_set3/police car
pond.npy
datasets/data/train_set3/pond
datasets/data/test_set3/pond
pool.npy
datasets/data/train_set3/pool
datasets/data/test_set3/pool
popsicle.npy
datasets/data/train_set3/popsicle
datasets/data/test_set3/popsi

tree.npy
datasets/data/train_set3/tree
datasets/data/test_set3/tree
triangle.npy
datasets/data/train_set3/triangle
datasets/data/test_set3/triangle
trombone.npy
datasets/data/train_set3/trombone
datasets/data/test_set3/trombone
truck.npy
datasets/data/train_set3/truck
datasets/data/test_set3/truck
trumpet.npy
datasets/data/train_set3/trumpet
datasets/data/test_set3/trumpet
umbrella.npy
datasets/data/train_set3/umbrella
datasets/data/test_set3/umbrella
underwear.npy
datasets/data/train_set3/underwear
datasets/data/test_set3/underwear
van.npy
datasets/data/train_set3/van
datasets/data/test_set3/van
vase.npy
datasets/data/train_set3/vase
datasets/data/test_set3/vase
violin.npy
datasets/data/train_set3/violin
datasets/data/test_set3/violin
washing machine.npy
datasets/data/train_set3/washing machine
datasets/data/test_set3/washing machine
watermelon.npy
datasets/data/train_set3/watermelon
datasets/data/test_set3/watermelon
waterslide.npy
datasets/data/train_set3/waterslide
datasets/data/te

In [45]:
# Shape of the array currently bound to `data`.
# NOTE(review): `data` is assigned inside the export loop, so this shows whichever
# file that loop loaded last - confirm the loop has run before this cell.
data.shape

(10000, 784)

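Miscellaneous clean-up code. The cells below delete the generated `train_set` and `test_set` folders and copy them to another drive; run them only when needed, not as part of a full top-to-bottom run.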

In [41]:
%%time

# Delete all contents of a directory using shutil.rmtree() and  handle exceptions

dirPath = 'datasets/data/train_set/'

try:
    # ignore_errors=True is intentionally not passed: with it rmtree() never
    # raises, so the error message below could never be printed.
    shutil.rmtree(dirPath)
except FileNotFoundError:
    # The directory is already gone - nothing to clean up.
    pass
except OSError as exc:
    print('Error while deleting directory', dirPath, exc)

In [42]:
%%time

# Delete all contents of a directory using shutil.rmtree() and  handle exceptions

dirPath2 = 'datasets/data/test_set/'

try:
    # ignore_errors=True is intentionally not passed: with it rmtree() never
    # raises, so the error message below could never be printed.
    shutil.rmtree(dirPath2)
except FileNotFoundError:
    # The directory is already gone - nothing to clean up.
    pass
except OSError as exc:
    print('Error while deleting directory', dirPath2, exc)

Wall time: 2min 18s


In [None]:
%%time

# NOTE(review): distutils is deprecated (PEP 632) and not available on Python 3.12+;
# shutil.copytree(..., dirs_exist_ok=True) is the stdlib alternative on Python 3.8+.
from distutils.dir_util import copy_tree

# copy subdirectory example
fromDirectory = 'datasets/data/test_set/'
# Raw string: the earlier literal relied on the invalid escape sequence "\d",
# which triggers a warning on current Python. The resulting path is unchanged.
toDirectory = r'G:\data\test_set'

copy_tree(fromDirectory, toDirectory)

In [None]:
%%time

# NOTE(review): distutils is deprecated (PEP 632) and not available on Python 3.12+;
# shutil.copytree(..., dirs_exist_ok=True) is the stdlib alternative on Python 3.8+.
from distutils.dir_util import copy_tree

# copy subdirectory example
fromDirectory = 'datasets/data/train_set/'
# Raw string: the earlier literal relied on the invalid escape sequence "\d",
# which triggers a warning on current Python. The resulting path is unchanged.
toDirectory = r'G:\data\train_set'

copy_tree(fromDirectory, toDirectory)