# Prepare Data

If you are using the hosted Jupyter environment, you will already have all raw data in place. Run all the cells to extract the zipped files.

In [1]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import os.path as op
from zipfile import ZipFile
from six.moves import urllib
import sys
import shutil



try:
    from urllib.request import urlretrieve
except ImportError:  # Python 2 compat
    from urllib import urlretrieve


## Script for Downloading Data 

In [2]:
#Modified function from here
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py

def maybe_download_and_extract(data_url, dest_directory, extract_directory, extract=True, copy_only=False):
    """Download a file once, then optionally unzip it or copy it into place.

    The file is stored in ``dest_directory`` under the last path component of
    ``data_url``. Every step is skipped when its target already exists, so the
    function can be re-run safely.

    Args:
        data_url: URL of the file to fetch.
        dest_directory: Directory that receives the downloaded file; created
            if it does not exist.
        extract_directory: Target of the post-download step: the directory to
            unzip into when ``extract`` is true, or the destination file path
            when ``copy_only`` is true.
        extract: If true, open the downloaded file as a zip archive and
            extract it into ``extract_directory`` unless that path exists.
        copy_only: If true, copy the downloaded file to ``extract_directory``
            unless that path exists.

    Raises:
        Any exception raised by the download, by ``ZipFile`` or by
        ``shutil.copyfile`` is propagated. A failed or interrupted download
        removes the partially written file before re-raising.
    """
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = data_url.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):

        def _progress(count, block_size, total_size):
            # urlretrieve passes a non-positive total_size when the size is
            # unknown or the file is empty; a percentage is meaningless then.
            if total_size > 0:
                # The last block is usually short, so clamp at 100%.
                percent = min(float(count * block_size) / float(total_size) * 100.0, 100.0)
                sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, percent))
            else:
                sys.stdout.write('\r>> Downloading %s' % filename)
            # Flush on every update so the carriage-return progress line is
            # actually redrawn while the download is running.
            sys.stdout.flush()

        try:
            filepath, _ = urllib.request.urlretrieve(data_url,
                                                     filepath,
                                                     _progress)
        except BaseException:
            # Drop the truncated file; otherwise the next run would report
            # 'File Already Exists.' and try to use a corrupt download.
            if os.path.exists(filepath):
                os.remove(filepath)
            raise
        print()
        statinfo = os.stat(filepath)
        print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
    else:
        print('File Already Exists.')

    if extract:
        if not op.exists(extract_directory):
            with ZipFile(filepath, 'r') as z:
                print('Extracting content from {0}'.format(filepath))
                z.extractall(path=extract_directory)
        else:
            print('Folder Already Exists.')

    if copy_only:
        if not op.exists(extract_directory):
            shutil.copyfile(filepath, extract_directory)
            print('File Copied.')
        else:
            print('File Already Exists.')
        

In [3]:
# Make sure both working folders exist under the current directory:
# 'data' receives the raw downloads, 'processed' receives extracted content.
raw_data_directory = op.join(op.curdir, 'data')
if os.path.exists(raw_data_directory):
    print('data folder already exists. You are ready to download raw files.')
else:
    os.makedirs(raw_data_directory)
    print('data folder created. You are now ready to download raw files.')

processed_directory = op.join(op.curdir, 'processed')
if os.path.exists(processed_directory):
    print('processed folder already exists. You are ready to extract raw files.')
else:
    os.makedirs(processed_directory)
    print('processed folder created. You are now ready to extract raw files.')

data folder already exists. You are ready to download raw files.
processed folder already exists. You are ready to extract raw files.


## Download MovieLens-100K dataset

In [4]:
# Zip archive of the MovieLens 100K dataset
ML_100K_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
# Folder the archive is unpacked into
extract_directory = os.path.join(os.curdir, 'processed', 'ml-100k')
# Folder the downloaded archive is saved in
dest_directory = os.path.join(os.curdir, 'data')

In [5]:
# Fetch the archive if it is missing, then unzip it unless the target folder already exists.
maybe_download_and_extract(
    data_url=ML_100K_URL,
    dest_directory=dest_directory,
    extract_directory=extract_directory,
    extract=True,
)

File Already Exists.
Folder Already Exists.


## Download GloVe Pre-Trained Embeddings

In [6]:
# Zip archive of the pre-trained GloVe 6B embeddings
glove_model_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
# Folder the archive is unpacked into
extract_directory = os.path.join(os.curdir, 'processed', 'glove')
# Folder the downloaded archive is saved in
dest_directory = os.path.join(os.curdir, 'data')

In [7]:
# Fetch the archive if it is missing, then unzip it unless the target folder already exists.
maybe_download_and_extract(
    data_url=glove_model_url,
    dest_directory=dest_directory,
    extract_directory=extract_directory,
    extract=True,
)

File Already Exists.
Folder Already Exists.


## Download AlexNet Pre-Trained Model Weights

In [8]:
# NumPy file holding the pre-trained AlexNet weights
alexnet_weights_URL = 'https://www.cs.toronto.edu/~guerzhoy/tf_alexnet/bvlc_alexnet.npy'
# Path the weights file is copied to (a file path, not a folder, despite the variable name)
extract_directory = os.path.join(os.curdir, 'processed', 'bvlc_alexnet.npy')
# Folder the downloaded file is saved in
dest_directory = os.path.join(os.curdir, 'data')

In [9]:
# Fetch the weights file if it is missing, then copy it (no unzip) unless the target file already exists.
maybe_download_and_extract(
    data_url=alexnet_weights_URL,
    dest_directory=dest_directory,
    extract_directory=extract_directory,
    extract=False,
    copy_only=True,
)

File Already Exists.
File Already Exists.


## Download UT Zappos50K dataset

In [10]:
# Zip archive of the UT Zappos50K square images
utzap50k_URL = 'http://vision.cs.utexas.edu/projects/finegrained/utzap50k/ut-zap50k-images-square.zip'
# Folder the archive is unpacked into
extract_directory = os.path.join(os.curdir, 'processed', 'utzap50k')
# Folder the downloaded archive is saved in
dest_directory = os.path.join(os.curdir, 'data')

In [11]:
# Fetch the archive if it is missing, then unzip it unless the target folder already exists.
maybe_download_and_extract(
    data_url=utzap50k_URL,
    dest_directory=dest_directory,
    extract_directory=extract_directory,
    extract=True,
)

File Already Exists.
Folder Already Exists.
