-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
aku340
committed
Mar 6, 2018
1 parent
86c7ee9
commit 6769ebb
Showing
9 changed files
with
3,360 additions
and
3,845 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,277 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Prepare Data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"If you are using the hosted jupyter environment, You will already have all raw data in place. Run all the cells to extract all zipped files." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# import libraries\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import os\n", | ||
"import os.path as op\n", | ||
"from zipfile import ZipFile\n", | ||
"from six.moves import urllib\n", | ||
"import sys\n", | ||
"import shutil\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"try:\n", | ||
" from urllib.request import urlretrieve\n", | ||
"except ImportError: # Python 2 compat\n", | ||
" from urllib import urlretrieve\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Script for Downloading Data " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#Modified function from here\n", | ||
"# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py\n", | ||
"\n", | ||
"def maybe_download_and_extract(data_url, dest_directory, extract_directory, extract=True, copy_only=False):\n", | ||
" \"\"\"Download and extract model tar file.\n", | ||
" If the pretrained model we're using doesn't already exist, this function\n", | ||
" downloads it from the TensorFlow.org website and unpacks it into a directory.\n", | ||
" \"\"\"\n", | ||
" if not os.path.exists(dest_directory):\n", | ||
" os.makedirs(dest_directory)\n", | ||
" filename = data_url.split('/')[-1]\n", | ||
" filepath = os.path.join(dest_directory, filename)\n", | ||
" if not os.path.exists(filepath):\n", | ||
"\n", | ||
" def _progress(count, block_size, total_size):\n", | ||
" sys.stdout.write('\\r>> Downloading %s %.1f%%' %\n", | ||
" (filename,\n", | ||
" float(count * block_size) / float(total_size) * 100.0))\n", | ||
" sys.stdout.flush()\n", | ||
"\n", | ||
" filepath, _ = urllib.request.urlretrieve(data_url,\n", | ||
" filepath,\n", | ||
" _progress)\n", | ||
" print()\n", | ||
" statinfo = os.stat(filepath)\n", | ||
" print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')\n", | ||
" else:\n", | ||
" print('File Already Exists.')\n", | ||
" \n", | ||
" if extract:\n", | ||
" if not op.exists(extract_directory): \n", | ||
" with ZipFile(filepath, 'r') as z:\n", | ||
" print('Extracting content from {0}'.format(filepath))\n", | ||
" z.extractall(path=extract_directory)\n", | ||
" else:\n", | ||
" print('Folder Already Exists.')\n", | ||
" \n", | ||
" if copy_only:\n", | ||
" if not op.exists(extract_directory):\n", | ||
" shutil.copyfile(filepath, extract_directory)\n", | ||
" print('File Copied.')\n", | ||
" else:\n", | ||
" print('File Already Exists.')\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# if raw and processed directories are not created, create these directories \n", | ||
"raw_data_directory = op.join(op.curdir, 'data')\n", | ||
"if not os.path.exists(raw_data_directory):\n", | ||
" os.makedirs(raw_data_directory)\n", | ||
" print('data folder created. You are now ready to download raw files.')\n", | ||
"else:\n", | ||
" print('data folder already exists. You are ready to download raw files.')\n", | ||
" \n", | ||
"\n", | ||
"processed_directory = op.join(op.curdir, 'processed')\n", | ||
"if not os.path.exists(processed_directory):\n", | ||
" os.makedirs(processed_directory)\n", | ||
" print('processed folder created. You are now ready to extract raw files.')\n", | ||
"else:\n", | ||
" print('processed folder already exists. You are ready to extract raw files.')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Download Movielens-100K dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Specify where to download from\n", | ||
"ML_100K_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'\n", | ||
"# Specify where to save \n", | ||
"dest_directory = op.join(op.curdir, 'data')\n", | ||
"# extract \n", | ||
"extract_directory = op.join(op.curdir, 'processed', 'ml-100k')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Actually download/extract the file!\n", | ||
"maybe_download_and_extract(ML_100K_URL, dest_directory, extract_directory, extract=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Download Glove Pre-Trained Embeddings" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Specify where to download from\n", | ||
"glove_model_url = 'http://nlp.stanford.edu/data/glove.6B.zip'\n", | ||
"# Specify where to save \n", | ||
"dest_directory = op.join(op.curdir, 'data')\n", | ||
"# extract \n", | ||
"extract_directory = op.join(op.curdir, 'processed', 'glove')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Actually download/extract the file!\n", | ||
"maybe_download_and_extract(glove_model_url, dest_directory, extract_directory, extract=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Download AlexNet Pre-Trained Model Weights" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Specify where to download from\n", | ||
"alexnet_weights_URL = 'https://www.cs.toronto.edu/~guerzhoy/tf_alexnet/bvlc_alexnet.npy'\n", | ||
"# Specify where to save \n", | ||
"dest_directory = op.join(op.curdir, 'data')\n", | ||
"# destination file name in processed folder\n", | ||
"extract_directory = op.join(op.curdir, 'processed', 'bvlc_alexnet.npy')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Actually download/extract the file!\n", | ||
"maybe_download_and_extract(alexnet_weights_URL, dest_directory, extract_directory, extract=False, copy_only=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Download UT Zappos50K dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#Specify where to download from\n", | ||
"utzap50k_URL = 'http://vision.cs.utexas.edu/projects/finegrained/utzap50k/ut-zap50k-images-square.zip'\n", | ||
"# Specify where to save \n", | ||
"dest_directory = op.join(op.curdir, 'data')\n", | ||
"# extract \n", | ||
"extract_directory = op.join(op.curdir, 'processed', 'utzap50k')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Actually download/extract the file!\n", | ||
"maybe_download_and_extract(utzap50k_URL, dest_directory, extract_directory, extract=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.