Commit

edited notebooks

aku340 committed Mar 6, 2018
1 parent 86c7ee9 commit 6769ebb
Showing 9 changed files with 3,360 additions and 3,845 deletions.
277 changes: 277 additions & 0 deletions 01_Data_Preperation_01.ipynb
@@ -0,0 +1,277 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you are using the hosted jupyter environment, You will already have all raw data in place. Run all the cells to extract all zipped files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import libraries\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import os.path as op\n",
"from zipfile import ZipFile\n",
"from six.moves import urllib\n",
"import sys\n",
"import shutil\n",
"\n",
"\n",
"\n",
"try:\n",
" from urllib.request import urlretrieve\n",
"except ImportError: # Python 2 compat\n",
" from urllib import urlretrieve\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Script for Downloading Data "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Modified function from here\n",
"# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py\n",
"\n",
"def maybe_download_and_extract(data_url, dest_directory, extract_directory, extract=True, copy_only=False):\n",
" \"\"\"Download and extract model tar file.\n",
" If the pretrained model we're using doesn't already exist, this function\n",
" downloads it from the TensorFlow.org website and unpacks it into a directory.\n",
" \"\"\"\n",
" if not os.path.exists(dest_directory):\n",
" os.makedirs(dest_directory)\n",
" filename = data_url.split('/')[-1]\n",
" filepath = os.path.join(dest_directory, filename)\n",
" if not os.path.exists(filepath):\n",
"\n",
" def _progress(count, block_size, total_size):\n",
" sys.stdout.write('\\r>> Downloading %s %.1f%%' %\n",
" (filename,\n",
" float(count * block_size) / float(total_size) * 100.0))\n",
" sys.stdout.flush()\n",
"\n",
" filepath, _ = urllib.request.urlretrieve(data_url,\n",
" filepath,\n",
" _progress)\n",
" print()\n",
" statinfo = os.stat(filepath)\n",
" print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')\n",
" else:\n",
" print('File Already Exists.')\n",
" \n",
" if extract:\n",
" if not op.exists(extract_directory): \n",
" with ZipFile(filepath, 'r') as z:\n",
" print('Extracting content from {0}'.format(filepath))\n",
" z.extractall(path=extract_directory)\n",
" else:\n",
" print('Folder Already Exists.')\n",
" \n",
" if copy_only:\n",
" if not op.exists(extract_directory):\n",
" shutil.copyfile(filepath, extract_directory)\n",
" print('File Copied.')\n",
" else:\n",
" print('File Already Exists.')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# if raw and processed directories are not created, create these directories \n",
"raw_data_directory = op.join(op.curdir, 'data')\n",
"if not os.path.exists(raw_data_directory):\n",
" os.makedirs(raw_data_directory)\n",
" print('data folder created. You are now ready to download raw files.')\n",
"else:\n",
" print('data folder already exists. You are ready to download raw files.')\n",
" \n",
"\n",
"processed_directory = op.join(op.curdir, 'processed')\n",
"if not os.path.exists(processed_directory):\n",
" os.makedirs(processed_directory)\n",
" print('processed folder created. You are now ready to extract raw files.')\n",
"else:\n",
" print('processed folder already exists. You are ready to extract raw files.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Movielens-100K dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify where to download from\n",
"ML_100K_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# extract \n",
"extract_directory = op.join(op.curdir, 'processed', 'ml-100k')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(ML_100K_URL, dest_directory, extract_directory, extract=True)"
]
},
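{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (a minimal sketch, assuming the archive unpacks into a nested `ml-100k/` folder containing the tab-separated `u.data` ratings file), we can load the ratings with pandas and inspect the first few rows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: peek at the extracted ratings (assumes the standard ml-100k layout)\n",
"ratings_path = op.join(extract_directory, 'ml-100k', 'u.data')\n",
"if op.exists(ratings_path):\n",
"    ratings = pd.read_csv(ratings_path, sep='\\t',\n",
"                          names=['user_id', 'item_id', 'rating', 'timestamp'])\n",
"    print(ratings.head())\n",
"else:\n",
"    print('u.data not found -- run the download cell above first.')"
]
},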
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Glove Pre-Trained Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify where to download from\n",
"glove_model_url = 'http://nlp.stanford.edu/data/glove.6B.zip'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# extract \n",
"extract_directory = op.join(op.curdir, 'processed', 'glove')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(glove_model_url, dest_directory, extract_directory, extract=True)"
]
},
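{
"cell_type": "markdown",
"metadata": {},
"source": [
"To verify the embeddings, here is a small sketch (assuming the archive unpacks into plain-text files such as `glove.6B.50d.txt`, with one word followed by its vector components per line) that parses the first few entries."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at a few GloVe vectors (assumes glove.6B.50d.txt inside the extract folder)\n",
"glove_path = op.join(op.curdir, 'processed', 'glove', 'glove.6B.50d.txt')\n",
"if op.exists(glove_path):\n",
"    with open(glove_path, encoding='utf-8') as f:\n",
"        for _ in range(3):\n",
"            parts = f.readline().split()\n",
"            word, vector = parts[0], np.asarray(parts[1:], dtype=np.float32)\n",
"            print(word, vector.shape)\n",
"else:\n",
"    print('GloVe file not found -- run the download cell above first.')"
]
},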
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download AlexNet Pre-Trained Model Weights"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify where to download from\n",
"alexnet_weights_URL = 'https://www.cs.toronto.edu/~guerzhoy/tf_alexnet/bvlc_alexnet.npy'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# destination file name in processed folder\n",
"extract_directory = op.join(op.curdir, 'processed', 'bvlc_alexnet.npy')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(alexnet_weights_URL, dest_directory, extract_directory, extract=False, copy_only=True)"
]
},
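{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal check (assuming `bvlc_alexnet.npy` is a pickled dict mapping layer names to `[weights, biases]` arrays, as distributed at the URL above): load it with numpy and list the layers and their shapes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the copied AlexNet weights (assumes a pickled dict of layer -> [W, b]).\n",
"# encoding='latin1' is needed because the file was pickled under Python 2;\n",
"# newer numpy versions may also require allow_pickle=True.\n",
"weights_path = op.join(op.curdir, 'processed', 'bvlc_alexnet.npy')\n",
"if op.exists(weights_path):\n",
"    net_data = np.load(weights_path, encoding='latin1').item()\n",
"    for layer, params in sorted(net_data.items()):\n",
"        print(layer, [p.shape for p in params])\n",
"else:\n",
"    print('Weights file not found -- run the download cell above first.')"
]
},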
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download UT Zappos50K dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify where to download from\n",
"utzap50k_URL = 'http://vision.cs.utexas.edu/projects/finegrained/utzap50k/ut-zap50k-images-square.zip'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# extract \n",
"extract_directory = op.join(op.curdir, 'processed', 'utzap50k')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(utzap50k_URL, dest_directory, extract_directory, extract=True)"
]
},
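{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough verification (a sketch, assuming the archive unpacks into nested category folders of `.jpg` images), count the extracted image files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count the extracted UT Zappos50K images (assumes nested folders of .jpg files)\n",
"zappos_dir = op.join(op.curdir, 'processed', 'utzap50k')\n",
"n_images = 0\n",
"for _root, _dirs, files in os.walk(zappos_dir):\n",
"    n_images += sum(1 for f in files if f.lower().endswith('.jpg'))\n",
"print('Found {0} images under {1}'.format(n_images, zappos_dir))"
]
},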
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}