Commit

edited notebooks

aku340 committed Mar 6, 2018
1 parent 86c7ee9 commit 6769ebb
Showing 9 changed files with 3,360 additions and 3,845 deletions.
277 changes: 277 additions & 0 deletions 01_Data_Preperation_01.ipynb
@@ -0,0 +1,277 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you are using the hosted jupyter environment, You will already have all raw data in place. Run all the cells to extract all zipped files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import libraries\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import os.path as op\n",
"from zipfile import ZipFile\n",
"from six.moves import urllib\n",
"import sys\n",
"import shutil\n",
"\n",
"\n",
"\n",
"try:\n",
" from urllib.request import urlretrieve\n",
"except ImportError: # Python 2 compat\n",
" from urllib import urlretrieve\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Script for Downloading Data "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Modified function from here\n",
"# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py\n",
"\n",
"def maybe_download_and_extract(data_url, dest_directory, extract_directory, extract=True, copy_only=False):\n",
" \"\"\"Download and extract model tar file.\n",
" If the pretrained model we're using doesn't already exist, this function\n",
" downloads it from the TensorFlow.org website and unpacks it into a directory.\n",
" \"\"\"\n",
" if not os.path.exists(dest_directory):\n",
" os.makedirs(dest_directory)\n",
" filename = data_url.split('/')[-1]\n",
" filepath = os.path.join(dest_directory, filename)\n",
" if not os.path.exists(filepath):\n",
"\n",
" def _progress(count, block_size, total_size):\n",
" sys.stdout.write('\\r>> Downloading %s %.1f%%' %\n",
" (filename,\n",
" float(count * block_size) / float(total_size) * 100.0))\n",
" sys.stdout.flush()\n",
"\n",
" filepath, _ = urllib.request.urlretrieve(data_url,\n",
" filepath,\n",
" _progress)\n",
" print()\n",
" statinfo = os.stat(filepath)\n",
" print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')\n",
" else:\n",
" print('File Already Exists.')\n",
" \n",
" if extract:\n",
" if not op.exists(extract_directory): \n",
" with ZipFile(filepath, 'r') as z:\n",
" print('Extracting content from {0}'.format(filepath))\n",
" z.extractall(path=extract_directory)\n",
" else:\n",
" print('Folder Already Exists.')\n",
" \n",
" if copy_only:\n",
" if not op.exists(extract_directory):\n",
" shutil.copyfile(filepath, extract_directory)\n",
" print('File Copied.')\n",
" else:\n",
" print('File Already Exists.')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# if raw and processed directories are not created, create these directories \n",
"raw_data_directory = op.join(op.curdir, 'data')\n",
"if not os.path.exists(raw_data_directory):\n",
" os.makedirs(raw_data_directory)\n",
" print('data folder created. You are now ready to download raw files.')\n",
"else:\n",
" print('data folder already exists. You are ready to download raw files.')\n",
" \n",
"\n",
"processed_directory = op.join(op.curdir, 'processed')\n",
"if not os.path.exists(processed_directory):\n",
" os.makedirs(processed_directory)\n",
" print('processed folder created. You are now ready to extract raw files.')\n",
"else:\n",
" print('processed folder already exists. You are ready to extract raw files.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Movielens-100K dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify where to download from\n",
"ML_100K_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# extract \n",
"extract_directory = op.join(op.curdir, 'processed', 'ml-100k')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(ML_100K_URL, dest_directory, extract_directory, extract=True)"
]
},
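{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (a minimal sketch, assuming the archive unpacks into a nested `ml-100k/` folder containing the tab-separated `u.data` ratings file), we can load the ratings with pandas and inspect the first few rows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: peek at the extracted ratings (assumes the standard ml-100k layout)\n",
"ratings_path = op.join(extract_directory, 'ml-100k', 'u.data')\n",
"if op.exists(ratings_path):\n",
"    ratings = pd.read_csv(ratings_path, sep='\\t',\n",
"                          names=['user_id', 'item_id', 'rating', 'timestamp'])\n",
"    print(ratings.head())\n",
"else:\n",
"    print('u.data not found -- run the download cell above first.')"
]
},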
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Glove Pre-Trained Embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify where to download from\n",
"glove_model_url = 'http://nlp.stanford.edu/data/glove.6B.zip'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# extract \n",
"extract_directory = op.join(op.curdir, 'processed', 'glove')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(glove_model_url, dest_directory, extract_directory, extract=True)"
]
},
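{
"cell_type": "markdown",
"metadata": {},
"source": [
"To verify the embeddings, here is a small sketch (assuming the archive unpacks into plain-text files such as `glove.6B.50d.txt`, with one word followed by its vector components per line) that parses the first few entries."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at a few GloVe vectors (assumes glove.6B.50d.txt inside the extract folder)\n",
"glove_path = op.join(op.curdir, 'processed', 'glove', 'glove.6B.50d.txt')\n",
"if op.exists(glove_path):\n",
"    with open(glove_path, encoding='utf-8') as f:\n",
"        for _ in range(3):\n",
"            parts = f.readline().split()\n",
"            word, vector = parts[0], np.asarray(parts[1:], dtype=np.float32)\n",
"            print(word, vector.shape)\n",
"else:\n",
"    print('GloVe file not found -- run the download cell above first.')"
]
},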
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download AlexNet Pre-Trained Model Weights"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify where to download from\n",
"alexnet_weights_URL = 'https://www.cs.toronto.edu/~guerzhoy/tf_alexnet/bvlc_alexnet.npy'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# destination file name in processed folder\n",
"extract_directory = op.join(op.curdir, 'processed', 'bvlc_alexnet.npy')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(alexnet_weights_URL, dest_directory, extract_directory, extract=False, copy_only=True)"
]
},
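{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal check (assuming `bvlc_alexnet.npy` is a pickled dict mapping layer names to `[weights, biases]` arrays, as distributed at the URL above): load it with numpy and list the layers and their shapes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the copied AlexNet weights (assumes a pickled dict of layer -> [W, b]).\n",
"# encoding='latin1' is needed because the file was pickled under Python 2;\n",
"# newer numpy versions may also require allow_pickle=True.\n",
"weights_path = op.join(op.curdir, 'processed', 'bvlc_alexnet.npy')\n",
"if op.exists(weights_path):\n",
"    net_data = np.load(weights_path, encoding='latin1').item()\n",
"    for layer, params in sorted(net_data.items()):\n",
"        print(layer, [p.shape for p in params])\n",
"else:\n",
"    print('Weights file not found -- run the download cell above first.')"
]
},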
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download UT Zappos50K dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Specify where to download from\n",
"utzap50k_URL = 'http://vision.cs.utexas.edu/projects/finegrained/utzap50k/ut-zap50k-images-square.zip'\n",
"# Specify where to save \n",
"dest_directory = op.join(op.curdir, 'data')\n",
"# extract \n",
"extract_directory = op.join(op.curdir, 'processed', 'utzap50k')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Actually download/extract the file!\n",
"maybe_download_and_extract(utzap50k_URL, dest_directory, extract_directory, extract=True)"
]
},
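{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough verification (a sketch, assuming the archive unpacks into nested category folders of `.jpg` images), count the extracted image files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count the extracted UT Zappos50K images (assumes nested folders of .jpg files)\n",
"zappos_dir = op.join(op.curdir, 'processed', 'utzap50k')\n",
"n_images = 0\n",
"for _root, _dirs, files in os.walk(zappos_dir):\n",
"    n_images += sum(1 for f in files if f.lower().endswith('.jpg'))\n",
"print('Found {0} images under {1}'.format(n_images, zappos_dir))"
]
},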
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}