From 4ebbb41a08e73f731d556d97cf76a2df52a75b42 Mon Sep 17 00:00:00 2001
From: marhamil723
Date: Wed, 24 Jul 2019 18:05:56 -0700
Subject: [PATCH] remove brittle dataset downloading from demos

---
 build.sbt                                          | 11 ++--
 .../Classification - Adult Census.ipynb            | 10 +--
 ...fication - Before and After MMLSpark.ipynb      | 14 +----
 ...ning - CIFAR10 Convolutional Network.ipynb      | 61 +------------------
 .../DeepLearning - Transfer Learning.ipynb         | 47 +-------------
 ...meterTuning - Fighting Breast Cancer.ipynb      | 19 +-----
 ...n - Flight Delays with DataCleaning.ipynb       |  6 +-
 .../samples/Regression - Auto Imports.ipynb        | 51 +---------------
 .../samples/Regression - Flight Delays.ipynb       | 19 ++----
 ...parkServing - Deploying a Classifier.ipynb      | 14 ++---
 ... - Amazon Book Reviews with Word2Vec.ipynb      | 11 +---
 .../TextAnalytics - Amazon Book Reviews.ipynb      | 11 +---
 pipeline.yaml                                      |  4 +-
 .../ml/nbtest/DatabricksUtilities.scala            |  2 +-
 14 files changed, 37 insertions(+), 243 deletions(-)

diff --git a/build.sbt b/build.sbt
index 5ae75ae883..b9ab20796f 100644
--- a/build.sbt
+++ b/build.sbt
@@ -234,15 +234,18 @@ genBuildInfo := {
   val buildInfo = s"""
-      |MMLSpark Build Release Info
+      |MMLSpark Build and Release Information
       |---------------
       |
       |### Maven Coordinates
       | `${organization.value}:${name.value}_2.11:${version.value}`
       |
-      |### Documentation Uploaded:
-      |[Scala](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
-      |[Python](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
+      |### Maven Resolver
+      | `https://mmlspark.azureedge.net/maven`
+      |
+      |### Documentation Pages:
+      |[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
+      |[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
       |
    """.stripMargin
diff --git a/notebooks/samples/Classification - Adult Census.ipynb b/notebooks/samples/Classification - Adult Census.ipynb
index 6f0f6ce71d..ef93f70fde 100644
--- a/notebooks/samples/Classification - Adult Census.ipynb
+++ b/notebooks/samples/Classification - Adult Census.ipynb
@@ -34,12 +34,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFilePath = \"AdultCensusIncome.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFilePath, dtype={\" hours-per-week\": np.float64}))\n",
-    "data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
+    "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
     "train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
     "train.limit(10).toPandas()"
    ]
   },
@@ -64,7 +60,7 @@
    "source": [
     "from mmlspark.train import TrainClassifier\n",
     "from pyspark.ml.classification import LogisticRegression\n",
-    "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)\n",
+    "model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)\n",
     "model.write().overwrite().save(\"adultCensusIncomeModel.mml\")"
    ]
   },
diff --git a/notebooks/samples/Classification - Before and After MMLSpark.ipynb b/notebooks/samples/Classification - Before and After MMLSpark.ipynb
index 4bbf0c5663..4bcecd2f43 100644
--- a/notebooks/samples/Classification - Before and After MMLSpark.ipynb
+++ b/notebooks/samples/Classification - Before and After MMLSpark.ipynb
@@ -41,17 +41,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, StructType, StructField\n",
-    "import os, urllib\n",
-    "\n",
-    "dataFilePath = \"BookReviewsFromAmazon10K.tsv\"\n",
-    "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
-    "                         StructField(\"text\", StringType(), False)])\n",
-    "\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "rawData = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\"\\t\", header=None), textSchema)\n",
+    "rawData = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
     "rawData.show(5)"
    ]
   },
@@ -75,7 +65,7 @@
    "outputs": [],
    "source": [
     "from pyspark.sql.functions import udf\n",
-    "from pyspark.sql.types import LongType, FloatType, DoubleType\n",
+    "from pyspark.sql.types import *\n",
     "def wordCount(s):\n",
     "    return len(s.split())\n",
     "def wordLength(s):\n",
diff --git a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb b/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb
index 41399e6fbb..59099a39b3 100644
--- a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb
+++ b/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb
@@ -37,8 +37,7 @@
     "\n",
     "# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
     "# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
-    "dataFile = \"cifar-10-python.tar.gz\"\n",
-    "dataURL = cdnURL + \"/CIFAR10/\" + dataFile"
+    "imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
    ]
   },
   {
@@ -54,24 +53,11 @@
     "modelName = \"ConvNet\"\n",
     "modelDir = \"dbfs:///models/\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "mml-deploy": "local",
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "modelName = \"ConvNet\"\n",
-    "modelDir = \"file:\" + abspath(\"models\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Get the model and extract the data."
+    "Get the model"
    ]
   },
   {
@@ -80,49 +66,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os, tarfile, pickle\n",
-    "import urllib.request\n",
-    "\n",
     "d = ModelDownloader(spark, modelDir)\n",
-    "model = d.downloadByName(modelName)\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(dataURL, dataFile)\n",
-    "with tarfile.open(dataFile, \"r:gz\") as f:\n",
-    "    test_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/test_batch\"),\n",
-    "                             encoding=\"latin1\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Preprocess the images."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pyspark.sql.functions import col\n",
-    "from pyspark.sql.types import *\n",
-    "\n",
-    "def reshape_image(record):\n",
-    "    image, label, filename = record\n",
-    "    data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
-    "    return data, label, filename\n",
-    "\n",
-    "convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
-    "\n",
-    "image_rdd = zip(test_dict[\"data\"], test_dict[\"labels\"], test_dict[\"filenames\"])\n",
-    "image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
-    "\n",
-    "imagesWithLabels = image_rdd.toDF([\"images\", \"labels\", \"filename\"])\n",
-    "imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))\n",
-    "imagesWithLabels.printSchema()\n",
-    "\n",
-    "imagesWithLabels.cache()"
+    "model = d.downloadByName(modelName)\n"
    ]
   },
   {
diff --git a/notebooks/samples/DeepLearning - Transfer Learning.ipynb b/notebooks/samples/DeepLearning - Transfer Learning.ipynb
index 3774cccce4..ebdb54d94f 100644
--- a/notebooks/samples/DeepLearning - Transfer Learning.ipynb
+++ b/notebooks/samples/DeepLearning - Transfer Learning.ipynb
@@ -14,7 +14,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "First, we load first batch of CIFAR-10 training data into NumPy array."
+    "Load DNN Model and pick one of the inner layers as feature output"
    ]
   },
   {
@@ -29,39 +29,7 @@
     "from os.path import abspath\n",
     "from pyspark.sql.functions import col, udf\n",
     "from pyspark.sql.types import *\n",
-    "\n",
-    "cdnURL = \"https://mmlspark.azureedge.net/datasets\"\n",
-    "\n",
-    "# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
-    "# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
-    "dataFile = \"cifar-10-python.tar.gz\"\n",
-    "dataURL = cdnURL + \"/CIFAR10/\" + dataFile\n",
-    "\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(dataURL, dataFile)\n",
-    "with tarfile.open(dataFile, \"r:gz\") as f:\n",
-    "    train_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/data_batch_1\"),\n",
-    "                             encoding=\"latin1\")\n",
-    "\n",
-    "train_data = np.array(train_dict[\"data\"])\n",
-    "train_labels = np.array(train_dict[\"labels\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Load DNN Model and pick one of the inner layers as feature output"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
     "modelName = \"ConvNet\"\n",
-    "modelDir = \"wasb:///models/\"\n",
     "modelDir = \"file:\" + abspath(\"models\")\n",
     "d = ModelDownloader(spark, modelDir)\n",
     "model = d.downloadByName(modelName)\n",
@@ -83,18 +51,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def reshape_image(record):\n",
-    "    image, label = record\n",
-    "    data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
-    "    return data, int(label)\n",
-    "\n",
-    "convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
-    "\n",
-    "image_rdd = zip(train_data,train_labels)\n",
-    "image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
-    "\n",
-    "imagesWithLabels = image_rdd.toDF([\"images\", \"labels\"])\n",
-    "imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))"
+    "imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
    ]
   },
   {
diff --git a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb b/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb
index 80118f2e58..8e4f1ac4a3 100644
--- a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb
+++ b/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb
@@ -17,8 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField"
+    "import pandas as pd\n"
    ]
   },
   {
@@ -34,21 +33,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFilePath = \"BreastCancer.csv\"\n",
-    "textSchema = StructType([StructField(\"Label\", IntegerType(), False),\n",
-    "                         StructField(\"Clump Thickness\", IntegerType(), False),\n",
-    "                         StructField(\"Uniformity of Cell Size\", IntegerType(), False),\n",
-    "                         StructField(\"Uniformity of Cell Shape\", IntegerType(), False),\n",
-    "                         StructField(\"Marginal Adhesion\", IntegerType(), False),\n",
-    "                         StructField(\"Single Epithelial Cell Size\", IntegerType(), False),\n",
-    "                         StructField(\"Bare Nuclei\", FloatType(), False),\n",
-    "                         StructField(\"Bland Chromatin\", IntegerType(), False),\n",
-    "                         StructField(\"Normal Nucleoli\", IntegerType(), False),\n",
-    "                         StructField(\"Mitoses\", IntegerType(), False),])\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\",\", header=0, na_values=\"?\"), textSchema)\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\")\n",
     "tune, test = data.randomSplit([0.80, 0.20])\n",
     "tune.limit(10).toPandas()"
    ]
diff --git a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb b/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb
index 21eb2ab677..fd4bd8d9c7 100644
--- a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb
+++ b/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb
@@ -51,11 +51,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"On_Time_Performance_2012_9.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "flightDelay = spark.createDataFrame(pd.read_csv(dataFile))\n",
+    "flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
     "# print some basic info\n",
     "print(\"records read: \" + str(flightDelay.count()))\n",
     "print(\"Schema: \")\n",
diff --git a/notebooks/samples/Regression - Auto Imports.ipynb b/notebooks/samples/Regression - Auto Imports.ipynb
index c808aa04ac..5824bc5681 100644
--- a/notebooks/samples/Regression - Auto Imports.ipynb
+++ b/notebooks/samples/Regression - Auto Imports.ipynb
@@ -29,62 +29,13 @@
     "using `pandas.read_csv()`"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Declare the schema for the data that will be converted from the pandas\n",
-    "DataFrame to a Spark DataFrame. Allow all fields to be nullable, so that\n",
-    "missing values can be handled appropriately, such as replacing them with\n",
-    "the mean or median value for that column."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import LongType, StringType, DoubleType, StructType, StructField\n",
-    "\n",
-    "colSchema = (\n",
-    "    (\"symboling\", LongType), (\"normalized-losses\", DoubleType), (\"make\", StringType),\n",
-    "    (\"fuel-type\", StringType), (\"aspiration\", StringType), (\"body-style\", StringType),\n",
-    "    (\"drive-wheels\", StringType), (\"engine-location\", StringType), (\"wheel-base\", DoubleType),\n",
-    "    (\"length\", DoubleType), (\"width\", DoubleType), (\"height\", DoubleType),\n",
-    "    (\"curb-weight\", LongType), (\"engine-type\", StringType), (\"num-of-cylinders\", StringType),\n",
-    "    (\"engine-size\", LongType), (\"fuel-system\", StringType), (\"bore\", DoubleType),\n",
-    "    (\"stroke\", DoubleType), (\"compression-ratio\", DoubleType), (\"horsepower\", DoubleType),\n",
-    "    (\"peak-rpm\", DoubleType), (\"city-mpg\", LongType), (\"highway-mpg\", LongType),\n",
-    "    (\"price\", DoubleType))\n",
-    "\n",
-    "tableSchema = StructType([StructField(column[0], column[1](),True) for column in colSchema])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Read the data from the AutomobilePriceRaw.csv file into a pandas dataframe.\n",
-    "Specify possible reprsentations of missing values, and drop the `num-of-doors`\n",
-    "column as the data is read in."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"AutomobilePriceRaw.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFile,\n",
-    "                                          na_values=[\"\", \" \", \"?\"],\n",
-    "                                          usecols=tableSchema.names),\n",
-    "                             tableSchema)"
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\")\n"
    ]
   },
   {
diff --git a/notebooks/samples/Regression - Flight Delays.ipynb b/notebooks/samples/Regression - Flight Delays.ipynb
index b6f7a6ace0..6733b9b53d 100644
--- a/notebooks/samples/Regression - Flight Delays.ipynb
+++ b/notebooks/samples/Regression - Flight Delays.ipynb
@@ -37,21 +37,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# load raw data from small-sized 30 MB CSV file (trimmed to contain just what we use)\n",
-    "dataFilePath = \"On_Time_Performance_2012_9.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath,\n",
-    "                               dataFilePath)\n",
-    "flightDelay = spark.createDataFrame(\n",
-    "    pd.read_csv(dataFilePath,\n",
-    "                dtype={\"Month\": np.float64, \"Quarter\": np.float64,\n",
-    "                       \"DayofMonth\": np.float64, \"DayOfWeek\": np.float64,\n",
-    "                       \"OriginAirportID\": np.float64, \"DestAirportID\": np.float64,\n",
-    "                       \"CRSDepTime\": np.float64, \"CRSArrTime\": np.float64}))\n",
-    "# Print information on the dataset we loaded\n",
-    "print(\"Records read: \" + str(flightDelay.count()))\n",
-    "print(\"Schema:\")\n",
+    "flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
+    "# print some basic info\n",
+    "print(\"records read: \" + str(flightDelay.count()))\n",
+    "print(\"Schema: \")\n",
     "flightDelay.printSchema()\n",
     "flightDelay.limit(10).toPandas()"
    ]
   },
diff --git a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb b/notebooks/samples/SparkServing - Deploying a Classifier.ipynb
index fd8acbfa9c..400cabd995 100644
--- a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb
+++ b/notebooks/samples/SparkServing - Deploying a Classifier.ipynb
@@ -35,12 +35,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFilePath = \"AdultCensusIncome.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFilePath, dtype={\" hours-per-week\": np.float64}))\n",
-    "data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
+    "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
     "train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
     "train.limit(10).toPandas()"
    ]
@@ -67,7 +63,7 @@
    "source": [
     "from mmlspark.train import TrainClassifier\n",
     "from pyspark.ml.classification import LogisticRegression\n",
-    "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)"
+    "model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)"
    ]
   },
   {
@@ -147,7 +143,7 @@
    "outputs": [],
    "source": [
     "import requests\n",
-    "data = u'{\" education\":\" 10th\",\" marital-status\":\" Divorced\",\" hours-per-week\":40.0}'\n",
+    "data = u'{\"education\":\" 10th\",\"marital-status\":\"Divorced\",\"hours-per-week\":40.0}'\n",
     "r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n",
     "print(\"Response {}\".format(r.text))"
    ]
@@ -159,7 +155,7 @@
    "outputs": [],
    "source": [
     "import requests\n",
-    "data = u'{\" education\":\" Masters\",\" marital-status\":\" Married-civ-spouse\",\" hours-per-week\":40.0}'\n",
+    "data = u'{\"education\":\" Masters\",\"marital-status\":\"Married-civ-spouse\",\"hours-per-week\":40.0}'\n",
     "r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n",
     "print(\"Response {}\".format(r.text))"
    ]
diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb b/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
index bb5a95bec3..9e3cd73cda 100644
--- a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
+++ b/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
@@ -17,8 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, StructType, StructField"
+    "import pandas as pd\n"
    ]
   },
   {
@@ -27,13 +26,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n",
-    "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
-    "                         StructField(\"text\", StringType(), False)])\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
     "data.limit(10).toPandas()"
    ]
   },
diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb b/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb
index db3e445ba4..402f455900 100644
--- a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb
+++ b/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb
@@ -17,8 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, StructType, StructField"
+    "import pandas as pd\n"
    ]
   },
   {
@@ -27,13 +26,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n",
-    "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
-    "                         StructField(\"text\", StringType(), False)])\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
     "data.limit(10).toPandas()"
    ]
   },
diff --git a/pipeline.yaml b/pipeline.yaml
index 71b50ca440..96ddc70518 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -154,8 +154,8 @@ jobs:
       azureSubscription: 'Findable Incubation(ca9d21ff-2a46-4e8b-bf06-8d65242342e5)'
       scriptLocation: inlineScript
       inlineScript: |
-        pip install requests || pip install requests
-        sbt setup || sbt setup
+        (timeout 30 pip install requests) || (echo "retrying" && timeout 30 pip install requests)
+        (timeout 200 sbt setup) || (echo "retrying" && timeout 200 sbt setup)
   - task: AzureCLI@1
     displayName: 'Unit Test'
     timeoutInMinutes: 20
diff --git a/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala b/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala
index 5798d63ef8..f740da642b 100644
--- a/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala
+++ b/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala
@@ -54,7 +54,7 @@ object DatabricksUtilities {
   // MMLSpark info
   val truncatedScalaVersion: String = BuildInfo.scalaVersion
     .split(".".toCharArray.head).dropRight(1).mkString(".")
-  val version = s"com.microsoft.ml.spark:${BuildInfo.name}_$truncatedScalaVersion:${BuildInfo.version}"
+  val version = s"com.microsoft.ml.spark:${BuildInfo.name}_$truncatedScalaVersion:0.17+85-1d8f34cf"
   val repository = "https://mmlspark.azureedge.net/maven"
   val libraries: String = List(