From 4ebbb41a08e73f731d556d97cf76a2df52a75b42 Mon Sep 17 00:00:00 2001
From: marhamil723
Date: Wed, 24 Jul 2019 18:05:56 -0700
Subject: [PATCH] remove brittle dataset downloading from demos

---
 build.sbt                                          | 11 ++--
 .../Classification - Adult Census.ipynb            | 10 +--
 ...fication - Before and After MMLSpark.ipynb      | 14 +----
 ...ning - CIFAR10 Convolutional Network.ipynb      | 61 +------------------
 .../DeepLearning - Transfer Learning.ipynb         | 47 +-------------
 ...meterTuning - Fighting Breast Cancer.ipynb      | 19 +-----
 ...n - Flight Delays with DataCleaning.ipynb       |  6 +-
 .../samples/Regression - Auto Imports.ipynb        | 51 +---------------
 .../samples/Regression - Flight Delays.ipynb       | 19 ++----
 ...parkServing - Deploying a Classifier.ipynb      | 14 ++---
 ... - Amazon Book Reviews with Word2Vec.ipynb      | 11 +---
 .../TextAnalytics - Amazon Book Reviews.ipynb      | 11 +---
 pipeline.yaml                                      |  4 +-
 .../ml/nbtest/DatabricksUtilities.scala            |  2 +-
 14 files changed, 37 insertions(+), 243 deletions(-)

diff --git a/build.sbt b/build.sbt
index 5ae75ae883..b9ab20796f 100644
--- a/build.sbt
+++ b/build.sbt
@@ -234,15 +234,18 @@ genBuildInfo := {
   val buildInfo = s"""
-      |MMLSpark Build Release Info
+      |MMLSpark Build and Release Information
       |---------------
       |
       |### Maven Coordinates
       | `${organization.value}:${name.value}_2.11:${version.value}`
       |
-      |### Documentation Uploaded:
-      |[Scala](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
-      |[Python](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
+      |### Maven Resolver
+      | `https://mmlspark.azureedge.net/maven`
+      |
+      |### Documentation Pages:
+      |[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
+      |[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
       |
    """.stripMargin
diff --git a/notebooks/samples/Classification - Adult Census.ipynb b/notebooks/samples/Classification - Adult Census.ipynb
index 6f0f6ce71d..ef93f70fde 100644
--- a/notebooks/samples/Classification - Adult Census.ipynb
+++ b/notebooks/samples/Classification - Adult Census.ipynb
@@ -34,12 +34,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFilePath = \"AdultCensusIncome.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFilePath, dtype={\" hours-per-week\": np.float64}))\n",
-    "data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
+    "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
     "train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
     "train.limit(10).toPandas()"
    ]
   },
@@ -64,7 +60,7 @@
    "source": [
     "from mmlspark.train import TrainClassifier\n",
     "from pyspark.ml.classification import LogisticRegression\n",
-    "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)\n",
+    "model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)\n",
     "model.write().overwrite().save(\"adultCensusIncomeModel.mml\")"
    ]
   },
diff --git a/notebooks/samples/Classification - Before and After MMLSpark.ipynb b/notebooks/samples/Classification - Before and After MMLSpark.ipynb
index 4bbf0c5663..4bcecd2f43 100644
--- a/notebooks/samples/Classification - Before and After MMLSpark.ipynb
+++ b/notebooks/samples/Classification - Before and After MMLSpark.ipynb
@@ -41,17 +41,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, StructType, StructField\n",
-    "import os, urllib\n",
-    "\n",
-    "dataFilePath = \"BookReviewsFromAmazon10K.tsv\"\n",
-    "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
-    "                         StructField(\"text\", StringType(), False)])\n",
-    "\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "rawData = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\"\\t\", header=None), textSchema)\n",
+    "rawData = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
     "rawData.show(5)"
    ]
   },
@@ -75,7 +65,7 @@
    "outputs": [],
    "source": [
     "from pyspark.sql.functions import udf\n",
-    "from pyspark.sql.types import LongType, FloatType, DoubleType\n",
+    "from pyspark.sql.types import *\n",
     "def wordCount(s):\n",
     "    return len(s.split())\n",
     "def wordLength(s):\n",
diff --git a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb b/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb
index 41399e6fbb..59099a39b3 100644
--- a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb
+++ b/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb
@@ -37,8 +37,7 @@
     "\n",
     "# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
     "# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
-    "dataFile = \"cifar-10-python.tar.gz\"\n",
-    "dataURL = cdnURL + \"/CIFAR10/\" + dataFile"
+    "imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
    ]
   },
   {
@@ -54,24 +53,11 @@
     "modelName = \"ConvNet\"\n",
     "modelDir = \"dbfs:///models/\""
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "mml-deploy": "local",
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "modelName = \"ConvNet\"\n",
-    "modelDir = \"file:\" + abspath(\"models\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Get the model and extract the data."
+    "Get the model"
    ]
   },
   {
@@ -80,49 +66,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os, tarfile, pickle\n",
-    "import urllib.request\n",
-    "\n",
     "d = ModelDownloader(spark, modelDir)\n",
-    "model = d.downloadByName(modelName)\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(dataURL, dataFile)\n",
-    "with tarfile.open(dataFile, \"r:gz\") as f:\n",
-    "    test_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/test_batch\"),\n",
-    "                             encoding=\"latin1\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Preprocess the images."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pyspark.sql.functions import col\n",
-    "from pyspark.sql.types import *\n",
-    "\n",
-    "def reshape_image(record):\n",
-    "    image, label, filename = record\n",
-    "    data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
-    "    return data, label, filename\n",
-    "\n",
-    "convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
-    "\n",
-    "image_rdd = zip(test_dict[\"data\"], test_dict[\"labels\"], test_dict[\"filenames\"])\n",
-    "image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
-    "\n",
-    "imagesWithLabels = image_rdd.toDF([\"images\", \"labels\", \"filename\"])\n",
-    "imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))\n",
-    "imagesWithLabels.printSchema()\n",
-    "\n",
-    "imagesWithLabels.cache()"
+    "model = d.downloadByName(modelName)\n"
    ]
   },
   {
diff --git a/notebooks/samples/DeepLearning - Transfer Learning.ipynb b/notebooks/samples/DeepLearning - Transfer Learning.ipynb
index 3774cccce4..ebdb54d94f 100644
--- a/notebooks/samples/DeepLearning - Transfer Learning.ipynb
+++ b/notebooks/samples/DeepLearning - Transfer Learning.ipynb
@@ -14,7 +14,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "First, we load first batch of CIFAR-10 training data into NumPy array."
+    "Load DNN Model and pick one of the inner layers as feature output"
    ]
   },
   {
@@ -29,39 +29,7 @@
     "from os.path import abspath\n",
     "from pyspark.sql.functions import col, udf\n",
     "from pyspark.sql.types import *\n",
-    "\n",
-    "cdnURL = \"https://mmlspark.azureedge.net/datasets\"\n",
-    "\n",
-    "# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
-    "# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
-    "dataFile = \"cifar-10-python.tar.gz\"\n",
-    "dataURL = cdnURL + \"/CIFAR10/\" + dataFile\n",
-    "\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(dataURL, dataFile)\n",
-    "with tarfile.open(dataFile, \"r:gz\") as f:\n",
-    "    train_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/data_batch_1\"),\n",
-    "                             encoding=\"latin1\")\n",
-    "\n",
-    "train_data = np.array(train_dict[\"data\"])\n",
-    "train_labels = np.array(train_dict[\"labels\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Load DNN Model and pick one of the inner layers as feature output"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
     "modelName = \"ConvNet\"\n",
-    "modelDir = \"wasb:///models/\"\n",
     "modelDir = \"file:\" + abspath(\"models\")\n",
     "d = ModelDownloader(spark, modelDir)\n",
     "model = d.downloadByName(modelName)\n",
@@ -83,18 +51,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def reshape_image(record):\n",
-    "    image, label = record\n",
-    "    data = [float(x) for x in image.reshape(3,32,32).flatten()]\n",
-    "    return data, int(label)\n",
-    "\n",
-    "convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n",
-    "\n",
-    "image_rdd = zip(train_data,train_labels)\n",
-    "image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n",
-    "\n",
-    "imagesWithLabels = image_rdd.toDF([\"images\", \"labels\"])\n",
-    "imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))"
+    "imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
    ]
   },
   {
diff --git a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb b/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb
index 80118f2e58..8e4f1ac4a3 100644
--- a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb
+++ b/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb
@@ -17,8 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField"
+    "import pandas as pd\n"
    ]
   },
   {
@@ -34,21 +33,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFilePath = \"BreastCancer.csv\"\n",
-    "textSchema = StructType([StructField(\"Label\", IntegerType(), False),\n",
-    "                         StructField(\"Clump Thickness\", IntegerType(), False),\n",
-    "                         StructField(\"Uniformity of Cell Size\", IntegerType(), False),\n",
-    "                         StructField(\"Uniformity of Cell Shape\", IntegerType(), False),\n",
-    "                         StructField(\"Marginal Adhesion\", IntegerType(), False),\n",
-    "                         StructField(\"Single Epithelial Cell Size\", IntegerType(), False),\n",
-    "                         StructField(\"Bare Nuclei\", FloatType(), False),\n",
-    "                         StructField(\"Bland Chromatin\", IntegerType(), False),\n",
-    "                         StructField(\"Normal Nucleoli\", IntegerType(), False),\n",
-    "                         StructField(\"Mitoses\", IntegerType(), False),])\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFilePath, sep=\",\", header=0, na_values=\"?\"), textSchema)\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\")\n",
     "tune, test = data.randomSplit([0.80, 0.20])\n",
     "tune.limit(10).toPandas()"
    ]
diff --git a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb b/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb
index 21eb2ab677..fd4bd8d9c7 100644
--- a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb
+++ b/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb
@@ -51,11 +51,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"On_Time_Performance_2012_9.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "flightDelay = spark.createDataFrame(pd.read_csv(dataFile))\n",
+    "flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
     "# print some basic info\n",
     "print(\"records read: \" + str(flightDelay.count()))\n",
     "print(\"Schema: \")\n",
diff --git a/notebooks/samples/Regression - Auto Imports.ipynb b/notebooks/samples/Regression - Auto Imports.ipynb
index c808aa04ac..5824bc5681 100644
--- a/notebooks/samples/Regression - Auto Imports.ipynb
+++ b/notebooks/samples/Regression - Auto Imports.ipynb
@@ -29,62 +29,13 @@
     "using `pandas.read_csv()`"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Declare the schema for the data that will be converted from the pandas\n",
-    "DataFrame to a Spark DataFrame. Allow all fields to be nullable, so that\n",
-    "missing values can be handled appropriately, such as replacing them with\n",
-    "the mean or median value for that column."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import LongType, StringType, DoubleType, StructType, StructField\n",
-    "\n",
-    "colSchema = (\n",
-    "    (\"symboling\", LongType), (\"normalized-losses\", DoubleType), (\"make\", StringType),\n",
-    "    (\"fuel-type\", StringType), (\"aspiration\", StringType), (\"body-style\", StringType),\n",
-    "    (\"drive-wheels\", StringType), (\"engine-location\", StringType), (\"wheel-base\", DoubleType),\n",
-    "    (\"length\", DoubleType), (\"width\", DoubleType), (\"height\", DoubleType),\n",
-    "    (\"curb-weight\", LongType), (\"engine-type\", StringType), (\"num-of-cylinders\", StringType),\n",
-    "    (\"engine-size\", LongType), (\"fuel-system\", StringType), (\"bore\", DoubleType),\n",
-    "    (\"stroke\", DoubleType), (\"compression-ratio\", DoubleType), (\"horsepower\", DoubleType),\n",
-    "    (\"peak-rpm\", DoubleType), (\"city-mpg\", LongType), (\"highway-mpg\", LongType),\n",
-    "    (\"price\", DoubleType))\n",
-    "\n",
-    "tableSchema = StructType([StructField(column[0], column[1](),True) for column in colSchema])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Read the data from the AutomobilePriceRaw.csv file into a pandas dataframe.\n",
-    "Specify possible reprsentations of missing values, and drop the `num-of-doors`\n",
-    "column as the data is read in."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"AutomobilePriceRaw.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFile,\n",
-    "                                          na_values=[\"\", \" \", \"?\"],\n",
-    "                                          usecols=tableSchema.names),\n",
-    "                             tableSchema)"
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\")\n"
    ]
   },
   {
diff --git a/notebooks/samples/Regression - Flight Delays.ipynb b/notebooks/samples/Regression - Flight Delays.ipynb
index b6f7a6ace0..6733b9b53d 100644
--- a/notebooks/samples/Regression - Flight Delays.ipynb
+++ b/notebooks/samples/Regression - Flight Delays.ipynb
@@ -37,21 +37,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# load raw data from small-sized 30 MB CSV file (trimmed to contain just what we use)\n",
-    "dataFilePath = \"On_Time_Performance_2012_9.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath,\n",
-    "                               dataFilePath)\n",
-    "flightDelay = spark.createDataFrame(\n",
-    "    pd.read_csv(dataFilePath,\n",
-    "                dtype={\"Month\": np.float64, \"Quarter\": np.float64,\n",
-    "                       \"DayofMonth\": np.float64, \"DayOfWeek\": np.float64,\n",
-    "                       \"OriginAirportID\": np.float64, \"DestAirportID\": np.float64,\n",
-    "                       \"CRSDepTime\": np.float64, \"CRSArrTime\": np.float64}))\n",
-    "# Print information on the dataset we loaded\n",
-    "print(\"Records read: \" + str(flightDelay.count()))\n",
-    "print(\"Schema:\")\n",
+    "flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
+    "# print some basic info\n",
+    "print(\"records read: \" + str(flightDelay.count()))\n",
+    "print(\"Schema: \")\n",
     "flightDelay.printSchema()\n",
     "flightDelay.limit(10).toPandas()"
    ]
   },
diff --git a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb b/notebooks/samples/SparkServing - Deploying a Classifier.ipynb
index fd8acbfa9c..400cabd995 100644
--- a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb
+++ b/notebooks/samples/SparkServing - Deploying a Classifier.ipynb
@@ -35,12 +35,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFilePath = \"AdultCensusIncome.csv\"\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFilePath):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\" + dataFilePath, dataFilePath)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFilePath, dtype={\" hours-per-week\": np.float64}))\n",
-    "data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
+    "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
     "train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
     "train.limit(10).toPandas()"
    ]
@@ -67,7 +63,7 @@
    "source": [
     "from mmlspark.train import TrainClassifier\n",
     "from pyspark.ml.classification import LogisticRegression\n",
-    "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)"
+    "model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)"
    ]
   },
   {
@@ -147,7 +143,7 @@
    "outputs": [],
    "source": [
     "import requests\n",
-    "data = u'{\" education\":\" 10th\",\" marital-status\":\" Divorced\",\" hours-per-week\":40.0}'\n",
+    "data = u'{\"education\":\" 10th\",\"marital-status\":\"Divorced\",\"hours-per-week\":40.0}'\n",
     "r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n",
     "print(\"Response {}\".format(r.text))"
    ]
@@ -159,7 +155,7 @@
    "outputs": [],
    "source": [
     "import requests\n",
-    "data = u'{\" education\":\" Masters\",\" marital-status\":\" Married-civ-spouse\",\" hours-per-week\":40.0}'\n",
+    "data = u'{\"education\":\" Masters\",\"marital-status\":\"Married-civ-spouse\",\"hours-per-week\":40.0}'\n",
     "r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n",
     "print(\"Response {}\".format(r.text))"
    ]
diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb b/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
index bb5a95bec3..9e3cd73cda 100644
--- a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
+++ b/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb
@@ -17,8 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, StructType, StructField"
+    "import pandas as pd\n"
    ]
   },
   {
@@ -27,13 +26,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n",
-    "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
-    "                         StructField(\"text\", StringType(), False)])\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
     "data.limit(10).toPandas()"
    ]
   },
diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb b/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb
index db3e445ba4..402f455900 100644
--- a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb
+++ b/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb
@@ -17,8 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "from pyspark.sql.types import IntegerType, StringType, StructType, StructField"
+    "import pandas as pd\n"
    ]
   },
   {
@@ -27,13 +26,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n",
-    "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n",
-    "                         StructField(\"text\", StringType(), False)])\n",
-    "import os, urllib\n",
-    "if not os.path.isfile(dataFile):\n",
-    "    urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n",
-    "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n",
+    "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
     "data.limit(10).toPandas()"
    ]
   },
diff --git a/pipeline.yaml b/pipeline.yaml
index 71b50ca440..96ddc70518 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -154,8 +154,8 @@ jobs:
       azureSubscription: 'Findable Incubation(ca9d21ff-2a46-4e8b-bf06-8d65242342e5)'
       scriptLocation: inlineScript
       inlineScript: |
-        pip install requests || pip install requests
-        sbt setup || sbt setup
+        (timeout 30 pip install requests) || (echo "retrying" && timeout 30 pip install requests)
+        (timeout 200 sbt setup) || (echo "retrying" && timeout 200 sbt setup)
   - task: AzureCLI@1
     displayName: 'Unit Test'
     timeoutInMinutes: 20
diff --git a/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala b/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala
index 5798d63ef8..f740da642b 100644
--- a/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala
+++ b/src/it/scala/com/microsoft/ml/nbtest/DatabricksUtilities.scala
@@ -54,7 +54,7 @@ object DatabricksUtilities {
   // MMLSpark info
   val truncatedScalaVersion: String = BuildInfo.scalaVersion
     .split(".".toCharArray.head).dropRight(1).mkString(".")
-  val version = s"com.microsoft.ml.spark:${BuildInfo.name}_$truncatedScalaVersion:${BuildInfo.version}"
+  val version = s"com.microsoft.ml.spark:${BuildInfo.name}_$truncatedScalaVersion:0.17+85-1d8f34cf"
   val repository = "https://mmlspark.azureedge.net/maven"
   val libraries: String = List(