From d669c1552f3678666a02f5cf561db7d6ddece421 Mon Sep 17 00:00:00 2001
From: Bryan Smith <brysmith@microsoft.com>
Date: Wed, 22 Jan 2020 09:39:25 -0800
Subject: [PATCH 1/2] Added notebooks for training and scoring a Ridge
 regression model with the Diabetes dataset

---
 .../Diabetes Ridge Regression Scoring.ipynb   | 123 ++++++++++++
 .../Diabetes Ridge Regression Training.ipynb  | 180 ++++++++++++++++++
 2 files changed, 303 insertions(+)
 create mode 100644 experimentation/Diabetes Ridge Regression Scoring.ipynb
 create mode 100644 experimentation/Diabetes Ridge Regression Training.ipynb

diff --git a/experimentation/Diabetes Ridge Regression Scoring.ipynb b/experimentation/Diabetes Ridge Regression Scoring.ipynb
new file mode 100644
index 00000000..b686c5cd
--- /dev/null
+++ b/experimentation/Diabetes Ridge Regression Scoring.ipynb	
@@ -0,0 +1,123 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Score Data with a Ridge Regression Model Trained on the Diabetes Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook loads the model trained in the Diabetes Ridge Regression Training notebook, prepares the data, and scores the data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import numpy\n",
+    "from azureml.core.model import Model\n",
+    "import joblib"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_path = Model.get_model_path(model_name=\"sklearn_regression_model.pkl\")\n",
+    "model = joblib.load(model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_data = '{\"data\":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}'\n",
+    "\n",
+    "data = json.loads(raw_data)[\"data\"]\n",
+    "data = numpy.array(data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Score Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\"RequestId\":\"\", \"TraceParent\":\"\", \"NumberOfPredictions\":2}\n",
+      "Test result:  {'result': [5113.099642122813, 3713.6329271385353]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "request_headers = {}\n",
+    "\n",
+    "result = model.predict(data)\n",
+    "print(('{{\"RequestId\":\"{0}\", '\n",
+    "           '\"TraceParent\":\"{1}\", '\n",
+    "           '\"NumberOfPredictions\":{2}}}'\n",
+    "           ).format(\n",
+    "               request_headers.get(\"X-Ms-Request-Id\", \"\"),\n",
+    "               request_headers.get(\"Traceparent\", \"\"),\n",
+    "               len(result)\n",
+    "           ))\n",
+    "print(\"Test result: \", {\"result\": result.tolist()})"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (storedna)",
+   "language": "python",
+   "name": "storedna"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb
new file mode 100644
index 00000000..9d1ab8e0
--- /dev/null
+++ b/experimentation/Diabetes Ridge Regression Training.ipynb	
@@ -0,0 +1,180 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train a Ridge Regression Model on the Diabetes Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook loads the Diabetes dataset from sklearn, splits the data into training and validation sets, trains a Ridge regression model, validates the model on the validation set, and saves the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\brysmith\\AppData\\Local\\Continuum\\anaconda3\\envs\\storedna\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
+      "  warnings.warn(msg, category=DeprecationWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.datasets import load_diabetes\n",
+    "from sklearn.linear_model import Ridge\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.externals import joblib"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = load_diabetes(return_X_y=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Split Data into Training and Validation Sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)\n",
+    "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
+    "        \"test\": {\"X\": X_test, \"y\": y_test}}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train Model on Training Set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,\n",
+       "      normalize=False, random_state=None, solver='auto', tol=0.001)"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "alpha = 0.5\n",
+    "\n",
+    "reg = Ridge(alpha=alpha)\n",
+    "reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Validate Model on Validation Set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mse:  3298.9096058070622\n"
+     ]
+    }
+   ],
+   "source": [
+    "preds = reg.predict(data[\"test\"][\"X\"])\n",
+    "print(\"mse: \", mean_squared_error(preds, y_valid))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['sklearn_regression_model.pkl']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_name = \"sklearn_regression_model.pkl\"\n",
+    "\n",
+    "joblib.dump(value=reg, filename=model_name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (storedna)",
+   "language": "python",
+   "name": "storedna"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 223b95002da00d5733a808c52f28f00e48e77b8c Mon Sep 17 00:00:00 2001
From: Bryan Smith <brysmith@microsoft.com>
Date: Wed, 22 Jan 2020 14:04:28 -0800
Subject: [PATCH 2/2] Fixed joblib import and removed request headers

---
 .../Diabetes Ridge Regression Scoring.ipynb   | 17 +++-------
 .../Diabetes Ridge Regression Training.ipynb  | 33 +++++++------------
 2 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/experimentation/Diabetes Ridge Regression Scoring.ipynb b/experimentation/Diabetes Ridge Regression Scoring.ipynb
index b686c5cd..9ac340ed 100644
--- a/experimentation/Diabetes Ridge Regression Scoring.ipynb	
+++ b/experimentation/Diabetes Ridge Regression Scoring.ipynb	
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -52,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -71,14 +71,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{\"RequestId\":\"\", \"TraceParent\":\"\", \"NumberOfPredictions\":2}\n",
       "Test result:  {'result': [5113.099642122813, 3713.6329271385353]}\n"
      ]
     }
@@ -87,14 +86,6 @@
     "request_headers = {}\n",
     "\n",
     "result = model.predict(data)\n",
-    "print(('{{\"RequestId\":\"{0}\", '\n",
-    "           '\"TraceParent\":\"{1}\", '\n",
-    "           '\"NumberOfPredictions\":{2}}}'\n",
-    "           ).format(\n",
-    "               request_headers.get(\"X-Ms-Request-Id\", \"\"),\n",
-    "               request_headers.get(\"Traceparent\", \"\"),\n",
-    "               len(result)\n",
-    "           ))\n",
     "print(\"Test result: \", {\"result\": result.tolist()})"
    ]
   }
diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb
index 9d1ab8e0..7ae84e38 100644
--- a/experimentation/Diabetes Ridge Regression Training.ipynb	
+++ b/experimentation/Diabetes Ridge Regression Training.ipynb	
@@ -16,24 +16,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\brysmith\\AppData\\Local\\Continuum\\anaconda3\\envs\\storedna\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
-      "  warnings.warn(msg, category=DeprecationWarning)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sklearn.datasets import load_diabetes\n",
     "from sklearn.linear_model import Ridge\n",
     "from sklearn.metrics import mean_squared_error\n",
     "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.externals import joblib"
+    "import joblib"
    ]
   },
   {
@@ -45,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,11 +52,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
     "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
     "        \"test\": {\"X\": X_test, \"y\": y_test}}"
    ]
@@ -79,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -89,7 +80,7 @@
        "      normalize=False, random_state=None, solver='auto', tol=0.001)"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -110,7 +101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -123,7 +114,7 @@
    ],
    "source": [
     "preds = reg.predict(data[\"test\"][\"X\"])\n",
-    "print(\"mse: \", mean_squared_error(preds, y_valid))"
+    "print(\"mse: \", mean_squared_error(preds, y_test))"
    ]
   },
   {
@@ -135,7 +126,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -144,7 +135,7 @@
        "['sklearn_regression_model.pkl']"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }