From d669c1552f3678666a02f5cf561db7d6ddece421 Mon Sep 17 00:00:00 2001 From: Bryan Smith Date: Wed, 22 Jan 2020 09:39:25 -0800 Subject: [PATCH 1/2] Added notebooks for training and scoring a Ridge regression model with the Diabetes dataset --- .../Diabetes Ridge Regression Scoring.ipynb | 123 ++++++++++++ .../Diabetes Ridge Regression Training.ipynb | 180 ++++++++++++++++++ 2 files changed, 303 insertions(+) create mode 100644 experimentation/Diabetes Ridge Regression Scoring.ipynb create mode 100644 experimentation/Diabetes Ridge Regression Training.ipynb diff --git a/experimentation/Diabetes Ridge Regression Scoring.ipynb b/experimentation/Diabetes Ridge Regression Scoring.ipynb new file mode 100644 index 00000000..b686c5cd --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Scoring.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Score Data with a Ridge Regression Model Trained on the Diabetes Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook loads the model trained in the Diabetes Ridge Regression Training notebook, prepares the data, and scores the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import numpy\n", + "from azureml.core.model import Model\n", + "import joblib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Model" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "model_path = Model.get_model_path(model_name=\"sklearn_regression_model.pkl\")\n", + "model = joblib.load(model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "raw_data = '{\"data\":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}'\n", + "\n", + "data = json.loads(raw_data)[\"data\"]\n", + "data = numpy.array(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Score Data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"RequestId\":\"\", \"TraceParent\":\"\", \"NumberOfPredictions\":2}\n", + "Test result: {'result': [5113.099642122813, 3713.6329271385353]}\n" + ] + } + ], + "source": [ + "request_headers = {}\n", + "\n", + "result = model.predict(data)\n", + "print(('{{\"RequestId\":\"{0}\", '\n", + " '\"TraceParent\":\"{1}\", '\n", + " '\"NumberOfPredictions\":{2}}}'\n", + " ).format(\n", + " request_headers.get(\"X-Ms-Request-Id\", \"\"),\n", + " request_headers.get(\"Traceparent\", \"\"),\n", + " len(result)\n", + " ))\n", + "print(\"Test result: \", {\"result\": result.tolist()})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (storedna)", + "language": "python", + "name": "storedna" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", 
+ "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb new file mode 100644 index 00000000..9d1ab8e0 --- /dev/null +++ b/experimentation/Diabetes Ridge Regression Training.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train a Ridge Regression Model on the Diabetes Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook loads the Diabetes dataset from sklearn, splits the data into training and validation sets, trains a Ridge regression model, validates the model on the validation set, and saves the model." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\brysmith\\AppData\\Local\\Continuum\\anaconda3\\envs\\storedna\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. 
If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", + " warnings.warn(msg, category=DeprecationWarning)\n" + ] + } + ], + "source": [ + "from sklearn.datasets import load_diabetes\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.externals import joblib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = load_diabetes(return_X_y=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split Data into Training and Validation Sets" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)\n", + "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n", + " \"test\": {\"X\": X_test, \"y\": y_test}}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train Model on Training Set" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,\n", + " normalize=False, random_state=None, solver='auto', tol=0.001)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alpha = 0.5\n", + "\n", + "reg = Ridge(alpha=alpha)\n", + "reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validate Model on Validation Set" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "mse: 3298.9096058070622\n" + ] + } + ], + "source": [ + "preds = reg.predict(data[\"test\"][\"X\"])\n", + "print(\"mse: \", mean_squared_error(preds, y_valid))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sklearn_regression_model.pkl']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_name = \"sklearn_regression_model.pkl\"\n", + "\n", + "joblib.dump(value=reg, filename=model_name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (storedna)", + "language": "python", + "name": "storedna" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 223b95002da00d5733a808c52f28f00e48e77b8c Mon Sep 17 00:00:00 2001 From: Bryan Smith Date: Wed, 22 Jan 2020 14:04:28 -0800 Subject: [PATCH 2/2] Fixed joblib import and removed request headers --- .../Diabetes Ridge Regression Scoring.ipynb | 17 +++------- .../Diabetes Ridge Regression Training.ipynb | 33 +++++++------------ 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/experimentation/Diabetes Ridge Regression Scoring.ipynb b/experimentation/Diabetes Ridge Regression Scoring.ipynb index b686c5cd..9ac340ed 100644 --- a/experimentation/Diabetes Ridge Regression Scoring.ipynb +++ b/experimentation/Diabetes Ridge Regression Scoring.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, 
"outputs": [], "source": [ @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -71,14 +71,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{\"RequestId\":\"\", \"TraceParent\":\"\", \"NumberOfPredictions\":2}\n", "Test result: {'result': [5113.099642122813, 3713.6329271385353]}\n" ] } @@ -87,14 +86,6 @@ "request_headers = {}\n", "\n", "result = model.predict(data)\n", - "print(('{{\"RequestId\":\"{0}\", '\n", - " '\"TraceParent\":\"{1}\", '\n", - " '\"NumberOfPredictions\":{2}}}'\n", - " ).format(\n", - " request_headers.get(\"X-Ms-Request-Id\", \"\"),\n", - " request_headers.get(\"Traceparent\", \"\"),\n", - " len(result)\n", - " ))\n", "print(\"Test result: \", {\"result\": result.tolist()})" ] } diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb index 9d1ab8e0..7ae84e38 100644 --- a/experimentation/Diabetes Ridge Regression Training.ipynb +++ b/experimentation/Diabetes Ridge Regression Training.ipynb @@ -16,24 +16,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\brysmith\\AppData\\Local\\Continuum\\anaconda3\\envs\\storedna\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. 
If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", - " warnings.warn(msg, category=DeprecationWarning)\n" - ] - } - ], + "outputs": [], "source": [ "from sklearn.datasets import load_diabetes\n", "from sklearn.linear_model import Ridge\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.externals import joblib" + "import joblib" ] }, { @@ -45,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -61,11 +52,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n", " \"test\": {\"X\": X_test, \"y\": y_test}}" ] @@ -79,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -89,7 +80,7 @@ " normalize=False, random_state=None, solver='auto', tol=0.001)" ] }, - "execution_count": 16, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -110,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -123,7 +114,7 @@ ], "source": [ "preds = reg.predict(data[\"test\"][\"X\"])\n", - "print(\"mse: \", mean_squared_error(preds, y_valid))" + "print(\"mse: \", mean_squared_error(preds, y_test))" ] }, { @@ -135,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -144,7 +135,7 @@ "['sklearn_regression_model.pkl']" ] }, - "execution_count": 14, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" }