From 249f1d1b3f4a307de9560f5709d2fcdc3926a9e4 Mon Sep 17 00:00:00 2001 From: Bayomi Date: Sun, 7 Mar 2021 00:29:54 -0500 Subject: [PATCH 1/2] adding dataset support --- unboxapi/__init__.py | 36 +++++++++++++++++++++++++++++++----- unboxapi/lib/network.py | 18 ++++++++++++++++++ unboxapi/template.py | 2 +- 3 files changed, 50 insertions(+), 6 deletions(-) create mode 100644 unboxapi/lib/network.py diff --git a/unboxapi/__init__.py b/unboxapi/__init__.py index 1d3895d5..4c50e024 100644 --- a/unboxapi/__init__.py +++ b/unboxapi/__init__.py @@ -5,13 +5,19 @@ import bentoml from bentoml.saved_bundle.bundler import _write_bento_content_to_dir from bentoml.utils.tempdir import TempDirectory +import pandas as pd +import uuid +from .lib.network import FlaskAPIRequest from .template import create_template_model class UnboxClient(object): - def __init__(self): - self.authenticate() + def __init__(self, email=None, password=None): + self.firebase = None + self.user = None + self.flask_api_request = FlaskAPIRequest() + self.authenticate(email, password) def add(self, function, model): bento_service = create_template_model("sklearn", "text") @@ -32,7 +38,12 @@ def upload(self, remote_path, file_path): storage = self.firebase.storage() storage.child(remote_path).put(file_path, self.user['idToken']) - def authenticate(self): + def authenticate(self, email, password): + + if not email or not password: + email = input("What is your Unbox email?") + password = getpass.getpass("What is your Unbox password?") + config = { "apiKey": "AIzaSyAKlGQOmXTjPQhL1Uvj-Jr-_jUtNWmpOgs", "authDomain": "unbox-ai.firebaseapp.com", @@ -46,6 +57,21 @@ def authenticate(self): auth = self.firebase.auth() # Log the user in - email = input("What is your Unbox email?") - password = getpass.getpass("What is your Unbox password?") self.user = auth.sign_in_with_email_and_password(email, password) + + def add_dataset(self, file_path: str): + self.flask_api_request.upload_dataset(file_path) + + def add_dataframe(self, df: pd.DataFrame, file_path: str): + df.to_csv(file_path, index=False) + self.add_dataset(file_path) + + + # def add_dataset(self, file_name: str): + # self.upload( + # f"users/{self.user['localId']}/datasets/{file_name}", file_name) + # + # def add_dataframe(self, df: pd.DataFrame, file_name: str): + # df.to_csv(file_name, index=False) + # self.upload( + # f"users/{self.user['localId']}/datasets/{file_name}", file_name) diff --git a/unboxapi/lib/network.py b/unboxapi/lib/network.py new file mode 100644 index 00000000..e576d1d5 --- /dev/null +++ b/unboxapi/lib/network.py @@ -0,0 +1,18 @@ +import requests + + +class FlaskAPIRequest: + + def __init__(self): + self.url = "http://localhost:5000" + + def post(self, file, data, endpoint="/"): + return requests.post( + self.url + endpoint, + files={"file": file}, + data=data, + ) + + def upload_dataset(self, filename): + file = (filename, open(filename, 'rb')) + return self.post(file, {}, endpoint="/upload_dataset") diff --git a/unboxapi/template.py b/unboxapi/template.py index 8623c01a..44b1af52 100644 --- a/unboxapi/template.py +++ b/unboxapi/template.py @@ -5,7 +5,7 @@ "sklearn": "SklearnModelArtifact", "pytorch": "PytorchModelArtifact", "tensorflow": "TensorflowSavedModelArtifact", - "transformers": "TransformersModelArtifac" + "transformers": "TransformersModelArtifact" } From 228cac463580a6659810fb3211d65b92eed27b22 Mon Sep 17 00:00:00 2001 From: vikasnair Date: Sun, 7 Mar 2021 04:17:33 -0800 Subject: [PATCH 2/2] Integrate datasets v1 with Firebase Firestore & Storage --- .../sentiment-analysis/sentiment-unbox.ipynb | 129 +++++++++++------- unboxapi/__init__.py | 96 ++++++------- unboxapi/lib/network.py | 73 ++++++++-- unboxapi/template.py | 27 ++-- 4 files changed, 194 insertions(+), 131 deletions(-) diff --git a/examples/sentiment-analysis/sentiment-unbox.ipynb b/examples/sentiment-analysis/sentiment-unbox.ipynb index 948e85a5..f2136a30 100644 --- a/examples/sentiment-analysis/sentiment-unbox.ipynb +++ b/examples/sentiment-analysis/sentiment-unbox.ipynb @@ -67,19 +67,24 @@ "outputs": [], "source": [ "columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']\n", - "dftrain = pd.read_csv('./data/training.1600000.processed.noemoticon.csv',\n", - " header = None,\n", - " encoding ='ISO-8859-1')\n", - "dftest = pd.read_csv('./data/testdata.manual.2009.06.14.csv',\n", - " header = None,\n", - " encoding ='ISO-8859-1')\n", - "dftrain.columns = columns\n", - "dftest.columns = columns" + "df_train_file_path = './data/training.1600000.processed.noemoticon.csv'\n", + "df_train_name = 'training.1600000.processed.noemoticon'\n", + "df_train = pd.read_csv(df_train_file_path,\n", + " header=None,\n", + " encoding='ISO-8859-1')\n", + "\n", + "df_test_file_path = './data/testdata.manual.2009.06.14.csv'\n", + "df_test_name = 'testdata.manual.2009.06.14'\n", + "df_test = pd.read_csv(df_test_file_path,\n", + " header=None,\n", + " encoding='ISO-8859-1')\n", + "df_train.columns = columns\n", + "df_test.columns = columns" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "multiple-disability", "metadata": {}, "outputs": [ @@ -106,23 +111,23 @@ " ('lr', LogisticRegression())])" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sentiment_lr = Pipeline([\n", - " ('count_vect', CountVectorizer(min_df = 100,\n", - " ngram_range = (1,2),\n", - " stop_words = 'english')), \n", + " ('count_vect', CountVectorizer(min_df=100,\n", + " ngram_range=(1,2),\n", + " stop_words='english')), \n", " ('lr', LogisticRegression())])\n", - "sentiment_lr.fit(dftrain.text, dftrain.polarity)" + "sentiment_lr.fit(df_train.text, df_train.polarity)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "civilian-auditor", "metadata": {}, "outputs": [ @@ -143,13 +148,13 @@ } ], "source": [ - "Xtest, ytest = dftest.text[dftest.polarity!=2], dftest.polarity[dftest.polarity!=2]\n", - "print(classification_report(ytest,sentiment_lr.predict(Xtest)))" + "x_test, y_test = df_test.text[df_test.polarity != 2], df_test.polarity[df_test.polarity != 2]\n", + "print(classification_report(y_test, sentiment_lr.predict(x_test)))" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "numerous-ability", "metadata": {}, "outputs": [ @@ -159,18 +164,18 @@ "array([4])" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sentiment_lr.predict([Xtest[0]])" + "sentiment_lr.predict([x_test[0]])" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "electronic-princess", "metadata": {}, "outputs": [ @@ -180,13 +185,13 @@ "array([4, 0])" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sentiment_lr.predict([\"good\", \"bad\"])" + "sentiment_lr.predict(['good', 'bad'])" ] }, { @@ -199,22 +204,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "medium-field", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "What is your Unbox email?me@vikasnair.com\n", - "What is your Unbox password?········\n" - ] - } - ], + "outputs": [], "source": [ "import unboxapi\n", - "client = unboxapi.UnboxClient()" + "client = unboxapi.UnboxClient(email='me@vikasnair.com', password='00000000')" ] }, { @@ -227,19 +223,19 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "maritime-writing", "metadata": {}, "outputs": [], "source": [ - "class_dict = {4: \"positive\", 0: \"negative\", 2: \"neutral\"}\n", + "class_dict = { 4: 'positive', 0: 'negative', 2: 'neutral' }\n", "def predict_function(model, text_list):\n", " return [class_dict[d] for d in model.predict(text_list)]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "bored-treasury", "metadata": {}, "outputs": [ @@ -249,13 +245,13 @@ "['positive', 'positive', 'negative']" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "texts = [\"some new text, sweet noodles\", \"happy time\", \"sad day\"]\n", + "texts = ['some new text, sweet noodles', 'happy time', 'sad day']\n", "\n", "predict_function(sentiment_lr, texts)" ] @@ -270,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "present-seating", "metadata": { "scrolled": true @@ -280,16 +276,47 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2021-03-01 04:47:41,045] WARNING - pip package requirement pandas already exist\n", - "[2021-03-01 04:47:41,052] WARNING - pip package requirement scikit-learn already exist\n" + "Uploading model...\n", + "[2021-03-07 04:11:30,623] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle.\n", + "[2021-03-07 04:12:02,814] INFO - Detected non-PyPI-released BentoML installed, copying local BentoML modulefiles to target saved bundle path..\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "warning: no previously-included files matching '*~' found anywhere in distribution\n", + "warning: no previously-included files matching '*.pyo' found anywhere in distribution\n", + "warning: no previously-included files matching '.git' found anywhere in distribution\n", + "warning: no previously-included files matching '.ipynb_checkpoints' found anywhere in distribution\n", + "warning: no previously-included files matching '__pycache__' found anywhere in distribution\n", + "warning: no directories found matching 'bentoml/yatai/web/dist'\n", + "no previously-included directories found matching 'e2e_tests'\n", + "no previously-included directories found matching 'tests'\n", + "no previously-included directories found matching 'benchmark'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "UPDATING BentoML-0.11.0+33.g7e83376/bentoml/_version.py\n", + "set BentoML-0.11.0+33.g7e83376/bentoml/_version.py to '0.11.0+33.g7e83376'\n" ] } ], "source": [ - "client.add(\n", - " function=predict_function,\n", - " model=sentiment_lr\n", - ")" + "print('Uploading model...')\n", + "client.add_model(function=predict_function, model=sentiment_lr)\n", + "print('Complete.')\n", + "\n", + "print('\\nUploading dataset (from file)...')\n", + "response_i = client.add_dataset(df_train_file_path, df_train_name)\n", + "print(f'Complete. Response: {response_i}')\n", + "\n", + "print('\\nUploading dataset (from data frame)...')\n", + "response_j = client.add_dataframe(df_test, df_test_name)\n", + "print(f'Complete. Response: {response_j}')" ] }, { @@ -299,6 +326,14 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "corporate-azerbaijan", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/unboxapi/__init__.py b/unboxapi/__init__.py index 4c50e024..fdc79387 100644 --- a/unboxapi/__init__.py +++ b/unboxapi/__init__.py @@ -1,77 +1,59 @@ -import os -import getpass -import tarfile -import pyrebase -import bentoml +import bentoml, getpass, os, pandas as pd, tarfile, tempfile, uuid + from bentoml.saved_bundle.bundler import _write_bento_content_to_dir from bentoml.utils.tempdir import TempDirectory -import pandas as pd -import uuid -from .lib.network import FlaskAPIRequest +from .lib.network import FlaskAPI, FirebaseAPI from .template import create_template_model class UnboxClient(object): - def __init__(self, email=None, password=None): - self.firebase = None - self.user = None - self.flask_api_request = FlaskAPIRequest() - self.authenticate(email, password) - - def add(self, function, model): - bento_service = create_template_model("sklearn", "text") + + # Public functions + + def __init__(self, email: str = None, password: str = None): + self.flask_api = FlaskAPI() + self.firebase_api = FirebaseAPI(email=email, password=password) + + + def add_model(self, function, model): + bento_service = create_template_model('sklearn', 'text') bento_service.pack('model', model) bento_service.pack('function', function) with TempDirectory() as temp_dir: _write_bento_content_to_dir(bento_service, temp_dir) - with TempDirectory() as tarfile_dir: - file_name = f'{bento_service.name}.tar' - tarfile_path = f'{tarfile_dir}/{file_name}' - with tarfile.open(tarfile_path, mode="w:gz") as tar: - tar.add(temp_dir, arcname=bento_service.name) - self.upload( - f"users/{self.user['localId']}/models/{file_name}", tarfile_path) - - def upload(self, remote_path, file_path): - storage = self.firebase.storage() - storage.child(remote_path).put(file_path, self.user['idToken']) - - def authenticate(self, email, password): - if not email or not password: - email = input("What is your Unbox email?") - password = getpass.getpass("What is your Unbox password?") - - config = { - "apiKey": "AIzaSyAKlGQOmXTjPQhL1Uvj-Jr-_jUtNWmpOgs", - "authDomain": "unbox-ai.firebaseapp.com", - "databaseURL": "https://unbox-ai.firebaseio.com", - "storageBucket": "unbox-ai.appspot.com" - } + with TempDirectory() as tarfile_dir: + model_id = str(uuid.uuid1()) + tarfile_path = f'{tarfile_dir}/{model_id}' - self.firebase = pyrebase.initialize_app(config) + with tarfile.open(tarfile_path, mode='w:gz') as tar: + tar.add(temp_dir, arcname=bento_service.name) - # Get a reference to the auth service - auth = self.firebase.auth() + user_id = self.firebase_api.user['localId'] + remote_path = f'users/{user_id}/models/{model_id}' + self.firebase_api.upload(remote_path, tarfile_path) - # Log the user in - self.user = auth.sign_in_with_email_and_password(email, password) - def add_dataset(self, file_path: str): - self.flask_api_request.upload_dataset(file_path) + def add_dataset(self, file_path: str, name: str): + # For now, let's upload straight to Firebase Storage from here + user_id = self.firebase_api.user['localId'] + dataset_id = str(uuid.uuid1()) + remote_path = f'users/{user_id}/datasets/{dataset_id}' + self.firebase_api.upload(remote_path, file_path) - def add_dataframe(self, df: pd.DataFrame, file_path: str): - df.to_csv(file_path, index=False) - self.add_dataset(file_path) + # And then set the metadata via request to our Flask API + id_token = self.firebase_api.user['idToken'] + response = self.flask_api.upload_dataset_metadata(user_id, + dataset_id, + name, + id_token) + return response.json() - # def add_dataset(self, file_name: str): - # self.upload( - # f"users/{self.user['localId']}/datasets/{file_name}", file_name) - # - # def add_dataframe(self, df: pd.DataFrame, file_name: str): - # df.to_csv(file_name, index=False) - # self.upload( - # f"users/{self.user['localId']}/datasets/{file_name}", file_name) + def add_dataframe(self, df: pd.DataFrame, name: str): + with tempfile.TemporaryDirectory() as tmp_dir: + dataset_file_path = os.path.join(tmp_dir, str(uuid.uuid1())) + df.to_csv(dataset_file_path, index=False) + return self.add_dataset(dataset_file_path, name) diff --git a/unboxapi/lib/network.py b/unboxapi/lib/network.py index e576d1d5..7fbbc0f0 100644 --- a/unboxapi/lib/network.py +++ b/unboxapi/lib/network.py @@ -1,18 +1,63 @@ -import requests +import pyrebase, requests -class FlaskAPIRequest: +class FlaskAPI: def __init__(self): - self.url = "http://localhost:5000" - - def post(self, file, data, endpoint="/"): - return requests.post( - self.url + endpoint, - files={"file": file}, - data=data, - ) - - def upload_dataset(self, filename): - file = (filename, open(filename, 'rb')) - return self.post(file, {}, endpoint="/upload_dataset") + self.url = 'http://localhost:5000' + + + def post(self, endpoint: str = '/', data: any = None, file: any = None): + return requests.post(self.url + endpoint, + json=data, + files=file and {'file': file}) + + + def upload_dataset_metadata(self, + user_id: str, + dataset_id: str, + name: str, + id_token: str): + data = { + 'dataset_id': dataset_id, + 'id_token': id_token, + 'name': name, + 'user_id': user_id + } + return self.post(endpoint='/dataset', data=data) + + + def upload_dataset(self, file_path: str, name: str, id_token: str): + data = { 'name' : name, 'id_token' : id_token } + file = open(file_path, 'rb') + return self.post(endpoint='/dataset', data=data, file=file) + + +class FirebaseAPI: + + def __init__(self, email: str = None, password: str = None): + if not email or not password: + email = input('What is your Unbox email?') + password = getpass.getpass('What is your Unbox password?') + + config = { + 'apiKey': 'AIzaSyAKlGQOmXTjPQhL1Uvj-Jr-_jUtNWmpOgs', + 'authDomain': 'unbox-ai.firebaseapp.com', + 'databaseURL': 'https://unbox-ai.firebaseio.com', + 'storageBucket': 'unbox-ai.appspot.com' + } + + # Initialize Pyrebase instance + self.firebase = pyrebase.initialize_app(config) + + # Get a reference to the auth service + auth = self.firebase.auth() + + # Login + self.user = auth.sign_in_with_email_and_password(email, password) + + + def upload(self, remote_path: str, file_path: str): + storage = self.firebase.storage() + id_token = self.user['idToken'] + storage.child(remote_path).put(file_path, id_token) \ No newline at end of file diff --git a/unboxapi/template.py b/unboxapi/template.py index 44b1af52..290e3d4b 100644 --- a/unboxapi/template.py +++ b/unboxapi/template.py @@ -2,17 +2,17 @@ modelTypes = { - "sklearn": "SklearnModelArtifact", - "pytorch": "PytorchModelArtifact", - "tensorflow": "TensorflowSavedModelArtifact", - "transformers": "TransformersModelArtifact" + 'sklearn': 'SklearnModelArtifact', + 'pytorch': 'PytorchModelArtifact', + 'tensorflow': 'TensorflowSavedModelArtifact', + 'transformers': 'TransformersModelArtifact' } -def create_template_model(model_type, input_type): - # return TemplateModel([modelTypes[model_type]('model'), PickleArtifact('function')]) +# TODO: in dire need of cleanup +def create_template_model(model_type: str, input_type: str): with open('template_model.py', 'w') as python_file: - file_contents = f"""\ + file_contents = f'''\ from typing import List from bentoml import env, artifacts, api, BentoService from bentoml.frameworks.{model_type} import {modelTypes[model_type]} @@ -25,9 +25,9 @@ def create_template_model(model_type, input_type): class TemplateModel(BentoService): @api(input=DataframeInput( - orient="records", - columns=["text"], - dtype={{"text": "str"}}, + orient='records', + columns=['text'], + dtype={{'text': 'str'}}, ), batch=True) def batch(self, df): text = df['text'].tolist() @@ -40,17 +40,18 @@ def batch(self, df): def predict(self, parsed_json_list: List[JsonSerializable], tasks: List[InferenceTask]): text = [] for json, task in zip(parsed_json_list, tasks): - if "text" in json: + if 'text' in json: text.append(json['text']) else: task.discard(http_status=400, - err_msg="input json must contain `text` field") + err_msg='input json must contain `text` field') return self.artifacts.function( self.artifacts.model, text ) - """ + ''' python_file.write(textwrap.dedent(file_contents)) + from template_model import TemplateModel return TemplateModel()