Commit 7362e3e

Merge pull request #2 from unboxai/dataset_feature

Onboard v1 datasets feature

2 parents: b1d9c18 + 228cac4

4 files changed: +203 additions, -96 deletions


examples/sentiment-analysis/sentiment-unbox.ipynb

Lines changed: 82 additions & 47 deletions
@@ -67,19 +67,24 @@
    "outputs": [],
    "source": [
     "columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']\n",
-    "dftrain = pd.read_csv('./data/training.1600000.processed.noemoticon.csv',\n",
-    "                      header = None,\n",
-    "                      encoding ='ISO-8859-1')\n",
-    "dftest = pd.read_csv('./data/testdata.manual.2009.06.14.csv',\n",
-    "                     header = None,\n",
-    "                     encoding ='ISO-8859-1')\n",
-    "dftrain.columns = columns\n",
-    "dftest.columns = columns"
+    "df_train_file_path = './data/training.1600000.processed.noemoticon.csv'\n",
+    "df_train_name = 'training.1600000.processed.noemoticon'\n",
+    "df_train = pd.read_csv(df_train_file_path,\n",
+    "                       header=None,\n",
+    "                       encoding='ISO-8859-1')\n",
+    "\n",
+    "df_test_file_path = './data/testdata.manual.2009.06.14.csv'\n",
+    "df_test_name = 'testdata.manual.2009.06.14'\n",
+    "df_test = pd.read_csv(df_test_file_path,\n",
+    "                      header=None,\n",
+    "                      encoding='ISO-8859-1')\n",
+    "df_train.columns = columns\n",
+    "df_test.columns = columns"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "multiple-disability",
    "metadata": {},
    "outputs": [
@@ -106,23 +111,23 @@
        "                ('lr', LogisticRegression())])"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "sentiment_lr = Pipeline([\n",
-    "    ('count_vect', CountVectorizer(min_df = 100,\n",
-    "                                   ngram_range = (1,2),\n",
-    "                                   stop_words = 'english')), \n",
+    "    ('count_vect', CountVectorizer(min_df=100,\n",
+    "                                   ngram_range=(1,2),\n",
+    "                                   stop_words='english')), \n",
     "    ('lr', LogisticRegression())])\n",
-    "sentiment_lr.fit(dftrain.text, dftrain.polarity)"
+    "sentiment_lr.fit(df_train.text, df_train.polarity)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "id": "civilian-auditor",
    "metadata": {},
    "outputs": [
@@ -143,13 +148,13 @@
     }
    ],
    "source": [
-    "Xtest, ytest = dftest.text[dftest.polarity!=2], dftest.polarity[dftest.polarity!=2]\n",
-    "print(classification_report(ytest,sentiment_lr.predict(Xtest)))"
+    "x_test, y_test = df_test.text[df_test.polarity != 2], df_test.polarity[df_test.polarity != 2]\n",
+    "print(classification_report(y_test, sentiment_lr.predict(x_test)))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "id": "numerous-ability",
    "metadata": {},
    "outputs": [
@@ -159,18 +164,18 @@
        "array([4])"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "sentiment_lr.predict([Xtest[0]])"
+    "sentiment_lr.predict([x_test[0]])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "id": "electronic-princess",
    "metadata": {},
    "outputs": [
@@ -180,13 +185,13 @@
        "array([4, 0])"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "sentiment_lr.predict([\"good\", \"bad\"])"
+    "sentiment_lr.predict(['good', 'bad'])"
    ]
   },
   {
@@ -199,22 +204,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "id": "medium-field",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "What is your Unbox email?me@vikasnair.com\n",
-      "What is your Unbox password?········\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import unboxapi\n",
-    "client = unboxapi.UnboxClient()"
+    "client = unboxapi.UnboxClient(email='me@vikasnair.com', password='00000000')"
    ]
   },
   {
@@ -227,19 +223,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "id": "maritime-writing",
    "metadata": {},
    "outputs": [],
    "source": [
-    "class_dict = {4: \"positive\", 0: \"negative\", 2: \"neutral\"}\n",
+    "class_dict = { 4: 'positive', 0: 'negative', 2: 'neutral' }\n",
     "def predict_function(model, text_list):\n",
     "    return [class_dict[d] for d in model.predict(text_list)]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
    "id": "bored-treasury",
    "metadata": {},
    "outputs": [
@@ -249,13 +245,13 @@
        "['positive', 'positive', 'negative']"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "texts = [\"some new text, sweet noodles\", \"happy time\", \"sad day\"]\n",
+    "texts = ['some new text, sweet noodles', 'happy time', 'sad day']\n",
     "\n",
     "predict_function(sentiment_lr, texts)"
    ]
@@ -270,7 +266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "present-seating",
    "metadata": {
     "scrolled": true
@@ -280,16 +276,47 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[2021-03-01 04:47:41,045] WARNING - pip package requirement pandas already exist\n",
-      "[2021-03-01 04:47:41,052] WARNING - pip package requirement scikit-learn already exist\n"
+      "Uploading model...\n",
+      "[2021-03-07 04:11:30,623] WARNING - Using BentoML installed in `editable` model, the local BentoML repository including all code changes will be packaged together with saved bundle created, under the './bundled_pip_dependencies' directory of the saved bundle.\n",
+      "[2021-03-07 04:12:02,814] INFO - Detected non-PyPI-released BentoML installed, copying local BentoML modulefiles to target saved bundle path..\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "warning: no previously-included files matching '*~' found anywhere in distribution\n",
+      "warning: no previously-included files matching '*.pyo' found anywhere in distribution\n",
+      "warning: no previously-included files matching '.git' found anywhere in distribution\n",
+      "warning: no previously-included files matching '.ipynb_checkpoints' found anywhere in distribution\n",
+      "warning: no previously-included files matching '__pycache__' found anywhere in distribution\n",
+      "warning: no directories found matching 'bentoml/yatai/web/dist'\n",
+      "no previously-included directories found matching 'e2e_tests'\n",
+      "no previously-included directories found matching 'tests'\n",
+      "no previously-included directories found matching 'benchmark'\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "UPDATING BentoML-0.11.0+33.g7e83376/bentoml/_version.py\n",
+      "set BentoML-0.11.0+33.g7e83376/bentoml/_version.py to '0.11.0+33.g7e83376'\n"
      ]
     }
    ],
    "source": [
-    "client.add(\n",
-    "    function=predict_function,\n",
-    "    model=sentiment_lr\n",
-    ")"
+    "print('Uploading model...')\n",
+    "client.add_model(function=predict_function, model=sentiment_lr)\n",
+    "print('Complete.')\n",
+    "\n",
+    "print('\\nUploading dataset (from file)...')\n",
+    "response_i = client.add_dataset(df_train_file_path, df_train_name)\n",
+    "print(f'Complete. Response: {response_i}')\n",
+    "\n",
+    "print('\\nUploading dataset (from data frame)...')\n",
+    "response_j = client.add_dataframe(df_test, df_test_name)\n",
+    "print(f'Complete. Response: {response_j}')"
    ]
   },
   {
@@ -299,6 +326,14 @@
    "metadata": {},
    "outputs": [],
    "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "corporate-azerbaijan",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

unboxapi/__init__.py

Lines changed: 44 additions & 36 deletions
@@ -1,51 +1,59 @@
-import os
-import getpass
-import tarfile
-import pyrebase
-import bentoml
+import bentoml, getpass, os, pandas as pd, tarfile, tempfile, uuid
+
 from bentoml.saved_bundle.bundler import _write_bento_content_to_dir
 from bentoml.utils.tempdir import TempDirectory
 
+from .lib.network import FlaskAPI, FirebaseAPI
 from .template import create_template_model
 
 
 class UnboxClient(object):
-    def __init__(self):
-        self.authenticate()
 
-    def add(self, function, model):
-        bento_service = create_template_model("sklearn", "text")
+    # Public functions
+
+    def __init__(self, email: str = None, password: str = None):
+        self.flask_api = FlaskAPI()
+        self.firebase_api = FirebaseAPI(email=email, password=password)
+
+
+    def add_model(self, function, model):
+        bento_service = create_template_model('sklearn', 'text')
         bento_service.pack('model', model)
         bento_service.pack('function', function)
 
         with TempDirectory() as temp_dir:
             _write_bento_content_to_dir(bento_service, temp_dir)
+
             with TempDirectory() as tarfile_dir:
-                file_name = f'{bento_service.name}.tar'
-                tarfile_path = f'{tarfile_dir}/{file_name}'
-                with tarfile.open(tarfile_path, mode="w:gz") as tar:
+                model_id = str(uuid.uuid1())
+                tarfile_path = f'{tarfile_dir}/{model_id}'
+
+                with tarfile.open(tarfile_path, mode='w:gz') as tar:
                     tar.add(temp_dir, arcname=bento_service.name)
-                self.upload(
-                    f"users/{self.user['localId']}/models/{file_name}", tarfile_path)
-
-    def upload(self, remote_path, file_path):
-        storage = self.firebase.storage()
-        storage.child(remote_path).put(file_path, self.user['idToken'])
-
-    def authenticate(self):
-        config = {
-            "apiKey": "AIzaSyAKlGQOmXTjPQhL1Uvj-Jr-_jUtNWmpOgs",
-            "authDomain": "unbox-ai.firebaseapp.com",
-            "databaseURL": "https://unbox-ai.firebaseio.com",
-            "storageBucket": "unbox-ai.appspot.com"
-        }
-
-        self.firebase = pyrebase.initialize_app(config)
-
-        # Get a reference to the auth service
-        auth = self.firebase.auth()
-
-        # Log the user in
-        email = input("What is your Unbox email?")
-        password = getpass.getpass("What is your Unbox password?")
-        self.user = auth.sign_in_with_email_and_password(email, password)
+
+                user_id = self.firebase_api.user['localId']
+                remote_path = f'users/{user_id}/models/{model_id}'
+                self.firebase_api.upload(remote_path, tarfile_path)
+
+
+    def add_dataset(self, file_path: str, name: str):
+        # For now, let's upload straight to Firebase Storage from here
+        user_id = self.firebase_api.user['localId']
+        dataset_id = str(uuid.uuid1())
+        remote_path = f'users/{user_id}/datasets/{dataset_id}'
+        self.firebase_api.upload(remote_path, file_path)
+
+        # And then set the metadata via request to our Flask API
+        id_token = self.firebase_api.user['idToken']
+        response = self.flask_api.upload_dataset_metadata(user_id,
+                                                          dataset_id,
+                                                          name,
+                                                          id_token)
+        return response.json()
+
+
+    def add_dataframe(self, df: pd.DataFrame, name: str):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dataset_file_path = os.path.join(tmp_dir, str(uuid.uuid1()))
+            df.to_csv(dataset_file_path, index=False)
+            return self.add_dataset(dataset_file_path, name)
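
FlaskAPI and FirebaseAPI come from the new .lib.network module, presumably one of the two changed files not shown in this extract, so they are visible here only through their call sites. The sketch below is a minimal, hypothetical reading of those call sites: the Firebase config is the one this commit deletes from UnboxClient.authenticate, while FLASK_BASE_URL, the interactive-prompt fallback, and the request shape in upload_dataset_metadata are assumptions, not the committed code.

# Hypothetical sketch of unboxapi/lib/network.py, inferred from the
# UnboxClient call sites above. Not the committed implementation.
import getpass

import pyrebase
import requests

FLASK_BASE_URL = 'http://localhost:5000'  # assumption: real endpoint unknown


class FirebaseAPI:
    # Config carried over from the authenticate() method this commit removes
    CONFIG = {
        'apiKey': 'AIzaSyAKlGQOmXTjPQhL1Uvj-Jr-_jUtNWmpOgs',
        'authDomain': 'unbox-ai.firebaseapp.com',
        'databaseURL': 'https://unbox-ai.firebaseio.com',
        'storageBucket': 'unbox-ai.appspot.com'
    }

    def __init__(self, email: str = None, password: str = None):
        self.firebase = pyrebase.initialize_app(self.CONFIG)
        auth = self.firebase.auth()
        # Assumed fallback to the old interactive prompts when the caller
        # passes no credentials (UnboxClient defaults both to None)
        email = email or input('What is your Unbox email?')
        password = password or getpass.getpass('What is your Unbox password?')
        self.user = auth.sign_in_with_email_and_password(email, password)

    def upload(self, remote_path: str, file_path: str):
        # Same pyrebase storage call the old UnboxClient.upload() made
        storage = self.firebase.storage()
        storage.child(remote_path).put(file_path, self.user['idToken'])


class FlaskAPI:
    def upload_dataset_metadata(self, user_id, dataset_id, name, id_token):
        # Assumed shape: POST the dataset's metadata, authenticated with the
        # Firebase ID token; UnboxClient only relies on .json() of the result
        return requests.post(f'{FLASK_BASE_URL}/users/{user_id}/datasets',
                             json={'id': dataset_id, 'name': name},
                             headers={'Authorization': f'Bearer {id_token}'})

Splitting auth and transport out of UnboxClient is what lets the notebook pass credentials programmatically (UnboxClient(email=..., password=...)) instead of blocking on input()/getpass prompts, which is also why the committed notebook cell no longer records prompt output.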
