In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from google.cloud import bigquery
from google.cloud import storage
import joblib

import os
import pandas as pd

# Notebook-wide service clients reused by the cells below.
# Both are constructed with no arguments, so project and credentials are not
# set here — presumably resolved from the environment (TODO confirm).
bqclient = bigquery.Client()
storage_client = storage.Client()

In [2]:
def download_table(bq_table_uri: str):
    """Read every row of a BigQuery table into a DataFrame.

    Args:
        bq_table_uri: Table identifier in the form accepted by
            ``bigquery.TableReference.from_string``. A leading ``bq://``
            prefix is allowed and is stripped before parsing.

    Returns:
        Whatever ``to_dataframe`` yields for the listed rows, fetched through
        the module-level ``bqclient``.
    """
    scheme = "bq://"
    # Drop the optional scheme so only the bare table identifier is parsed.
    table_id = (
        bq_table_uri[len(scheme):]
        if bq_table_uri.startswith(scheme)
        else bq_table_uri
    )

    table_ref = bigquery.TableReference.from_string(table_id)
    row_iterator = bqclient.list_rows(table_ref)
    # No BigQuery Storage client is created for the download
    # (presumably rows then come through the regular API — confirm).
    return row_iterator.to_dataframe(create_bqstorage_client=False)

# BigQuery table that holds the full dataset
data_uri = "sara-vertex-demos.beans_demo.large_dataset"

# Load the table into pandas, then carve it into equal train / test halves.
# The fixed random_state keeps the split identical across runs.
df = download_table(data_uri)
train_df, test_df = train_test_split(
    df,
    test_size=0.5,
    random_state=42,
)

# pop() removes the "Class" target column from each frame in place, so the
# frames hold only feature columns by the time they are converted below.
labels = train_df.pop("Class").tolist()
test_labels = test_df.pop("Class").tolist()

# Remaining feature columns as plain nested lists (one inner list per row)
data = train_df.values.tolist()
test_data = test_df.values.tolist()

In [3]:
# Define and train the Scikit model
# Model 1: decision tree with default hyperparameters. random_state is pinned
# (same seed as the train/test split) so that re-running the notebook yields
# the same tree and the same accuracy; without it the fitted tree can differ
# between runs.
skmodel1 = DecisionTreeClassifier(random_state=42)
skmodel1.fit(data, labels)

# Evaluate on the held-out half of the data
score1 = skmodel1.score(test_data, test_labels)
print('Model 1 accuracy is:', score1)

Model 1 accuracy is: 0.8878930355568616


In [4]:
# Smoke-test model 1 on one hand-written sample of 16 feature values,
# passed as a single-row batch (a list containing one list).
skmodel1.predict([[
    23288, 558.113, 207.567738, 143.085693,
    1.450653336, 0.7244336162, 23545, 172.1952453,
    0.8045881703, 0.9890847314, 0.9395021523, 0.8295857874,
    0.008913077034, 0.002604069884, 0.6882125787, 0.9983578734,
]])

array(['DERMASON'], dtype='<U8')

In [4]:
# Model 2: logistic regression (liblinear solver) trained on the same data
skmodel2 = LogisticRegression(solver='liblinear')
skmodel2.fit(data, labels)

# Store the result under its own name; writing it to `score1` would overwrite
# model 1's accuracy and make the two models impossible to compare later.
score2 = skmodel2.score(test_data, test_labels)
print('Model 2 accuracy is:', score2)

Model 2 accuracy is: 0.8620334998530709


In [6]:
# Smoke-test model 2 on one hand-written sample of 16 feature values,
# passed as a single-row batch (a list containing one list).
skmodel2.predict([[
    23288, 558.113, 207.567738, 143.085693,
    1.450653336, 0.7244336162, 23545, 172.1952453,
    0.8045881703, 0.9890847314, 0.9395021523, 0.8295857874,
    0.008913077034, 0.002604069884, 0.6882125787, 0.9983578734,
]])

array(['DERMASON'], dtype='<U8')

In [5]:
# Serialize each trained model with joblib and upload the file to its own
# folder in the models bucket. Each entry is
# (fitted model, local file name, destination object URI); the destinations
# are plain strings because they contain no placeholders to format.
for trained_model, local_path, gcs_uri in (
    (skmodel1, 'model1.joblib', "gs://zencore-vertex-models/beans1/model.joblib"),
    (skmodel2, 'model2.joblib', "gs://zencore-vertex-models/beans2/model.joblib"),
):
    joblib.dump(trained_model, local_path)
    blob = storage.blob.Blob.from_string(gcs_uri, client=storage_client)
    blob.upload_from_filename(local_path)