In [16]:
import pandas as pd
import os
import joblib
from azureml.core import Workspace, Experiment, Dataset, Datastore, Model

In [2]:
### train ML model  -- code from before
### --------------------

df = pd.read_csv("./data/german_credit_dataset.csv").drop('Sno', axis = 1)

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

y_raw = df['Risk']
X_raw = df.drop('Risk', axis=1)

categorical_features = X_raw.select_dtypes(include=['object']).columns
numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('onehotencoder', OneHotEncoder(categories='auto', sparse=False))])

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

feature_engineering_pipeline = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ], remainder="drop")

# Encode Labels
le = LabelEncoder()
encoded_y = le.fit_transform(y_raw)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_raw, encoded_y, test_size=0.30, stratify=encoded_y, random_state=42)

# Create sklearn pipeline
lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
                         ('classifier', LogisticRegression(solver="lbfgs", random_state = 42, penalty='l2'))])

In [3]:
### connecting to ML workspace
### --------------------

subscription_id = '8c386bb9-fbb5-45dd-a9cd-2ca847235881'
resource_group = 'rg-aml-ws-ga'
workspace_name = 'aml-ws-ga'

ws = Workspace(subscription_id, resource_group, workspace_name)

#clf = joblib.load('model.pkl')

In [17]:
### log experiment
### --------------------

experiment_name = 'german_credit_hsg'
experiment = Experiment(ws, experiment_name)

run = experiment.start_logging()

# Train the model
lr_clf.fit(X_train, y_train)

# Capture metrics
train_acc = lr_clf.score(X_train, y_train)
test_acc = lr_clf.score(X_test, y_test)
print("Training accuracy: %.3f" % train_acc)
print("Test data accuracy: %.3f" % test_acc)
print("Recall for class 'Bad': ", recall_score(y_test, lr_clf.predict(X_test), pos_label=0))

# Log to Azure ML
run.log('Train accuracy', train_acc)
run.log('Test accuracy', test_acc)
run.log('Recall', recall_score(y_test, lr_clf.predict(X_test), pos_label=0))
#run.add_properties({'datasets': dataset})
    
run.complete()

Training accuracy: 0.749
Test data accuracy: 0.730
Recall for class 'Bad':  0.35555555555555557


In [20]:
#os.remove('model.pkl')
joblib.dump(value=lr_clf, filename='model.pkl')

['model.pkl']

In [21]:
run.upload_file(name = 'model.pkl', path_or_stream = './model.pkl')

<azureml._restclient.models.batch_artifact_content_information_dto.BatchArtifactContentInformationDto at 0x1a7c35da708>

In [22]:
dataset = Dataset.get_by_name(ws, 'german_credit_hsg')

In [23]:
### register model
### --------------------

model = run.register_model(model_name='german-credit-hsg',
                           model_path='model.pkl',
                           datasets=[['training-dataset', dataset]],
                           tags={"use": "demo"})

In [24]:
os.remove('model.pkl')

In [25]:
model = Model(workspace = ws, name = 'german-credit-hsg')

In [30]:
model.download(exist_ok = True)
loaded_model = joblib.load('model.pkl')

In [33]:
df.shape

(1000, 10)

In [31]:
loaded_model.predict(df)

array([1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,

In [10]:
### store & upload model
### --------------------

os.remove('model.pkl') 
joblib.dump(value=lr_clf, filename='model.pkl')

# Upload our model to our experiment
run.upload_file(name = 'model.pkl', path_or_stream = './model.pkl')

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'model.pkl'

In [None]:
details = run.get_details()
details

In [None]:
run.add_properties({'datasets': dataset})

In [None]:
run.

In [None]:
### register dataset
### --------------------

datastore = ws.get_default_datastore()
datastore.upload_files(files = ['data/german_credit_dataset.csv'], overwrite = True, show_progress = True)

dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'german_credit_dataset.csv')])
dataset.register(ws, name = 'german_credit_hsg', tags = {'purpose': 'demo'}, create_new_version = True)

In [None]:
### register model
### --------------------

model = run.register_model(model_name='german-credit-hsg',
                           model_path='outputs/model.pkl',
                           datasets=[['training-dataset', dataset]],
                           tags={"use": "demo"})

In [None]:
df[:5]

In [None]:
lr_clf.fit(X_train, y_train)

In [None]:
lr_clf.predict(df[:5])