**Company Name:**
- Major Hospital

**Problem Type:**

- Classification (Multi Class)

**Problem:**
- The company wants to automate the classification of patients according to whether they have hepatitis and, if so, which category of hepatitis they have.

**Goal:**
- Predict each patient's diagnosis category from the following details (the features we will use to predict):
  - X (Patient ID/No.)
  - Age (in years)
  - Sex (f,m)
  - ALB
  - ALP
  - ALT
  - AST
  - BIL
  - CHE
  - CHOL
  - CREA
  - GGT
  - PROT


- These features will let us determine the target variable, which is:
  - Category (diagnosis) (values: '0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis', '2=Fibrosis', '3=Cirrhosis')
  - We have encoded the Category column, so the values 0, 1, 2, 3 and 4 correspond, respectively, to '0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis', '2=Fibrosis', '3=Cirrhosis'

In [14]:
import pandas as pd
import numpy as np

#import main libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import accuracy_score
from optuna.samplers import TPESampler
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from scipy.stats import trim_mean
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [15]:
# Seed NumPy's global random number generator so that code relying on it
# produces the same results on every Restart & Run All.
np.random.seed(42)

## Task 1: Understand the training code 

In [16]:
# Load the training and test splits from the local parquet files.
train_path, test_path = 'data/train.parquet', 'data/test.parquet'
train = pd.read_parquet(train_path)
test = pd.read_parquet(test_path)

In [17]:
# Separate each split into a feature frame (X) and a target series (Y).
# NOTE(review): `le` is created here but not used in any of the cells shown.
le = LabelEncoder()
target_column_name = 'Category'

# Training split: every column except the target becomes a feature.
X_train, Y_train = train.drop(columns=[target_column_name]), train[target_column_name]

# Test split: same separation as for the training split.
X_test, Y_test = test.drop(columns=[target_column_name]), test[target_column_name]

In [18]:
# Display the training target series as a quick sanity check.
Y_train

0      0
1      0
2      0
3      0
4      0
      ..
487    0
488    0
489    0
490    0
491    0
Name: Category, Length: 492, dtype: int64

In [19]:



# Build the preprocessing + random-forest pipeline, impute missing numeric
# values, train on the training split and report accuracy on the test split.

# Transform string data to numeric one-hot vectors.
# Categories not seen during fit are ignored at prediction time.
categorical_selector = selector(dtype_exclude=np.number)
categorical_columns = categorical_selector(X_train)
categorial_encoder = OneHotEncoder(handle_unknown='ignore')

# Standardize numeric data by removing the mean and scaling to unit variance
numerical_selector = selector(dtype_include=np.number)
numerical_columns = numerical_selector(X_train)
numerical_encoder = StandardScaler()

# Impute missing numeric values with the 10% trimmed mean of each column.
#  * The fill value is computed from the TRAINING split only and reused for the
#    test split, so the test data is transformed exactly like the data the
#    model was fitted on and no test-set statistics leak into preprocessing.
#  * NaNs are dropped before calling trim_mean because it does not ignore them
#    by default; leaving them in skews the statistic or turns it into NaN.
#  * fillna is applied on the whole frame and the result is re-bound, instead
#    of `X[col].fillna(..., inplace=True)`, which is chained in-place
#    assignment (deprecated in recent pandas, a no-op under copy-on-write).
fill_values = {
    col: trim_mean(X_train[col].dropna(), 0.1) for col in numerical_columns
}
X_train = X_train.fillna(value=fill_values)
X_test = X_test.fillna(value=fill_values)

# Create a preprocessor that will preprocess both numeric and categorical data
preprocessor = ColumnTransformer([
    ('categorical-encoder', categorial_encoder, categorical_columns),
    ('standard_scaler', numerical_encoder, numerical_columns),
])

# Chain preprocessing and the classifier so both are fitted and applied together.
rf = make_pipeline(preprocessor, RandomForestClassifier())

print('Training model...')

# Pipeline.fit returns the fitted pipeline itself, so `model` and `rf` refer
# to the same object.
model = rf.fit(X_train, Y_train)

print('Accuracy score: ', rf.score(X_test, Y_test))


Training model...
Accuracy score:  0.8780487804878049


## Task 2: Create a cloud client

In [20]:
# Create the Azure ML client that the following cells use to talk to the workspace.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
# Credential object handed to the client for authentication.
credential = DefaultAzureCredential()
# Workspace details are read from a config file (the cell output reports ./config.json).
ml_client = MLClient.from_config(credential=credential)

Found the config file in: ./config.json


## Task 3: Register the training and test data

In [21]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Names under which the two parquet files are registered as workspace data assets.
train_data_name = 'hepatitis_c_train_parquet'
test_data_name = 'hepatitis_c_test_parquet'

# Describe each local file as a single-file (URI_FILE) data asset.
training_data = Data(
    name=train_data_name,
    path='data/train.parquet',
    type=AssetTypes.URI_FILE,
    description='RAI hepatitis c train data',
)
test_data = Data(
    name=test_data_name,
    path='data/test.parquet',
    type=AssetTypes.URI_FILE,
    description='RAI hepatitis c test data',
)

# Create (or update) the assets in the workspace: training data first, then test data.
tr_data = ml_client.data.create_or_update(training_data)
ts_data = ml_client.data.create_or_update(test_data)

[32mUploading train.parquet[32m (< 1 MB): 100%|██████████| 25.2k/25.2k [00:00<00:00, 146kB/s]
[39m

[32mUploading test.parquet[32m (< 1 MB): 100%|██████████| 14.5k/14.5k [00:00<00:00, 87.1kB/s]
[39m



## Create a compute cluster

In [22]:
from azure.ai.ml.entities import AmlCompute
import time

# Name of the compute cluster; the training job refers to it by this name.
compute_name = 'trainingcompute'

# Cluster definition: VM size, instance-count bounds and the idle time before
# scale-down (presumably in seconds - confirm against the SDK documentation).
compute_settings = {
    'name': compute_name,
    'size': 'Standard_DS12_v2',
    'min_instances': 0,
    'max_instances': 4,
    'idle_time_before_scale_down': 3600,
}
my_compute = AmlCompute(**compute_settings)

# Start the create/update operation and wait for its result; the returned
# compute object is shown as the cell output.
compute_poller = ml_client.compute.begin_create_or_update(my_compute)
compute_poller.result()

AmlCompute({'type': 'amlcompute', 'created_on': None, 'provisioning_state': 'Succeeded', 'provisioning_errors': None, 'name': 'trainingcompute', 'description': None, 'tags': None, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/da96cce7-575d-4d7b-a46c-7f24618285e4/resourceGroups/rg-ms-learn/providers/Microsoft.MachineLearningServices/workspaces/azmlwksp/computes/trainingcompute', 'Resource__source_path': None, 'base_path': '/workspaces/Azure-ResponsibleAI-Dashboard-Guide/code/classification', 'creation_context': None, 'serialize': <msrest.serialization.Serializer object at 0x7ff3fec249d0>, 'resource_id': None, 'location': 'eastus2', 'size': 'Standard_DS12_v2', 'min_instances': 0, 'max_instances': 4, 'idle_time_before_scale_down': 3600.0, 'identity': None, 'ssh_public_access_enabled': True, 'ssh_settings': None, 'network_settings': <azure.ai.ml.entities._compute.compute.NetworkSettings object at 0x7ff3fec243d0>, 'tier': 'dedicated', 'enable_node_public_ip': True, 'subnet'

## Create the job

In [23]:
from azure.ai.ml import command, Input, Output

target_column_name = 'Category'

# Inputs: the latest version of the registered training data asset and the
# name of the target column. Output: an MLflow model.
job_inputs = {
    'training_data': Input(type='uri_file', path=f'{train_data_name}@latest'),
    'target_column_name': target_column_name,
}
job_outputs = {
    'model_output': Output(type=AssetTypes.MLFLOW_MODEL),
}

# Command line run inside the job; the ${{...}} placeholders refer to the
# inputs and outputs declared above.
train_command = (
    'python train.py '
    '--training_data ${{inputs.training_data}} '
    '--target_column_name ${{inputs.target_column_name}} '
    '--model_output ${{outputs.model_output}}'
)

# Create the job
job = command(
    description='Trains hepatitis c model',
    experiment_name='hepatitis_c_test',
    compute=compute_name,
    inputs=job_inputs,
    outputs=job_outputs,
    code='src/',
    environment='azureml://registries/azureml/environments/responsibleai-ubuntu20.04-py38-cpu/versions/37',
    command=train_command,
)

# Submit the job; `job` is re-bound to the object returned by the service,
# whose name is then used to stream the run's output into the notebook.
job = ml_client.jobs.create_or_update(job)
ml_client.jobs.stream(job.name)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


[32mUploading src (0.0 MBs): 100%|██████████| 2646/2646 [00:00<00:00, 29436.91it/s]
[39m



RunId: careful_onion_d6nzrkvw5t
Web View: https://ml.azure.com/runs/careful_onion_d6nzrkvw5t?wsid=/subscriptions/da96cce7-575d-4d7b-a46c-7f24618285e4/resourcegroups/rg-ms-learn/workspaces/azmlwksp

Execution Summary
RunId: careful_onion_d6nzrkvw5t
Web View: https://ml.azure.com/runs/careful_onion_d6nzrkvw5t?wsid=/subscriptions/da96cce7-575d-4d7b-a46c-7f24618285e4/resourcegroups/rg-ms-learn/workspaces/azmlwksp



## Register the model

In [16]:
from azure.ai.ml.entities import Model

model_name = 'hepatitis_c_model'

# Location of the model produced by the training job, addressed via the job's
# `model_output` output.
model_path = f'azureml://jobs/{job.name}/outputs/model_output'

# Register the model.
# NOTE(review): this re-binds `model`, which the training cell earlier bound to
# the fitted scikit-learn pipeline.
model = Model(
    name=model_name,
    path=model_path,
    type=AssetTypes.MLFLOW_MODEL,
)
registered_model = ml_client.models.create_or_update(model)