In [19]:
import mlflow
from mlflow.models.signature import infer_signature
import pandas as pd
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import mlflow.pyfunc
import mlflow.spark
from mlflow.models.signature import infer_signature

import numpy as np
from datetime import datetime
import pyspark
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window, WindowSpec
from pyspark.sql.functions import struct, col, pandas_udf, udf, lit
from synapse.ml.predict import MLFlowTransformer
from tqdm import tqdm
import warnings

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 24, Finished, Available, Finished)

In [20]:
# Load data into pandas DataFrame from "/lakehouse/default/Files/archive/tumor.csv"
dataset = pd.read_csv("/lakehouse/default/Files/archive/tumor.csv")
display(dataset)

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 25, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 383217f2-a620-4ba7-9930-6904f4bdb92f)

In [21]:
EXPERIMENT_NAME = "cancer_classification_model_exp"
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.autolog(exclusive=False)

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 26, Finished, Available, Finished)

2025/01/03 06:50:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.


In [22]:
#removing values for endpoints to work#
X = dataset.iloc[:, 1:-1] # excluding the first column which is essentially a primary key
y = dataset.iloc[:, -1] # last column is the label (result)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 27, Finished, Available, Finished)

In [23]:
# init mlflow wrapper for RF Classification model
class CancerClassificationModelWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        self.model = model

    def predict(self,context,model_input):
        return self.model.predict(model_input)

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 28, Finished, Available, Finished)

In [24]:
model_name = "cancer_classification_model"
mlflow.sklearn.autolog(registered_model_name=model_name)
rf_classifier = RandomForestClassifier()
with mlflow.start_run(run_name=model_name) as run:
    rf_classifier_run_id = run.info.run_id # Capture run_id for model prediction later
    rf_classifier.fit(X_train, y_train)
    wrappedModel = CancerClassificationModelWrapper(rf_classifier)
    y_pred = rf_classifier.predict(X_test)

    y_pred_wrapped = wrappedModel.predict(None,X_test)
    
    # Log model parameters and metrics
    mlflow.log_params(rf_classifier.get_params())
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred_wrapped))

    signature = infer_signature(X_train, y_pred)

    print (signature)


    mlflow.sklearn.log_model(artifact_path=model_name, sk_model=rf_classifier, signature=signature, registered_model_name=model_name)

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 29, Finished, Available, Finished)

Registered model 'cancer_classification_model' already exists. Creating a new version of this model...
2025/01/03 06:50:17 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: cancer_classification_model, version 33
Created version '33' of model 'cancer_classification_model'.
Registered model 'cancer_classification_model' already exists. Creating a new version of this model...
2025/01/03 06:50:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: cancer_classification_model, version 34
Created version '34' of model 'cancer_classification_model'.


inputs: 
  ['Clump Thickness': long, 'Uniformity of Cell Size': long, 'Uniformity of Cell Shape': long, 'Marginal Adhesion': long, 'Single Epithelial Cell Size': long, 'Bare Nuclei': long, 'Bland Chromatin': long, 'Normal Nucleoli': long, 'Mitoses': long]
outputs: 
  [Tensor('int64', (-1,))]
params: 
  None



In [25]:
print("Accuracy: ", accuracy_score(y_test, y_pred))

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 30, Finished, Available, Finished)

Accuracy:  0.9708029197080292


In [26]:
tqdm.pandas()

warnings.filterwarnings("ignore")

model = MLFlowTransformer(
   # inputCols=(X_test.columns.tolist()),
   # outputCol='Class',
    modelName='cancer_classification_model',
    modelVersion=34
)

apply_model_udf = model.to_udf()  
# Apply the model to the new data
columns = ['Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses']

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 31, Finished, Available, Finished)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 33, Finished, Available, Finished)

In [27]:
# # Convert Pandas DataFrame to Spark DataFrame
spark_df = spark.createDataFrame(X_test)
#spark_df.show()
dataset_new = spark_df.withColumn("cancer_m_or_b_pred",apply_model_udf(*[col(f) for f in columns]))
display (dataset_new)

StatementMeta(, db4cc422-d00b-4c6a-8b39-dfbb9b804c00, 32, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e6330c23-c1d8-4215-95f5-333676d333af)