# Model Training with MLflow #

Before run the code, start the mlflow server in terminal using script.  

mlflow ui --port 8080 --backend-store-uri sqlite:///mlruns.db --default-artifact-root ./mlruns

may be use "mlflow server --host 127.0.0.1 --port 8080" ?

In [1]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

from mlflow.models.signature import infer_signature

spark = SparkSession.builder.appName("ModelTrainingMLflow").getOrCreate()

data_path = "data/safety_dataset_filtered.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

df = df.drop("bookingID")

df = df.withColumn("label", col("label").cast("integer"))

feature_cols = [col_name for col_name in df.columns if col_name != "label"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
df = scaler.fit(df).transform(df)

df = df.select("scaled_features", "label")

df = df.withColumnRenamed("scaled_features", "features")

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# print(train_df.head(5))

rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50)

In [3]:
# Train the model
rf_model = rf.fit(train_df)

# Make predictions
rf_preds = rf_model.transform(test_df)

# Evaluate the model
# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(rf_preds)
print(f"Accuracy: {accuracy:.4f}")

# Precision
precision_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision = precision_evaluator.evaluate(rf_preds)
print(f"Precision: {precision:.4f}")

# Recall
recall_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall = recall_evaluator.evaluate(rf_preds)
print(f"Recall: {recall:.4f}")

# F1-score
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score = f1_evaluator.evaluate(rf_preds)
print(f"F1 Score: {f1_score:.4f}")

# AUC-ROC
auc_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc_roc = auc_evaluator.evaluate(rf_preds)
print(f"AUC-ROC: {auc_roc:.4f}")

print("-----------------\n")

Accuracy: 0.7774
Precision: 0.7722
Recall: 0.7774
F1 Score: 0.7119
AUC-ROC: 0.7133
-----------------



In [4]:
import mlflow.spark

with mlflow.start_run() as run:
    print(f"Artifact URI: {run.info.artifact_uri}")

    # Log model parameters
    mlflow.log_param("num_trees", 50)
    mlflow.log_param("seed", 42)
    
    mlflow.log_metric("accuracy", accuracy)  # Log accuracy as a metric
    mlflow.log_metric("Precision", precision)  # Log precision as a metric
    mlflow.log_metric("Recall", recall)  # Log recall as a metric
    mlflow.log_metric("f1_score", f1_score)  # Log f1_score as a metric
    mlflow.log_metric("AUC-ROC", auc_roc)  # Log AUC-ROC as a metric
    
    signature = infer_signature(train_df.select("features"), train_df.select("label"))
    print(signature)
    
    print("Start logging model ...")

    # Log the trained model
    model_info = mlflow.spark.log_model(
        spark_model=rf_model,
        artifact_path="random_forest_model",
        signature=signature,
        registered_model_name="random-forest-model",
    )
    
    print(f"Model logged at: {model_info.model_uri}")
    

Artifact URI: file:C:/Users/vfre/PycharmProjects/BigDataProject/mlruns/0/7a385218478b4001a0d412b5431b24d4/artifacts
inputs: 
  ['features': SparkML vector (required)]
outputs: 
  ['label': integer (required)]
params: 
  None

Start logging model ...


Registered model 'random-forest-model' already exists. Creating a new version of this model...
2025/03/21 16:38:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-model, version 3


Model logged at: runs:/7a385218478b4001a0d412b5431b24d4/random_forest_model
üèÉ View run traveling-mole-360 at: http://127.0.0.1:8080/#/experiments/0/runs/7a385218478b4001a0d412b5431b24d4
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/0


Created version '3' of model 'random-forest-model'.
