In [0]:
# Fully qualified name of the gold table that feeds this notebook.
gold_table = "nlp_dev.gold.fasttext_gold"
df_gold = spark.table(gold_table)

# Pull five rows to the driver and render them as a quick sanity check.
preview_rows = df_gold.take(5)
display(preview_rows)


In [0]:
# Carve the gold table into its three partitions using the `split` column.
# Train and validation are cached; the test partition is left uncached.
df_train = df_gold.where("split = 'train'").cache()
df_test = df_gold.where("split = 'test'")
df_val = df_gold.where("split = 'val'").cache()

# Report the row count of each partition, in train / test / val order.
for caption, split_df in (
    ("df_train Count:", df_train),
    ("df_test Count:", df_test),
    ("df_val Count:", df_val),
):
    print(caption, split_df.count())

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.spark
from pyspark.sql import functions as F



# Workspace path of the MLflow experiment that collects this notebook's runs.
experiment_path = '/Users/skneelam.2305@gmail.com/nlp_fasttext_experiments'
mlflow.set_experiment(experiment_path)


In [0]:


# Enable autologging
mlflow.spark.autolog()

# Some reasonable hyperparameters for a baseline
max_iter = 50
reg_param = 0.0
elastic_net = 0.0

# Column the model reads at inference time. Kept in one place so that the
# estimator and the logged signature cannot drift apart.
features_col = "tfidf_features"

with mlflow.start_run(run_name='lr_tfidf_baseline') as run:
    # 1) Define model
    lr = LogisticRegression(
        featuresCol=features_col,
        labelCol="label",
        maxIter=max_iter,
        regParam=reg_param,
        elasticNetParam=elastic_net
    )

    # 2) Fit on train
    lr_model = lr.fit(df_train)

    # 3) Predict on validation set
    df_val_pred = lr_model.transform(df_val)

    # 4) Evaluate (both evaluators compare `label` against `prediction`)
    evaluator_acc = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="accuracy"
    )

    evaluator_f1 = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="f1"
    )

    val_accuracy = evaluator_acc.evaluate(df_val_pred)
    val_f1 = evaluator_f1.evaluate(df_val_pred)

    print("Validation accuracy:", val_accuracy)
    print("Validation F1      :", val_f1)

    # 5) Log params & metrics (batched: one call each instead of five)
    mlflow.log_params({
        "maxIter": max_iter,
        "regParam": reg_param,
        "elasticNetParam": elastic_net,
    })
    mlflow.log_metrics({
        "val_accuracy": val_accuracy,
        "val_f1": val_f1,
    })

    # 6) Build a signature from a small sample.
    # Only the feature column is kept: inferring the signature from the full
    # training frame would record `label`, `split` and every other gold-table
    # column as required model inputs, so callers would have to supply the
    # ground-truth label just to score a row.
    sample_in = df_train.select(features_col).limit(100)
    sample_out = lr_model.transform(sample_in).select("prediction")

    signature = infer_signature(sample_in, sample_out)

    # 7) Explicitly log the Spark model WITH signature
    mlflow.spark.log_model(
        spark_model=lr_model,
        artifact_path="model",
        signature=signature
    )

    # Keep the run id so a later cell can build the `runs:/...` model URI.
    run_id = run.info.run_id

run_id



In [0]:
# Register the model artifact logged by the training run under a
# three-part registry name.
model_name = "nlp_dev.ml.fasttext_sentiment_lr"
model_uri = f"runs:/{run_id}/model"
registered_model = mlflow.register_model(name=model_name, model_uri=model_uri)

# Echo what the registry returned for the newly created version.
print("Registered model:")
for caption, value in (
    ("Name :", registered_model.name),
    ("Version:", registered_model.version),
):
    print(caption, value)

