### Prepare environment

In [0]:
%run ../environment/prepare_environment


### Embedding models training

This notebook trains two embedding models (one for the combined internet services and a second one for the payment method)
to support feature engineering on the Telco dataset.

Steps performed:
- take one or several categorical columns that describe a single concept (e.g. the service bundle),
- concatenate them into a single sequence,
- tokenize the sequence for `Word2Vec`,
- train a Spark `Word2Vec` model to learn co-occurrence patterns between services,
- log the model to MLflow with a clear input/output signature,
- register the model under a stable name so that the same transformation is reused in training and inference.

In [0]:
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import concat_ws, split, col
import mlflow
import mlflow.spark
from mlflow.models import infer_signature

# Train, log and register the Word2Vec embedding for the internet-service bundle.
#
# Reads the silver feature table, turns the service columns of each row into a
# token list, fits a 5-dimensional Spark Word2Vec model on those token lists and
# registers the fitted model in the MLflow model registry.

# Volume used by MLflow as temporary storage while it serializes the Spark model.
spark.sql("CREATE VOLUME IF NOT EXISTS ai_ml_in_practice.telco_customer_churn_silver.mlflow_tmp")

# Fixed seed so that re-running the notebook reproduces the same embeddings.
# Without it PySpark falls back to a default derived from Python's hash() of the
# class name, which can differ between interpreter sessions.
WORD2VEC_SEED = 42

with mlflow.start_run(run_name="categorical_word2vec"):
    # Columns that together describe the customer's internet-service bundle.
    categorical_cols = [
        "internet_service",
        "online_security",
        "online_backup",
        "device_protection",
        "tech_support",
        "streaming_tv",
        "streaming_movies"
    ]

    df = spark.table(
        "ai_ml_in_practice.telco_customer_churn_silver.telco_customer_features"
    )

    # Join the column values with ";" and split again on ";" to obtain one token
    # per column value (concat_ws skips NULL values).
    # NOTE(review): tokens carry only the value, not the column it came from, so
    # equal values in different columns map to the same token - confirm intended.
    df = (
        df
        .withColumn("categorical_sequence", concat_ws(";", *categorical_cols))
        .withColumn("categorical_tokens", split(col("categorical_sequence"), ";"))
    )

    word2vec = Word2Vec(
        vectorSize=5,
        minCount=0,  # keep every token, including rare ones
        seed=WORD2VEC_SEED,
        inputCol="categorical_tokens",
        outputCol="categorical_embedding"
    )

    model = word2vec.fit(df)

    # Signature: token-list column in, embedding column out.
    prediction_df = model.transform(df).select("categorical_embedding")
    signature = infer_signature(df.select("categorical_tokens"), prediction_df)

    mlflow.spark.log_model(
        model,
        signature=signature,
        artifact_path="word2vec_model",
        dfs_tmpdir="/Volumes/ai_ml_in_practice/telco_customer_churn_silver/mlflow_tmp"
    )

    # Build the runs:/ URI of the artifact just logged and register it.
    run_id = mlflow.active_run().info.run_id
    logged_model_uri = f"runs:/{run_id}/word2vec_model"

    mlflow.register_model(
        logged_model_uri,
        name="ai_ml_in_practice.telco_customer_churn_silver.telco_word2vec_internet_services"
    )

In [0]:
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import concat_ws, split, col, regexp_replace, col
import mlflow
import mlflow.spark
from mlflow.models import infer_signature

# Train, log and register the Word2Vec embedding for the payment method.
#
# Reads the silver feature table, strips parentheses from `payment_method`,
# splits the cleaned text on spaces into tokens, fits a 5-dimensional Spark
# Word2Vec model on those tokens and registers the fitted model in MLflow.

# Volume used by MLflow as temporary storage while it serializes the Spark model.
spark.sql("CREATE VOLUME IF NOT EXISTS ai_ml_in_practice.telco_customer_churn_silver.mlflow_tmp")

# Fixed seed so that re-running the notebook reproduces the same embeddings.
# Without it PySpark falls back to a default derived from Python's hash() of the
# class name, which can differ between interpreter sessions.
WORD2VEC_SEED = 42

with mlflow.start_run(run_name="payment_word2vec"):

    df = spark.table(
        "ai_ml_in_practice.telco_customer_churn_silver.telco_customer_features"
    )

    df = (
        df
        # Drop "(" and ")" so a parenthesized word yields the same token as the bare word.
        .withColumn("payment_normalized", regexp_replace(col('payment_method'), r'[()]', ''))
        # Tokenize the *normalized* text; splitting the raw `payment_method`
        # column would silently discard the normalization above.
        .withColumn("payment_tokens", split(col("payment_normalized"), " "))
    )

    word2vec = Word2Vec(
        vectorSize=5,
        minCount=0,  # keep every token, including rare ones
        seed=WORD2VEC_SEED,
        inputCol="payment_tokens",
        outputCol="payment_embedding"
    )

    model = word2vec.fit(df)

    # Signature: token-list column in, embedding column out.
    prediction_df = model.transform(df).select("payment_embedding")
    signature = infer_signature(df.select("payment_tokens"), prediction_df)

    mlflow.spark.log_model(
        model,
        signature=signature,
        artifact_path="word2vec_payment_model",
        dfs_tmpdir="/Volumes/ai_ml_in_practice/telco_customer_churn_silver/mlflow_tmp"
    )

    # Build the runs:/ URI of the artifact just logged and register it.
    run_id = mlflow.active_run().info.run_id
    logged_model_uri = f"runs:/{run_id}/word2vec_payment_model"

    mlflow.register_model(
        logged_model_uri,
        name="ai_ml_in_practice.telco_customer_churn_silver.telco_word2vec_payment_methods"
    )