## Setup
- Install required libraries
- Configure Databricks secrets

In [None]:
%pip install mlflow
%pip install xgboost

### Local config

When running locally, uncomment the cell below and set these environment variables.

In [None]:
# Local runs only: uncomment the lines below and replace each placeholder with
# your own value. Do not commit real credentials to version control.
# DATABRICKS_USERNAME is read later in this notebook to build the experiment
# path and the DBFS output directory.
# import os
# os.environ['DATABRICKS_HOST']="<redacted>"
# os.environ['DATABRICKS_TOKEN']="<redacted>"
# os.environ['DATABRICKS_USERNAME']="<redacted>"

### Configure MLflow to log to Databricks
- Create/Update an experiment in Databricks
- Enable autolog for xgboost so metrics/params are logged automatically

In [None]:
import mlflow
import os

# Point MLflow tracking at Databricks.
os.environ["MLFLOW_TRACKING_URI"] = "databricks"

# The experiment is stored under the current user's workspace folder, so the
# username must be available in the environment (raises KeyError otherwise).
databricks_username = os.environ["DATABRICKS_USERNAME"]
experiment_path = f"/Users/{databricks_username}/Experiments/mlops-experiment-1"

# Select the experiment, then turn on automatic logging for XGBoost training
# runs, including input examples.
mlflow.set_experiment(experiment_path)
mlflow.xgboost.autolog(log_input_examples=True)

## Data preprocessing

In [None]:
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Feature engineers the customer churn dataset."""
import logging
import numpy as np
import pandas as pd
import os
import random
from pathlib import Path

# Emit INFO-level progress messages through the root logger.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.info("Starting preprocessing.")

input_data_path = os.path.join("data/ml/processing/input", "churn.csv")

logger.info("Reading input data")

# Load the raw data, discard the columns that are not used as features, and
# store "Area Code" as an object column so it is one-hot encoded below rather
# than treated as a number.
unused_columns = ["Phone", "Day Charge", "Eve Charge", "Night Charge", "Intl Charge"]
df = (
    pd.read_csv(input_data_path)
    .drop(columns=unused_columns)
    .astype({"Area Code": object})
)

# One-hot encode every categorical column.
encoded = pd.get_dummies(df)

# Keep a single binary target ("Churn?_True.") and move it to the first
# column; its complement ("Churn?_False.") is redundant and is removed.
target_column = "Churn?_True."
feature_columns = encoded.columns.drop(["Churn?_False.", target_column])
model_data = encoded[[target_column, *feature_columns]]

# Pick the two cut points at random so that each run uses differently sized
# train/validation/test partitions. The chosen cut points are kept in
# `data_splits`, which is logged as an MLflow parameter later in the notebook.
# NOTE(review): the `random` module is not seeded here, so the partition sizes
# are not reproducible across runs and the training partition can be very
# small — confirm this is intended.
train_data_boundary = random.random() * len(model_data)
remaining = len(model_data) - train_data_boundary
test_data_boundary = len(model_data) - (random.random() * remaining)
data_splits = [int(train_data_boundary), int(test_data_boundary)]

# Shuffle the rows with a fixed seed, then slice by position. Plain `.iloc`
# slices produce the same three partitions as `np.split(frame, data_splits)`
# without going through the deprecated `DataFrame.swapaxes` path that
# `np.split` relies on when given a DataFrame.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end, test_start = data_splits
train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:test_start]
test_data = shuffled_data.iloc[test_start:]

# Choose where the splits are written: the per-user DBFS folder when the
# /dbfs mount is present, otherwise a path relative to the working directory.
if os.path.exists("/dbfs"):
    output_data_path = "/dbfs/{}".format(databricks_username)
else:
    output_data_path = "data/ml/processing/"

# Create every output sub-directory up front for BOTH locations. Previously
# this only happened on the DBFS branch, so a local run failed in `to_csv`
# whenever the folders did not already exist. "output" is used later for the
# predictions file.
for subdirectory in ("train", "test", "validation", "output"):
    Path(output_data_path, subdirectory).mkdir(parents=True, exist_ok=True)

# Write each partition as a header-less CSV without the index column.
train_data.to_csv("{}/train/train.csv".format(output_data_path), header=False, index=False)
validation_data.to_csv("{}/validation/validation.csv".format(output_data_path), header=False, index=False)
test_data.to_csv("{}/test/test.csv".format(output_data_path), header=False, index=False)

## Train model

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score

# Gradient-boosted classifier; log-loss is the metric evaluated on `eval_set`.
model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")

# "Churn?_True." is the binary label; every other column is a feature.
label_column = "Churn?_True."

X_train = train_data.drop(label_column, axis=1)
y_train = train_data[label_column]

X_test = test_data.drop(label_column, axis=1)
y_test = test_data[label_column]

# `np.savetxt` does not create missing directories, so make sure the output
# folder exists before the run starts (it may be absent on a local run).
predictions_path = "{}/output/predictions.csv".format(output_data_path)
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)

with mlflow.start_run():
    # NOTE(review): the test partition doubles as the evaluation set during
    # fitting, and `validation_data` is never used for training or evaluation
    # — confirm this is intended.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    y_pred = model.predict(X_test)

    # Record the randomly chosen cut points alongside the run.
    mlflow.log_params({
        "data_splits": data_splits
    })

    mlflow.log_metrics({
        "acc": accuracy_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred)
    })

    # Persist the predictions, then attach them and the three data splits to
    # the run as artifacts.
    np.savetxt(predictions_path, y_pred)
    for artifact_path in (
        predictions_path,
        "{}/train/train.csv".format(output_data_path),
        "{}/test/test.csv".format(output_data_path),
        "{}/validation/validation.csv".format(output_data_path),
    ):
        mlflow.log_artifact(artifact_path)