## Setup
- Install required libraries
- Configure Databricks secrets

In [None]:
%pip install mlflow
%pip install xgboost

### Local config

In [None]:
# Optional local setup: uncomment the lines below to supply the Databricks
# workspace settings through environment variables. Replace each placeholder
# with your own value, and never commit a real token to version control.
# import os
# os.environ['DATABRICKS_HOST']="<redacted>"
# os.environ['DATABRICKS_TOKEN']="<redacted>"
# os.environ['DATABRICKS_USERNAME']="<redacted>"

### Configure MLflow to log to Databricks

In [3]:
import mlflow
import os

# Route all MLflow tracking calls to the Databricks-hosted tracking server.
# NOTE: Databricks credentials (host and token) must already be configured,
# otherwise MLflow raises InvalidConfigurationError when the experiment is set.
os.environ['MLFLOW_TRACKING_URI']="databricks"

# The experiment is stored under the user's workspace folder, so the username
# is mandatory. Fail with an actionable message instead of a bare KeyError.
try:
    databricks_username = os.environ['DATABRICKS_USERNAME']
except KeyError as exc:
    raise KeyError(
        "DATABRICKS_USERNAME is not set; define it in the environment "
        "before configuring the MLflow experiment"
    ) from exc

experiment_path = "/Users/{}/Experiments/mlops-experiment-1".format(databricks_username)

# Make this experiment the active one for every run started in this notebook.
mlflow.set_experiment(experiment_path)
# Let MLflow capture XGBoost training details automatically, including input examples.
mlflow.xgboost.autolog(log_input_examples=True)


InvalidConfigurationError: You haven't configured the CLI yet! Please configure by entering `/Users/lukehobbs/Library/Python/3.10/lib/python/site-packages/ipykernel_launcher.py configure`

## Data Preprocessing

In [7]:
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Feature engineers the customer churn dataset."""
import logging
import numpy as np
import pandas as pd
import os

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.info("Starting preprocessing.")

# Create each working directory independently. `exist_ok=True` tolerates
# directories left over from a previous run without hiding real failures
# (e.g. permission errors), and guarantees that one pre-existing directory
# does not prevent the remaining ones from being created.
for directory in (
    "data/ml/processing/input",
    "data/ml/processing/output",
    "data/ml/processing/train",
    "data/ml/processing/validation",
    "data/ml/processing/test",
):
    os.makedirs(directory, exist_ok=True)

input_data_path = os.path.join("data/ml/processing/input", "churn.csv")

logger.info("Reading input data")

# read csv
df = pd.read_csv(input_data_path)

# drop the "Phone" feature column
df = df.drop(["Phone"], axis=1)

# Change the data type of "Area Code" so it is one-hot encoded below
# instead of being treated as a number.
df["Area Code"] = df["Area Code"].astype(object)

# Drop several other columns
df = df.drop(["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"], axis=1)

# Convert categorical variables into dummy/indicator variables.
# NOTE(review): the dtype of the indicator columns depends on the installed
# pandas version (bool on pandas >= 2.0) - confirm that consumers of the
# CSV files written below accept the resulting values.
model_data = pd.get_dummies(df)

# Create one binary classification target column and move it to the front.
model_data = pd.concat(
    [
        model_data["Churn?_True."],
        model_data.drop(["Churn?_False.", "Churn?_True."], axis=1),
    ],
    axis=1,
)

# Shuffle deterministically, then split 70% / 20% / 10% into
# train / validation / test. Positional slicing replaces `np.split`, which
# relies on the deprecated `DataFrame.swapaxes` when given a DataFrame.
shuffled = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled))
validation_end = int(0.9 * len(shuffled))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Persist the splits without header or index columns.
train_data.to_csv("data/ml/processing/train/train.csv", header=False, index=False)
validation_data.to_csv(
    "data/ml/processing/validation/validation.csv", header=False, index=False
)
test_data.to_csv("data/ml/processing/test/test.csv", header=False, index=False)


## Train Model

In [28]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score

model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")

# The target column is "Churn?_True."; every other column is a feature.
X_train = train_data.drop("Churn?_True.", axis=1)
y_train = train_data["Churn?_True."]

X_test = test_data.drop("Churn?_True.", axis=1)
y_test = test_data["Churn?_True."]

predictions_path = "data/ml/processing/output/predictions.csv"

with mlflow.start_run():
    # NOTE(review): the test split is also used as the eval_set while
    # validation_data is not used in this cell - confirm this is intended.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    y_pred = model.predict(X_test)

    mlflow.log_metrics({
        "acc": accuracy_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred)
    })

    # Make sure the output directory exists before writing, so this cell does
    # not depend on directory creation having succeeded in an earlier cell.
    os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
    np.savetxt(predictions_path, y_pred)
    # Attach the predictions file to the active MLflow run.
    mlflow.log_artifact(predictions_path)

