In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Download the data

In [None]:
s3_uri = 's3://sagemaker-eu-west-1-934765130326/sagemaker/Churn-xgboost/data/small/churn-dataset.csv'
!aws s3 cp $s3_uri .

data = pd.read_csv("./churn-dataset.csv")
pd.set_option("display.max_columns", 500)
data

## Explore the data

In [None]:
# Frequency tables for each categorical feature
for column in data.select_dtypes(include=["object"]).columns:
    display(pd.crosstab(index=data[column], columns="% observations", normalize="columns"))

# Histograms for each numeric features
display(data.describe())
%matplotlib inline
hist = data.hist(bins=30, sharey=True, figsize=(10, 10))

In [None]:
# Remove the "Phone" column, then store "Area Code" with object dtype; later
# select_dtypes(include=["object"]) calls therefore treat it as categorical.
data = data.drop(columns=["Phone"])
data = data.astype({"Area Code": object})

### Visualize relationship between features and target variable


In [None]:
# Categorical features: distribution of each feature's levels within every "Churn?" class
object_columns = data.select_dtypes(include=["object"]).columns
for column in (name for name in object_columns if name != "Churn?"):
    levels_by_churn = pd.crosstab(
        index=data[column], columns=data["Churn?"], normalize="columns"
    )
    display(levels_by_churn)

# Non-object features: one 30-bin histogram per "Churn?" class
non_object_columns = data.select_dtypes(exclude=["object"]).columns
for column in non_object_columns:
    print(column)
    hist = data[[column, "Churn?"]].hist(by="Churn?", bins=30)
    plt.show()

In [None]:
# Pairwise correlations between the numeric features.
# `data` still contains object columns at this point (e.g. "Churn?" and
# "Area Code"); DataFrame.corr() no longer drops those silently on
# pandas >= 2.0 and raises instead, so restrict the frame to numeric/bool
# columns explicitly. This form works on old and new pandas alike.
numeric_data = data.select_dtypes(include=["number", "bool"])
display(numeric_data.corr())

# scatter_matrix selects the numeric columns itself, so it can take the full frame.
pd.plotting.scatter_matrix(data, figsize=(12, 12))
plt.show()

### Remove the charge features, which appear redundant with the other features (check the correlations above)

In [None]:
# Drop the four per-period charge columns.
charge_columns = ["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"]
data = data.drop(columns=charge_columns)

### One-hot encode categorical features

In [None]:
# One-hot encode every categorical column.
# The indicator dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which
# would turn the target and all indicator columns into True/False instead of
# 1/0 (and would be written that way to CSV).
model_data = pd.get_dummies(data, dtype=np.uint8)

# Keep "Churn?_True." as the single binary target in the first column and
# remove both one-hot target columns from the features.
model_data = pd.concat(
    [
        model_data["Churn?_True."],
        model_data.drop(columns=["Churn?_False.", "Churn?_True."]),
    ],
    axis=1,
)

### Split data

In [None]:
# Shuffle once with a fixed seed, then cut at 70% and 90% of the rows to get
# a 70/20/10 train/validation/test split.
# Positional slicing replaces np.split(DataFrame, ...), which relies on the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

## Transfer preprocessing code into scripts
After some initial local experiments, we want to move the code for preprocessing the data and training the model into scripts that we can check in to trigger the pipeline.

In [None]:
%%writefile ../algortihms/1-preprocessing/preprocess.py

"""Feature engineers the customer churn dataset."""
import logging
import os

import numpy as np
import pandas as pd

logger = logging.getLogger()
logger.setLevel(logging.INFO)
# The root logger has no handler by default; Python's last-resort handler only
# emits WARNING and above, so INFO messages would be dropped without this.
logger.addHandler(logging.StreamHandler())

# Root of the processing job's file layout and the folders used below it.
BASE_DIR = "/opt/ml/processing"
INPUT_DIR = os.path.join(BASE_DIR, "input")
OUTPUT_SUBDIRS = (
    "train",
    "validation",
    "test",
    "train_data_with_headers",
    "data_baseline_with_headers",
)

# Columns removed in addition to "Phone" before encoding.
CHARGE_COLUMNS = ["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"]


def find_input_file(input_dir):
    """Return the path of the first regular file found directly in ``input_dir``.

    Entries are visited in sorted order so the choice is deterministic even if
    more than one file is present. Sub-directories are skipped.

    Raises:
        FileNotFoundError: if ``input_dir`` contains no regular file.
    """
    for entry in sorted(os.listdir(input_dir)):
        candidate = os.path.join(input_dir, entry)
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError("No input file found in {}".format(input_dir))


def build_model_data(df):
    """Turn the raw churn frame into the model matrix.

    Drops "Phone" and the charge columns, treats "Area Code" as categorical,
    one-hot encodes all categorical columns and returns a frame whose first
    column is the binary target "Churn_True" followed by the features.
    The input frame is not modified.
    """
    df = df.drop(["Phone"], axis=1)

    # Object dtype makes get_dummies encode the area code as a category.
    df["Area Code"] = df["Area Code"].astype(object)

    df = df.drop(CHARGE_COLUMNS, axis=1)

    # dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which would be
    # written to the header-less CSVs as True/False instead of 1/0.
    encoded = pd.get_dummies(df, dtype=np.uint8)

    # Keep one binary target column, placed first, and drop both one-hot
    # target columns from the features.
    model_data = pd.concat(
        [
            encoded["Churn?_True."],
            encoded.drop(["Churn?_False.", "Churn?_True."], axis=1),
        ],
        axis=1,
    )
    return model_data.rename(columns={"Churn?_True.": "Churn_True"})


def split_dataset(model_data, train_frac=0.7, validation_end_frac=0.9, seed=1729):
    """Shuffle ``model_data`` and split it into train, validation and test frames.

    Rows up to ``train_frac`` go to train, rows up to ``validation_end_frac`` go
    to validation and the remainder goes to test (70/20/10 by default).
    Positional slicing is used instead of ``np.split`` on a DataFrame, which
    depends on the deprecated ``DataFrame.swapaxes``.

    Returns:
        tuple: ``(train_data, validation_data, test_data)``.
    """
    shuffled = model_data.sample(frac=1, random_state=seed)
    train_end = int(train_frac * len(model_data))
    validation_end = int(validation_end_frac * len(model_data))
    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:validation_end],
        shuffled.iloc[validation_end:],
    )


def main():
    """Read the input CSV, engineer features and write all output datasets."""
    logger.info("Starting preprocessing.")

    # We don't know the ordering of the folders/files in the input, but we
    # know there is only one file.
    input_data_path = find_input_file(INPUT_DIR)
    print(input_data_path)

    # Create every output folder independently; exist_ok avoids the previous
    # behaviour where one pre-existing folder skipped creation of the rest.
    for subdir in OUTPUT_SUBDIRS:
        os.makedirs(os.path.join(BASE_DIR, subdir), exist_ok=True)

    logger.info("Reading input data")
    df = pd.read_csv(input_data_path)

    model_data = build_model_data(df)
    train_data, validation_data, test_data = split_dataset(model_data)

    train_data.to_csv(
        os.path.join(BASE_DIR, "train", "train.csv"), header=False, index=False
    )
    train_data.to_csv(
        os.path.join(BASE_DIR, "train_data_with_headers", "train.csv"),
        header=True,
        index=False,
    )
    validation_data.to_csv(
        os.path.join(BASE_DIR, "validation", "validation.csv"),
        header=False,
        index=False,
    )
    test_data.to_csv(
        os.path.join(BASE_DIR, "test", "test.csv"), header=False, index=False
    )

    # Baseline dataset: the training features only (target removed), with headers.
    baseline_data = train_data.drop(["Churn_True"], axis=1)
    baseline_data.to_csv(
        os.path.join(BASE_DIR, "data_baseline_with_headers", "baseline.csv"),
        header=True,
        index=False,
    )


if __name__ == "__main__":
    main()


In [None]:
%%writefile ../algortihms/3-postprocessing/evaluate.py

"""Evaluation script for measuring model accuracy."""
import json
import logging
import os
import pathlib
import pickle
import tarfile
import tempfile

import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    roc_curve,
)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


if __name__ == "__main__":
    model_path = "/opt/ml/processing/model/model.tar.gz"

    # Unpack the archive into a scratch directory and load the model from that
    # same directory. Previously the archive was extracted to ".." while the
    # model was opened relative to the current directory, which only lines up
    # when the working directory is the filesystem root.
    with tempfile.TemporaryDirectory() as model_dir:
        with tarfile.open(model_path) as tar:
            # NOTE(review): extractall trusts the member paths in the archive;
            # only use it on model artifacts produced by our own training step.
            tar.extractall(path=model_dir)

        logger.debug("Loading xgboost model.")
        # NOTE(review): pickle.load can execute arbitrary code; the artifact
        # must come from a trusted source.
        with open(os.path.join(model_dir, "xgboost-model"), "rb") as model_file:
            model = pickle.load(model_file)

    logger.debug("Loading test input data.")
    test_path = "/opt/ml/processing/test/test.csv"
    df = pd.read_csv(test_path, header=None)

    # NOTE(review): the sample is drawn before the first column is removed, so
    # the payload still contains that column (used as the label below), and it
    # is unseeded, so a different row is picked on every run. Confirm that the
    # consumer of payload.csv expects this.
    sample_payload = df.sample()

    logger.debug("Reading test data.")
    # The first column holds the label; the remaining columns are the features.
    y_test = df.iloc[:, 0].to_numpy()
    features = df.iloc[:, 1:]
    X_test = xgboost.DMatrix(features.values)

    logger.info("Performing predictions against test data.")
    prediction_probabilities = model.predict(X_test)
    # Threshold the scores at 0.5 to get hard class predictions.
    predictions = np.round(prediction_probabilities)

    precision = float(precision_score(y_test, predictions))
    recall = float(recall_score(y_test, predictions))
    accuracy = float(accuracy_score(y_test, predictions))
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from
    # both the labels and the predictions, so the indexing below cannot fail.
    conf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
    fpr, tpr, _ = roc_curve(y_test, prediction_probabilities)

    # Logged at INFO: the logger level is INFO, so DEBUG records are discarded
    # and the metrics would otherwise never appear in the job logs.
    logger.info("Accuracy: {}".format(accuracy))
    logger.info("Precision: {}".format(precision))
    logger.info("Recall: {}".format(recall))
    logger.info("Confusion matrix: {}".format(conf_matrix))

    # Available metrics to add to model: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html
    report_dict = {
        "binary_classification_metrics": {
            "accuracy": {"value": accuracy, "standard_deviation": "NaN"},
            "precision": {"value": precision, "standard_deviation": "NaN"},
            "recall": {"value": recall, "standard_deviation": "NaN"},
            "confusion_matrix": {
                "0": {"0": int(conf_matrix[0][0]), "1": int(conf_matrix[0][1])},
                "1": {"0": int(conf_matrix[1][0]), "1": int(conf_matrix[1][1])},
            },
            "receiver_operating_characteristic_curve": {
                # tolist() converts numpy scalars to plain Python floats,
                # which the json module can always serialise.
                "false_positive_rates": fpr.tolist(),
                "true_positive_rates": tpr.tolist(),
            },
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    sample_dir = "/opt/ml/processing/sample"

    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(sample_dir).mkdir(parents=True, exist_ok=True)

    sample_payload.to_csv(f"{sample_dir}/payload.csv", header=False, index=False)

    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        json.dump(report_dict, f)
