# A Basic Model As A Starting Point
<!--- @wandbcode{decisionopt-nb4a} -->

Model building isn't our focus, so I won't go into great depth on this. But it gives a model to use

In [1]:
# %pip install shap -qqq

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import wandb
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.compose import make_column_selector as selector
from wandb.xgboost import WandbCallback
from xgboost import XGBClassifier

os.environ["WANDB_NOTEBOOK_NAME"] = "ad_hoc_adjustments.ipynb"
plt.style.use('fivethirtyeight')

In [3]:
# We will load dataset from wandb Artifact
with wandb.init(project="ad_hoc_adjustments") as run:
    artifact = run.use_artifact('wandb_course/decision_opt/telco-customer-churn:latest', type='dataset')
    artifact_dir = artifact.download()
    path = Path(artifact_dir)

[34m[1mwandb[0m: Currently logged in as: [33mnisargvp[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [4]:
data = pd.read_csv(path/"WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
run = wandb.init(project="ad_hoc_adjustments")

target = "Churn"
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(target, axis=1), data[target] == "Yes", test_size=0.2, random_state=0
)
cols_to_use = [
    "tenure",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
]

preprocessor = ColumnTransformer(
    transformers=[("one_hot", OneHotEncoder(), selector(dtype_include="object"))],
    remainder="passthrough",  # Leave numerical variables unchanged
)

# Create pipeline
pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", XGBClassifier())])
pipeline.fit(X_train[cols_to_use], y_train)
y_pred = pipeline.predict_proba(X_test[cols_to_use])[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
log_loss_val = log_loss(y_test, y_pred)

In [7]:
X_train[cols_to_use].head()

Unnamed: 0,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges
2920,72,Yes,No,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Mailed check,85.1
2966,14,No,No phone service,DSL,No,No,No,No,Yes,Yes,Month-to-month,Yes,Electronic check,46.35
6099,71,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),24.7
5482,33,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,73.9
2012,47,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,No,Month-to-month,Yes,Electronic check,98.75


In [12]:
def baseline_prediction(data):
    return pipeline.predict_proba(data[cols_to_use])[:, 1]

def prediction_adjust_DSL(data):
    baseline = baseline_prediction(data)
    has_DSL = data["InternetService"] == "DSL"
    out = (baseline + has_DSL * 0.1) #.clip(0, 1)
    return out

prediction_adjust_DSL(X_test) - baseline_prediction(X_test)

2200    0.1
4627    0.0
3225    0.1
2828    0.0
3768    0.0
       ... 
2631    0.0
5333    0.0
6972    0.0
4598    0.1
3065    0.1
Name: InternetService, Length: 1409, dtype: float64

In [13]:
import shap

encoded_value = pipeline.named_steps["preprocessor"].transform(X_train[cols_to_use])
explainer = shap.TreeExplainer(pipeline.named_steps["classifier"], data=encoded_value)

In [17]:
def prediction_with_less_effect_for_tenure(data):
    tenure_column = 0
    effect_reduction_size = 0.5

    prediction = baseline_prediction(data)
    encoded_data = pipeline.named_steps["preprocessor"].transform(data[cols_to_use])
    shap_values = explainer.shap_values(encoded_data)
    effect_for_tenure = shap_values[:, tenure_column]
    adjusted_predictions = prediction - effect_for_tenure * effect_reduction_size
    return adjusted_predictions

prediction_with_less_effect_for_tenure(X_test) - baseline_prediction(X_test)

array([0.02530413, 0.0081968 , 0.01990088, ..., 0.00852816, 0.02017519,
       0.02269489])

In [18]:
run.finish()

VBox(children=(Label(value='0.001 MB of 0.007 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.175217…