In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import CoxPHFitter
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.inspection import permutation_importance
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder
from sksurv.util import Surv
from sksurv.metrics import as_cumulative_dynamic_auc_scorer
import scipy.stats as stats

In [None]:
# Single seeded generator handed to every stochastic step in this notebook
# (data split, forests, boosting, permutation importance). Because the same
# instance is shared, results depend on the cells being run in order.
rng = np.random.RandomState(seed=0)


In [None]:
# Load the member-level table (read relative to the notebook's working
# directory) and preview the first rows.
members_csv = "all_members_cleaned_short_form.csv"
df = pd.read_csv(members_csv)
df.head()


In [None]:
# The Cox PH preparation further down one-hot encodes "referral_source" and
# "gender" with pd.get_dummies(..., columns=[...]), which raises KeyError when
# the named columns are absent. Dropping them here made a top-to-bottom run
# fail, so this cell only verifies that both columns are available.
required_categoricals = ["referral_source", "gender"]
missing_categoricals = [col for col in required_categoricals if col not in df.columns]
assert not missing_categoricals, f"missing categorical columns: {missing_categoricals}"

For Cox PH analysis, we need to one-hot encode the categorical variables and min-max scale the
continuous variables. The lifelines documentation doesn't state whether variable scaling is handled
automatically, so we'll do it ourselves to be safe.

In [None]:
# One-hot encode the two categorical covariates; all other columns pass
# through unchanged. Note that `df` is rebound, so this cell cannot be re-run
# without reloading the data first.
categorical_columns = ["referral_source", "gender"]
df = pd.get_dummies(data=df, columns=categorical_columns)

# NOTE(review): dropping "neon_id" was disabled at some point; confirm whether
# that column exists and should be excluded from the Cox covariates.
#df.drop(columns=["neon_id"], inplace=True)

df.head()


In [None]:
# Penalised Cox proportional hazards model. Every column of `df` other than
# the duration and event columns enters the model as a covariate.
cox = CoxPHFitter(penalizer=0.1)
cox_fit_options = {
    "duration_col": "duration",
    "event_col": "membership_cancelled",
    "robust": True,  # ask lifelines for robust (sandwich) standard errors
}
cox.fit(df, **cox_fit_options)

In [None]:
# Tall canvas so each covariate's coefficient gets its own readable row.
fig = plt.figure(figsize=(12, 16))
ax = fig.add_subplot(1, 1, 1)

cox.plot(ax=ax)

In [None]:
# Print the fitted Cox model's summary table.
cox.print_summary()

In [None]:
# Reload the original table so the tree-based survival models below start
# from the untransformed columns rather than the frame prepared for Cox PH.
df = pd.read_csv('all_members_cleaned_short_form.csv')
# NOTE(review): one-hot encoding is disabled here; confirm that
# "referral_source" and "gender" are already numeric in the CSV.
#df = pd.get_dummies(df, columns=["referral_source", "gender"], drop_first=True)


In [None]:
# Columns that are either the survival target itself or deliberately excluded
# from the feature matrix.
# NOTE(review): "total_dollars_spent" and "num_classes_attended" presumably
# leak the outcome (they grow with membership length) — confirm.
non_feature_columns = [
    "membership_cancelled",
    "duration",
    "total_dollars_spent",
    "num_classes_attended",
]
X = df.drop(columns=non_feature_columns)

# Structured (event, time) array in the form scikit-survival estimators expect.
y = Surv.from_dataframe("membership_cancelled", "duration", df)

In [None]:
# 80/20 hold-out split, stratified on the event indicator so both partitions
# keep the same share of cancelled memberships.
event_indicator = y["membership_cancelled"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=rng,
    stratify=event_indicator,
)

In [None]:
# Evaluation time points for the time-dependent AUC: ten duration percentiles
# between the 5th and the 81st.
#
# np.unique both removes duplicate percentiles and returns them in ascending
# order. The previous set() round-trip deduplicated but left the order
# arbitrary, which misaligns per-time risk scores (and the AUC plot) with the
# sorted time grid that cumulative_dynamic_auc works on internally.
percentile_grid = np.linspace(5, 81, 10)
times = np.unique(
    np.asarray(np.percentile(y["duration"], percentile_grid), dtype="float64")
)
print(times)

In [None]:
# Baseline random survival forest with hand-picked hyper-parameters.
rsf_settings = {
    "n_estimators": 2000,
    "min_samples_split": 15,
    "min_samples_leaf": 13,
    "n_jobs": -1,
    "random_state": rng,
    "oob_score": True,
}
rsf = RandomSurvivalForest(**rsf_settings)
rsf.fit(X_train, y_train)

# Score of the fitted forest on the held-out partition.
rsf.score(X_test, y_test)

In [None]:
# Randomised hyper-parameter search for the random survival forest.
#
# The forest is wrapped in as_cumulative_dynamic_auc_scorer, so candidates are
# ranked by mean cumulative/dynamic AUC over `times`, and the search object's
# .score() returns that same metric. Parameter names therefore carry the
# "estimator__" prefix of the wrapper.
rsf_base = RandomSurvivalForest(random_state=rng, oob_score=True, n_jobs=-1)
param_distributions = {
    "estimator__n_estimators": [200, 1000, 2000, 3000],
    # Floats are interpreted as fractions of the training samples.
    "estimator__min_samples_split": stats.uniform(0, 0.5),
    "estimator__min_samples_leaf": stats.uniform(0, 0.5),
    "estimator__max_depth": [None, 1, 5],
}

rsf_grid = RandomizedSearchCV(
    as_cumulative_dynamic_auc_scorer(rsf_base, times=times),
    param_distributions=param_distributions,
    n_iter=50,
    # Seed the candidate sampling; without it every run draws a different
    # set of 50 configurations and the search is not reproducible.
    random_state=rng,
)

rsf_grid.fit(X_train, y_train)

# NOTE: despite the variable name, this is the mean time-dependent AUC of the
# refit best model on the hold-out set, not a concordance index.
cindex = rsf_grid.score(X_test, y_test)
print("Performance on test set", round(cindex, 3))

In [None]:
# Hyper-parameter combination selected by the randomised RSF search.
rsf_grid.best_params_

In [None]:
# Baseline gradient-boosted survival model built from depth-1 trees.
gbm_settings = {
    "n_estimators": 300,
    "learning_rate": 0.5,
    "max_depth": 1,
    "random_state": rng,
}
gbm = GradientBoostingSurvivalAnalysis(**gbm_settings)

# Cross-validated score on the training partition (mean and spread).
cross_val = cross_val_score(gbm, X_train, y_train)
cv_mean, cv_std = cross_val.mean(), cross_val.std()
print(cv_mean, cv_std)

# Refit on the full training partition and score on the hold-out set.
gbm.fit(X_train, y_train)
gbm.score(X_test, y_test)

In [None]:
# Randomised hyper-parameter search for the gradient-boosted survival model,
# scored by mean cumulative/dynamic AUC over `times`.
#
# The template estimator gets its own name instead of rebinding `gbm`: the
# search only fits clones, so reusing `gbm` here would replace the fitted
# baseline model with an unfitted one and break the permutation-importance
# cell that evaluates `gbm` later on.
gbm_base = GradientBoostingSurvivalAnalysis(random_state=rng)
param_distributions = {
    "estimator__n_estimators": [100, 500, 1000],
    "estimator__learning_rate": stats.loguniform(0.1, 1),
    "estimator__subsample": stats.uniform(0.5, 0.5),  # uniform on [0.5, 1.0]
    "estimator__max_depth": [1, 3, 5],
}

grid_search = RandomizedSearchCV(
    as_cumulative_dynamic_auc_scorer(gbm_base, times=times),
    param_distributions=param_distributions,
    n_iter=50,
    # Seed the candidate sampling so the search is reproducible.
    random_state=rng,
)

grid_search.fit(X_train, y_train)

# NOTE: despite the variable name, this is the mean time-dependent AUC of the
# refit best model on the hold-out set, not a concordance index.
cindex = grid_search.score(X_test, y_test)
print("Performance on test set", round(cindex, 3))

In [None]:
# Hyper-parameter combination selected by the randomised GBM search.
grid_search.best_params_

In [None]:
from sksurv.metrics import cumulative_dynamic_auc

# One cumulative-hazard step function per test member.
rsf_chf_funcs = rsf.predict_cumulative_hazard_function(X_test, return_array=False)

# Risk-score matrix: row i is test member i, column j is the cumulative hazard
# evaluated at times[j]. np.vstack replaces np.row_stack, which is a
# deprecated alias as of NumPy 2.0.
rsf_risk_scores = np.vstack([chf(times) for chf in rsf_chf_funcs])

# Per-time AUC values plus their summary mean; the training outcomes are
# passed so the metric can estimate the censoring distribution.
rsf_auc, rsf_mean_auc = cumulative_dynamic_auc(y_train, y_test, rsf_risk_scores, times)

In [None]:
# Time-dependent AUC of the baseline RSF at each evaluation time point.
fig, ax = plt.subplots()
ax.plot(times, rsf_auc, "o-", label=f"RSF (mean AUC = {rsf_mean_auc:.3f})")
ax.set_xlabel("Months since joining")
ax.set_ylabel("time-dependent AUC")
ax.legend(loc="lower center")
ax.grid(True)

In [None]:
# Risk score from the tuned GBM for every member (training rows included).
pred = grid_search.predict(X)

# assign() attaches the scores positionally and overwrites any existing
# "risk_score" column. The previous pd.concat(axis=1) appended a duplicate
# column each time the cell was re-run (which then breaks sorting by
# "risk_score") and relied on `df` having a default 0..n-1 index.
df = df.assign(risk_score=np.ravel(pred))
df.head()

In [None]:
# Members who have not cancelled, ordered from highest to lowest risk score.
still_active = df["membership_cancelled"].eq(False)
df.loc[still_active].sort_values(by="risk_score", ascending=False)

In [None]:
# Permutation importance of each feature for the baseline RSF, measured on the
# hold-out set with 15 shuffles per feature.
result = permutation_importance(rsf, X_test, y_test, n_repeats=15, random_state=rng)

rsf_importance_table = pd.DataFrame(
    {
        "importances_mean": result["importances_mean"],
        "importances_std": result["importances_std"],
    },
    index=X_test.columns,
)
rsf_importance_table.sort_values(by="importances_mean", ascending=False)

In [None]:
# Permutation importance of each feature for the gradient-boosted model on the
# hold-out set (15 shuffles per feature). Requires `gbm` to be a fitted
# estimator at this point.
result = permutation_importance(gbm, X_test, y_test, n_repeats=15, random_state=rng)

summary_keys = ("importances_mean", "importances_std")
gbm_importance_table = pd.DataFrame(
    dict((key, result[key]) for key in summary_keys),
    index=X_test.columns,
)
gbm_importance_table.sort_values(by="importances_mean", ascending=False)