In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from lifelines import CoxPHFitter
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis
from sksurv.metrics import as_cumulative_dynamic_auc_scorer
from sksurv.metrics import cumulative_dynamic_auc
#from sksurv.preprocessing import OneHotEncoder
from sksurv.util import Surv

In [None]:
# One seeded generator, passed as `random_state` to the split, the estimators and the
# permutation-importance runs further down.
SEED = 0
rng = np.random.RandomState(SEED)


In [None]:
# Load the member table; the path is relative to the notebook's working directory.
df = pd.read_csv('all_members_short_form.csv')

# Columns handed to the ColumnTransformer's 'drop' step.
drop_cols = [
    "num_classes_attended",
    "total_dollars_spent",
    "neon_id",
    "first_name",
    "last_name",
    "email",
]

# Columns routed through the numeric pipeline.
num_cols = [
    "num_classes_before_joining",
    "time_from_asmbly",
    "age",
]

# Fixed category levels for each one-hot encoded column.
gender_cats = [
    "Male",
    "Female",
    "Non-binary",
    "Other",
    "Prefer not to answer",
]
referral_cats = [
    "Google",
    "Facebook",
    "Instagram",
    "Friend/Coworker",
    "MeetUp",
    "Asmbly Maker Market",
    "Texas Woodworking Festival",
    "Other",
]

# (column name, allowed levels) pairs; order defines the encoder's column order.
categories = [
    ('gender', gender_cats),
    ('referral_source', referral_cats),
]

# Unzip the pairs into parallel lists: column names and their level lists.
ohe_columns, ohe_cats = (list(part) for part in zip(*categories))

In [None]:
def attr_label_split(df: pd.DataFrame):
    """Separate model features from the survival target.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the ``membership_cancelled`` (event indicator) and
        ``duration`` (time) columns alongside the feature columns.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the two label columns and ``y``
        is the result of ``Surv.from_dataframe`` built from those two columns.
    """
    label_cols = ["membership_cancelled", "duration"]
    features = df.drop(columns=label_cols)
    target = Surv.from_dataframe('membership_cancelled', 'duration', df[label_cols])
    return features, target

In [None]:
# Split the raw frame into features and the survival target.
X, y = attr_label_split(df)

# Numeric columns: fill gaps with the column median (the RobustScaler step is currently disabled).
num_transforms = [('impute', SimpleImputer(strategy='median'))]
num_pipeline = Pipeline(steps=num_transforms)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode against
# the fixed level lists; handle_unknown='ignore' keeps unseen levels from raising.
most_frequent_imputer = SimpleImputer(strategy='most_frequent')
one_hot = OneHotEncoder(categories=ohe_cats, handle_unknown='ignore')
cat_transforms = [
    ('impute', most_frequent_imputer),
    ('encoder', one_hot),
]
cat_pipeline = Pipeline(steps=cat_transforms)

# Route each column group to its pipeline; identifier/leaky columns are dropped and any
# column not named here is passed through unchanged.
all_transforms = [
    ('numeric', num_pipeline, num_cols),
    ('categorical', cat_pipeline, ohe_columns),
    ('drops', 'drop', drop_cols),
]
full_transform_pipeline = ColumnTransformer(transformers=all_transforms, remainder='passthrough')

# NOTE(review): the transformer is fitted on every row before the train/test split, so
# the imputation statistics also see the rows that later form the test set.
X_transformed = full_transform_pipeline.fit_transform(X)

In [None]:
# Hold out 20% of the rows, stratified on the cancellation indicator so both parts keep
# the same cancelled/active ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.20,
    random_state=rng,
    stratify=y["membership_cancelled"],
)

In [None]:
# Evaluation grid for the time-dependent AUC: ten percentiles (5th through 81st) of the
# observed durations.
# np.unique drops duplicate percentiles AND returns them in ascending order. A plain
# set() also de-duplicates but iterates in arbitrary order, which misaligns anything
# that pairs `times` with per-time results by position (risk-score columns, plot x-values).
times = np.unique(np.percentile(y["duration"], np.linspace(5, 81, 10))).astype("float64")
print(times)

In [None]:
# Baseline random survival forest with hand-picked hyperparameters.
rsf = RandomSurvivalForest(
    n_estimators=2000,
    min_samples_split=15,
    min_samples_leaf=13,
    n_jobs=-1,
    random_state=rng,
    oob_score=True,
)

# Fit on the training split only: later cells evaluate `rsf` on X_test (time-dependent AUC
# and permutation importance), which is only meaningful if the forest never saw those rows.
rsf.fit(X_train, y_train)

# Held-out score on the test split (displayed as the cell output).
rsf.score(X_test, y_test)

In [None]:
# Randomized hyperparameter search for the random survival forest, ranked by the
# cumulative/dynamic AUC scorer evaluated at `times`.
rsf_base = RandomSurvivalForest(random_state=rng, oob_score=True, n_jobs=-1)

# The "estimator__" prefix routes each parameter through the AUC-scorer wrapper to the forest.
param_distributions = {
    "estimator__n_estimators": [200, 1000, 2000, 3000],
    # Continuous draws in [0, 0.5]; float values are treated as fractions of the samples.
    "estimator__min_samples_split": stats.uniform(0, 0.5),
    "estimator__min_samples_leaf": stats.uniform(0, 0.5),
    "estimator__max_depth": [None, 1, 5],
}

rsf_grid = RandomizedSearchCV(
    as_cumulative_dynamic_auc_scorer(rsf_base, times=times),
    param_distributions=param_distributions,
    n_iter=50,
    # Seed the parameter sampling like the other stochastic steps in this notebook;
    # without it the 50 sampled candidates differ on every run.
    random_state=rng,
)

rsf_grid.fit(X_train, y_train)

# The wrapped estimator scores with the time-dependent AUC, not a concordance index.
rsf_test_auc = rsf_grid.score(X_test, y_test)
print("Performance on test set", round(rsf_test_auc, 3))

In [None]:
# Hyperparameter combination selected by the randomized search fitted above.
rsf_grid.best_params_

In [None]:
# Gradient-boosted survival model with fixed hyperparameters and depth-1 trees.
gbm_settings = dict(n_estimators=300, learning_rate=0.5, max_depth=1, random_state=rng)
gbm = GradientBoostingSurvivalAnalysis(**gbm_settings)

# Cross-validated score over the full transformed data (default folds, estimator's own scorer).
cross_val = cross_val_score(gbm, X_transformed, y)
cv_mean, cv_std = cross_val.mean(), cross_val.std()
print(cv_mean, cv_std)

# Refit on the training split and show the held-out score as the cell output.
gbm.fit(X_train, y_train)
gbm.score(X_test, y_test)

In [None]:
# Exhaustive grid search over the boosting hyperparameters, scored by the
# cumulative/dynamic AUC at `times`.
gbm = GradientBoostingSurvivalAnalysis(max_depth=1, random_state=rng)
gbm_auc_estimator = as_cumulative_dynamic_auc_scorer(gbm, times=times)

# The "estimator__" prefix addresses parameters of the model inside the scorer wrapper.
param_grid = {
    "estimator__n_estimators": [100, 300, 500, 1000],
    "estimator__learning_rate": [0.1, 0.5, 1],
    "estimator__subsample": [0.5, 0.75, 1],
}

grid_search = GridSearchCV(estimator=gbm_auc_estimator, param_grid=param_grid)
grid_search.fit(X_train, y_train)

# Score of the refitted best candidate on the held-out split.
auc = grid_search.score(X_test, y_test)
print("Performance on test set (AUC)", round(auc, 3))

In [None]:
# Hyperparameter combination selected by the grid search fitted above.
grid_search.best_params_

In [None]:
# Time-dependent AUC of the random survival forest on the held-out split.
# cumulative_dynamic_auc evaluates at the sorted, de-duplicated time points, so the
# risk-score columns are generated on exactly that grid to stay aligned with it.
eval_times = np.unique(times)

# One step function per test sample (return_array=False yields callables).
rsf_chf_funcs = rsf.predict_cumulative_hazard_function(X_test, return_array=False)

# Rows are test samples, columns are evaluation times. np.vstack replaces the
# np.row_stack alias, which is deprecated as of NumPy 2.0.
rsf_risk_scores = np.vstack([chf(eval_times) for chf in rsf_chf_funcs])

# Per-time AUC values plus their mean; y_train is passed as the reference survival data.
rsf_auc, rsf_mean_auc = cumulative_dynamic_auc(y_train, y_test, rsf_risk_scores, eval_times)

In [None]:
# AUC of the random survival forest at each evaluation time, drawn on the current axes.
ax = plt.gca()
ax.plot(times, rsf_auc, "o-", label=f"RSF (mean AUC = {rsf_mean_auc:.3f})")
ax.set_xlabel("Months since joining")
ax.set_ylabel("time-dependent AUC")
ax.legend(loc="lower center")
ax.grid(True)

In [None]:
# Score every member with the fitted grid search (its refitted best model).
pred = grid_search.predict(X_transformed)

# Attach the scores as a new column; the concat aligns the two frames on their index.
risk_scores = pd.DataFrame(pred, columns=["risk_score"])
new_df = pd.concat([df, risk_scores], axis=1)

# Keep members whose membership is not cancelled, highest predicted risk first.
still_active = new_df["membership_cancelled"] == False
risk_df = new_df.loc[still_active].sort_values(by="risk_score", ascending=False)

risk_df.head(20)

In [None]:
# Permutation importance of each transformed feature for the random survival forest.
result = permutation_importance(rsf, X_test, y_test, n_repeats=15, random_state=rng)

# X_test is a slice of the ColumnTransformer output, which is an array rather than a
# DataFrame (no pandas output is configured), so it has no `.columns`. The feature
# labels come from the fitted transformer instead.
pd.DataFrame(
    {
        k: result[k]
        for k in (
            "importances_mean",
            "importances_std",
        )
    },
    index=full_transform_pipeline.get_feature_names_out(),
).sort_values(by="importances_mean", ascending=False)

In [None]:
# Names of the transformed feature columns reported by the fitted ColumnTransformer.
full_transform_pipeline.get_feature_names_out()

In [None]:
# Permutation importance of each transformed feature for the fitted gradient-boosting grid search.
result = permutation_importance(grid_search, X_test, y_test, n_repeats=15, random_state=rng)

# Label rows with the transformer's output feature names and rank by mean importance.
feature_names = full_transform_pipeline.get_feature_names_out()
importance_table = pd.DataFrame(
    {
        "importances_mean": result["importances_mean"],
        "importances_std": result["importances_std"],
    },
    index=feature_names,
)
importance_table.sort_values(by="importances_mean", ascending=False)

In [None]:
# Persist the fitted preprocessing transformer and the fitted grid search. Note that
# 'gbm_model.pkl' holds the whole GridSearchCV object, not only its best estimator.
artifacts = {
    'transform_pipeline.pkl': full_transform_pipeline,
    'gbm_model.pkl': grid_search,
}
for path, artifact in artifacts.items():
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# Reload both artifacts from disk. Unpickling can execute arbitrary code, so only load
# files from a trusted source.
new_pipeline = _load_pickle('transform_pipeline.pkl')
new_model = _load_pickle('gbm_model.pkl')

In [None]:
# Re-score all members using the transformer and model reloaded from disk.
new_x = new_pipeline.transform(X)
new_preds = new_model.predict(new_x)

# Attach the scores as a new column; the concat aligns the two frames on their index.
reloaded_scores = pd.DataFrame(new_preds, columns=["risk_score"])
new_df = pd.concat([df, reloaded_scores], axis=1)

# Keep members whose membership is not cancelled, highest predicted risk first.
is_active = new_df["membership_cancelled"] == False
new_risk_df = new_df.loc[is_active].sort_values(by="risk_score", ascending=False)

new_risk_df.head(20)

In [None]:
# Rank every member (cancelled or not) by the reloaded model's risk score.
scored_members = pd.concat([df, pd.DataFrame(new_preds, columns=["risk_score"])], axis=1)
full_risk_df = scored_members.sort_values(by="risk_score", ascending=False)

# Export the ranked table; with default settings the row index is written as the first column.
full_risk_df.to_csv('asmbly_churn_risk.csv')

full_risk_df.head(10)