In [55]:
import pandas as pd
import numpy as np

# Standard ML import
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
import optuna

# Survival Analysis tools

from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.metrics import concordance_index_censored,cumulative_dynamic_auc,concordance_index_ipcw
from sksurv.datasets import get_x_y
from lifelines import CoxPHFitter
from survlimepy import SurvLimeExplainer

# Interpretability tools

import shap
import lime


# Read Processing

In [74]:
# Dataset Insurance

data_ins = pd.read_csv("X_train.csv")
ins_credit = pd.read_csv("y_train.csv")

data_ins.rename(columns = {'0':'age','1':'sex', '2':'smoker', '3':'pren_prod', '4':'pren_comp', '5':'point_sales', '6': 'product_type', '7': 'dist_channel', '8': 'pay_freq', '9': 'pay_method', '10':'profession'}, inplace = True)
#data_ins.info()

# Categorical columns
colonnes_categorielles = ['sex', 'smoker', 'point_sales', 'product_type', 'dist_channel', 'pay_freq', 'pay_method', 'profession']
encoder = OneHotEncoder()
encoder.fit(data_ins[colonnes_categorielles])

ins_encodees = encoder.fit_transform(data_ins[colonnes_categorielles]).toarray()
nouveaux_noms_colonnes = encoder.get_feature_names_out(colonnes_categorielles)
ins_encodees_df = pd.DataFrame(ins_encodees, columns=nouveaux_noms_colonnes)
ins_features = data_ins.drop(columns=colonnes_categorielles).join(ins_encodees_df)

df = pd.concat([ins_features, ins_credit], axis=1).drop(columns=['Unnamed: 0'])

df.head()

Unnamed: 0,age,pren_prod,pren_comp,sex_0,sex_1,smoker_0,smoker_1,point_sales_0,point_sales_1,point_sales_2,...,pay_freq_2,pay_freq_3,pay_method_0,pay_method_1,pay_method_2,profession_0,profession_1,profession_2,evento,time
0,40,780.0,1.88,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,False,609.0
1,43,52.78,16.88,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,True,757.0
2,52,63.5,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,True,672.0
3,25,19.1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,True,407.0
4,51,351.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,True,632.0


In [75]:
# Data split

X,y = get_x_y(df,attr_labels=['evento','time'],pos_label=1,survival=True)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Cox model

In [76]:
# Create Cox model
cox_model = CoxPHFitter()

# Fit the model, for each categorical feature we drop the last category to avoid multicollinearity
cox_model.fit(df.drop(columns=['sex_1','smoker_1','point_sales_4','product_type_10','dist_channel_4','pay_freq_3','pay_method_2','profession_2']), duration_col='time', event_col='evento')

# Display the summary of the model
print(cox_model.summary)

Column pay_freq_2 have very low variance when conditioned on death event present or not. This may harm convergence. This could be a form of 'complete separation'. For example, try the following code:

>>> events = df['evento'].astype(bool)
>>> print(df.loc[events, 'pay_freq_2'].var())
>>> print(df.loc[~events, 'pay_freq_2'].var())

A very low variance means that the column pay_freq_2 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.

Newton-Raphson convergence completed successfully but norm(delta) is still high, 0.833. This may imply non-unique solutions to the maximum likelihood. Perhaps there is collinearity or complete separation in the dataset?



                     coef      exp(coef)     se(coef)  coef lower 95%  \
covariate                                                               
age             -0.016703       0.983436     0.001707       -0.020048   
pren_prod       -0.000067       0.999933     0.000040       -0.000146   
pren_comp        0.000019       1.000019     0.000035       -0.000049   
sex_0            0.051223       1.052558     0.030154       -0.007878   
smoker_0         0.123510       1.131462     0.059156        0.007567   
point_sales_0   -0.081251       0.921963     0.149570       -0.374402   
point_sales_1    0.225333       1.252740     0.148428       -0.065580   
point_sales_2    0.123760       1.131744     0.155332       -0.180685   
point_sales_3    0.136406       1.146147     0.157428       -0.172147   
product_type_0  13.160883  519635.816486   248.101464     -473.109051   
product_type_1  13.015335  449250.067323   248.101470     -473.254610   
product_type_2  12.810163  365917.412999   248.1014

# Survival SVM

In [77]:
# Create and fit the survival svm

SSVM = FastKernelSurvivalSVM(kernel = 'poly',degree = 5,random_state=42)
SSVM.fit(X_train,y_train)

In [78]:
# define the concordance index

def concordance_censored(estimator,X,y):
    concordance = concordance_index_censored([elt[0] for elt in y],[elt[1] for elt in y],estimator.predict(X))
    return concordance[0]

print(f"c_index : {concordance_censored(SSVM,X_test,y_test)}")

# define the concordance index ipcw

concordance_ipcw = concordance_index_ipcw(y_train,y_test,SSVM.predict(X_test))
print(f"c_index_ipcw : {concordance_ipcw[0]}")

# compute the auc dynamic score

auc_dynamic = cumulative_dynamic_auc(y_train,y_test,SSVM.predict(X_test),times = np.arange(1, 2059, 30))
print(f"auc_dynamic : {auc_dynamic[1]}")

c_index : 0.5159484169877243
c_index_ipcw : 0.5141468954482205
auc_dynamic : 0.51029445057416


In [79]:
# Hyperparameters optimization with optuna

# We use the concordance index as evaluation metric
def scorer(estimator, X, y):
    concordance =  concordance_index_censored([elt[0] for elt in y],[elt[1] for elt in y],estimator.predict(X))[0]
    return concordance

def objective(trial):
    # Define search space for hyperparameters
    gamma = trial.suggest_float('gamma', 1e-5, 1e3,log=True)
    kernel = trial.suggest_categorical('kernel', ['linear','poly'])
    degree = trial.suggest_int('degree', 2, 5)

    
    # Initialize model with hyperparameters
    model = FastKernelSurvivalSVM(random_state=42, max_iter = 1000, gamma = gamma, kernel = kernel, degree = degree)
    
    def k_fold_cross_validation(model, X, y, k=5):
        """
        Performs k-fold cross-validation for a given model and dataset.

        Parameters:
            model: The machine learning model to evaluate.
            X (numpy.ndarray): The feature matrix.
            y (numpy.ndarray): The target vector.
            k (int): Number of folds for cross-validation.

        Returns:
            float: The average accuracy across all folds.
        """
        n = len(X)
        fold_size = n // k
        scores = []

        for i in range(k):
            # Splitting data into training and validation sets
            validation_X = X[i * fold_size: (i + 1) * fold_size]
            validation_y = y[i * fold_size: (i + 1) * fold_size]
            train_X = np.concatenate([X[:i * fold_size], X[(i + 1) * fold_size:]])
            train_y = np.concatenate([y[:i * fold_size], y[(i + 1) * fold_size:]])

            # Fitting the model
            model.fit(train_X, train_y)

            # Making predictions on the validation set
            y_pred = model.predict(validation_X)

            # Calculating accuracy
            score = scorer(model, validation_X, validation_y)
            scores.append(score)

        # Returning the average accuracy
        return sum(scores) / k
    
    return k_fold_cross_validation(model, X_train, y_train, k=5)

# Create Optuna study object
study = optuna.create_study(direction='maximize')

# Run optimization
study.optimize(objective, n_trials=5)

# Access best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

[I 2024-04-11 14:43:21,380] A new study created in memory with name: no-name-5d1bb18a-623e-46a3-b2e3-20a989ea4ab9


X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
X has feature names, but FastKernelSurvivalSVM was fitted without feature names
[I 2024-04-11 14:43:40,078] Trial 0 finished with value: 0.5052577019700714 and parameters: {'gamma': 0.034606406277004655, 'kernel': 'poly', 'degree': 5}. Best is trial 0 with value: 0.50525770197007

Best Hyperparameters: {'gamma': 0.0014422636839047272, 'kernel': 'linear', 'degree': 3}


In [80]:
# Best ssurvival svm model

best_SSVM = FastKernelSurvivalSVM(random_state=42, max_iter = 1000, **best_params)
best_SSVM.fit(X_train, y_train)

# Compute the concordance index

print(f"c_index : {concordance_censored(best_SSVM,X_test,y_test)}")

c_index : 0.5516833661492369


# LIME

In [81]:
def generate_perturbations(instance, num_samples, num_features):
    """
    Generates perturbed instances by randomly perturbing features of the original instance.

    Parameters:
        instance (numpy.ndarray): The original instance.
        num_samples (int): Number of perturbed instances to generate.
        num_features (int): Number of features in the instance.

    Returns:
        numpy.ndarray: Perturbed instances.
    """
    perturbations = []
    for _ in range(num_samples):
        perturbation = np.copy(instance)
        for i in range(num_features):
            perturbation[i] += np.random.normal()
        perturbations.append(perturbation)
    return perturbations


def lime_explanation(instance, model, num_samples=1000):
    """
    Generates LIME explanation for a given instance.

    Parameters:
        instance (numpy.ndarray): The instance to be explained.
        model: The black-box model whose predictions are to be explained.
        num_samples (int): Number of perturbed instances to generate for LIME.

    Returns:
        numpy.ndarray: Feature importances estimated by LIME.
    """
    num_features = len(instance)
    perturbations = generate_perturbations(instance, num_samples, num_features)

    predictions = model.predict(perturbations)
    local_model = LinearRegression().fit(perturbations, predictions)
    return local_model.coef_



In [82]:
# converts datasets into numpy arrays

X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()


In [83]:
# Explain the first instance in the test set
lime_importance = lime_explanation(X_train_np[0], best_SSVM, num_samples=1000)
print("LIME feature importances:", lime_importance)

LIME feature importances: [-1.19832443e-06 -6.16639673e-05 -3.19665196e-05 -8.51678193e-09
 -1.87805284e-08 -2.08926786e-09 -2.52080425e-08 -1.45275528e-08
 -8.94766392e-09 -2.55244055e-09 -1.21990378e-09 -4.97493085e-11
 -1.62466246e-09 -3.83103967e-09 -1.73431797e-09 -1.41253515e-09
 -1.71964486e-10 -1.01309703e-10 -3.93878676e-11 -7.70028572e-09
 -9.17166548e-09 -9.20728714e-10 -5.89413121e-10 -7.67939935e-10
 -2.08981619e-08 -5.19501713e-09 -3.41084480e-10 -9.51069316e-11
 -5.58154482e-09 -7.57398577e-09 -4.77220361e-13 -1.41413025e-08
 -1.30115232e-08 -1.42819190e-08 -3.86810381e-12 -6.78497254e-10
 -2.28633375e-08 -3.75547563e-09]


X does not have valid feature names, but FastKernelSurvivalSVM was fitted with feature names


In [85]:
# Create a dictionnary to map the feature names to the feature importances
feature_importances = dict(zip(X.columns, lime_importance))

# Sort the feature importances
sorted_feature_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)

# Normalize the feature importances
total_importance = sum(abs(elt[1]) for elt in sorted_feature_importances)
normalized_feature_importances = [(elt[0], abs(elt[1]) / total_importance) for elt in sorted_feature_importances]

# Display the normalized feature importances
for feature, importance in normalized_feature_importances:
    print(f"{feature}: {importance}")

pay_freq_2: 5.0208781708311424e-09
pay_method_2: 4.0696666752880527e-08
product_type_6: 4.144032843479347e-07
point_sales_4: 5.234169324587764e-07
dist_channel_4: 1.0006285488000512e-06
product_type_5: 1.0658884597851136e-06
product_type_4: 1.809253765020148e-06
dist_channel_3: 3.5885803775860796e-06
product_type_10: 6.201268258186057e-06
profession_0: 7.138530398447996e-06
dist_channel_0: 8.079564864812326e-06
product_type_9: 9.687069289320794e-06
point_sales_3: 1.2834716960424274e-05
product_type_3: 1.4861408846543925e-05
product_type_0: 1.7093219329395266e-05
product_type_2: 1.8246914735336828e-05
smoker_0: 2.1981374333886725e-05
point_sales_2: 2.685445568286959e-05
profession_2: 3.95116956158662e-05
product_type_1: 4.030671164513502e-05
dist_channel_2: 5.465724063754094e-05
pay_freq_0: 5.872393313204079e-05
pay_freq_1: 7.968658288025819e-05
product_type_7: 8.10153959291781e-05
sex_0: 8.960582573539726e-05
point_sales_1: 9.413917381542535e-05
product_type_8: 9.649591426078858e-05
pa