In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the data dictionary plus the train and test tables from the working
# directory; the files are read in exactly this order.
dic, train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('data_dictionary.csv', 'train.csv', 'test.csv')
)

In [None]:
class FeatureTransformer:
    """Fit-once / apply-many preprocessing for the transplant table.

    ``fit`` learns from one frame:

    * the integer code of every ``race_group`` value,
    * the earliest ``year_hct`` (used to build ``years_since_first_hct``),
    * the median of every numeric feature (for imputation),
    * the mean / variance of every *imputed* numeric feature (for scaling),
    * the category levels of every column in ``dummy_features``.

    ``transform`` applies those learned values to any frame that has the same
    feature columns. Columns that are in neither ``dummy_features`` nor
    ``numeric_features`` (``ID``, ``efs``, ``efs_time`` ...) are passed through;
    ``efs`` is cast to ``int`` when it is present.

    The survival time ``efs_time`` is deliberately NOT part of
    ``numeric_features``: it is a target, standardising it produces negative
    durations, and requiring it would make ``transform`` fail on unlabelled
    data.
    """

    # Columns derived by _add_engineered_columns; they are imputed and scaled
    # together with the other numeric features.
    ENGINEERED_FEATURES = ['age_difference', 'years_since_first_hct']

    def __init__(self):
        # Categorical columns: missing values become the level 'Missing', then
        # every column is one-hot encoded.
        self.dummy_features = [
            'dri_score', 'psych_disturb', 'diabetes', 'tbi_status', 'arrhythmia',
            'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct',
            'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'ethnicity',
            'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
            'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
            'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
            'cardiac', 'pulm_moderate'
        ]

        # Numeric model inputs (median-imputed, then standardised). 'year_hct'
        # is swapped for the engineered columns the first time fit() runs.
        self.numeric_features = [
            'cyto_score', 'hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6',
            'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6',
            'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low',
            'cyto_score_detail', 'conditioning_intensity', 'year_hct', 'hla_match_a_high',
            'hla_match_b_low', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score',
            'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10',
            'donor_age', 'age_at_hct'
        ]

        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        self.numeric_imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()

        # Ordinal encodings. Labels mapped to NaN, and labels absent from a
        # dict, end up missing and are filled by the median imputer.
        self.cyto_score_rank = {'Favorable': 0, 'Intermediate': 1, 'Poor': 2, 'Other': 1.5, 'TBD': np.nan, 'Not tested': np.nan}
        self.cyto_score_detail_rank = {'Favorable': 0, 'Intermediate': 1, 'Poor': 2, 'TBD': np.nan, 'Not tested': np.nan}
        self.conditioning_intensity_rank = {'NMA': 0, 'RIC': 1, 'MAC': 2, 'TBD': np.nan, 'No drugs reported': np.nan, 'N/A, F(pre-TED) not submitted': np.nan}

        # Learned by fit(); None marks the transformer as not fitted yet.
        self.earliest_year = None
        self.race_group_mapping = {}

    def encode_race_group(self, df):
        """Assign each distinct ``race_group`` value an integer code.

        Codes follow the order of first appearance in ``df`` and are stored in
        ``self.race_group_mapping``.
        """
        race_groups = df['race_group'].unique()
        self.race_group_mapping = {race: i for i, race in enumerate(race_groups)}

    def _add_engineered_columns(self, df):
        """Return a copy of ``df`` with ordinal encodings and derived columns.

        Shared by fit() and transform() so both see identical inputs.
        Requires ``self.earliest_year`` to be set.
        """
        out = df.copy()
        out['cyto_score'] = out['cyto_score'].map(self.cyto_score_rank)
        out['cyto_score_detail'] = out['cyto_score_detail'].map(self.cyto_score_detail_rank)
        out['conditioning_intensity'] = out['conditioning_intensity'].map(self.conditioning_intensity_rank)
        out['age_difference'] = out['donor_age'] - out['age_at_hct']
        out['years_since_first_hct'] = out['year_hct'] - self.earliest_year
        return out

    def _numeric_block(self, df):
        """Return the numeric feature columns, with unparseable values as NaN."""
        return df[self.numeric_features].apply(pd.to_numeric, errors='coerce')

    def fit(self, df):
        """Learn all preprocessing statistics from ``df``.

        Safe to call more than once: each call relearns everything and leaves
        ``numeric_features`` free of duplicates.

        Returns:
            self, so ``FeatureTransformer().fit(df)`` can be chained.
        """
        self.encode_race_group(df)
        self.earliest_year = df['year_hct'].min()

        # Replace raw 'year_hct' by the engineered columns. Filtering out the
        # engineered names first keeps a repeated fit() from appending them again.
        engineered = self.ENGINEERED_FEATURES
        self.numeric_features = [
            f for f in self.numeric_features
            if f != 'year_hct' and f not in engineered
        ] + list(engineered)

        prepared = self._add_engineered_columns(df)

        # Fit the scaler on the imputed values, i.e. on exactly what
        # transform() will feed it.
        imputed_numeric = self.numeric_imputer.fit_transform(self._numeric_block(prepared))
        self.scaler.fit(imputed_numeric)

        # The encoder learns its levels from the imputed categoricals so that
        # 'Missing' is a known category.
        imputed_categorical = self.imputer.fit_transform(prepared[self.dummy_features])
        self.encoder.fit(imputed_categorical)
        return self

    def transform(self, df):
        """Apply the fitted preprocessing to ``df`` and return a new frame.

        The result keeps ``df``'s index, drops ``year_hct`` and the raw
        categorical columns, appends the one-hot columns and finally adds
        ``race_group`` back as the integer code learned by fit() (values not
        seen during fit become NaN).

        Raises:
            RuntimeError: if called before fit().
        """
        if self.earliest_year is None:
            raise RuntimeError("FeatureTransformer.transform() called before fit()")

        df_transformed = self._add_engineered_columns(df).drop(columns=['year_hct'])

        # Numeric features: coerce -> median-impute -> standardise.
        imputed_numeric = self.numeric_imputer.transform(self._numeric_block(df_transformed))
        df_transformed[self.numeric_features] = self.scaler.transform(imputed_numeric)

        # Categorical features: fill with 'Missing' -> one-hot encode.
        imputed_categorical = self.imputer.transform(df_transformed[self.dummy_features])
        dummy_df = pd.DataFrame(
            self.encoder.transform(imputed_categorical),
            columns=self.encoder.get_feature_names_out(self.dummy_features),
            index=df_transformed.index,
        )
        df_transformed = pd.concat(
            [df_transformed.drop(columns=self.dummy_features), dummy_df], axis=1
        )

        # Handle 'efs' separately as it's binary
        if 'efs' in df_transformed.columns:
            df_transformed['efs'] = df_transformed['efs'].astype(int)

        # Re-add race_group as one integer column next to its one-hot columns,
        # so downstream code can still group rows by race.
        df_transformed['race_group'] = df['race_group'].map(self.race_group_mapping)

        return df_transformed

In [None]:
# Hold out 20% of the labelled rows for local evaluation; the fixed seed keeps
# the split repeatable.
train_df_split, test_df_split = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Fit the preprocessing on the training part only, then push both parts
# through the same fitted transformer (training part first).
transformer = FeatureTransformer()
transformer.fit(train_df_split)
train_df_split, test_df_split = (
    transformer.transform(part) for part in (train_df_split, test_df_split)
)

In [None]:
from lifelines.utils import concordance_index

"""
To evaluate the equitable prediction of transplant survival outcomes,
we use the concordance index (C-index) between a series of event
times and a predicted score across each race group.
 
It represents the global assessment of the model discrimination power:
this is the model’s ability to correctly provide a reliable ranking
of the survival times based on the individual risk scores.
 
The concordance index is a value between 0 and 1 where:
 
0.5 is the expected result from random predictions,
1.0 is perfect concordance (with no censoring, otherwise <1.0),
0.0 is perfect anti-concordance (with no censoring, otherwise >0.0)

"""
import pandas.api.types

class ParticipantVisibleError(Exception):
    """Exception type declared alongside the metric; adds nothing to ``Exception``."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name : str) -> float:
    """Stratified concordance index: mean minus standard deviation across race groups.

    The two frames are joined side by side on their *index* (not on the id
    column), so they must share the same index. The C-index is computed
    separately for every ``race_group`` and the result is
    ``mean(c_indices) - std(c_indices)``, which penalises uneven performance
    between groups. Predictions are negated before scoring, i.e. a HIGHER
    ``prediction`` is interpreted as a HIGHER risk (shorter survival).

    Neither input frame is modified.

    Args:
        solution: ground truth with columns ``efs`` (event indicator),
            ``efs_time`` (time to event / censoring) and ``race_group``.
            Extra columns are ignored.
        submission: frame with ``row_id_column_name`` and ``prediction``.
        row_id_column_name: id column of ``submission``; it is dropped before
            the join so it cannot clash with an id column in ``solution``.

    Returns:
        The stratified score as a Python float.

    Raises:
        KeyError: if ``row_id_column_name`` is not a column of ``submission``
            or a required column is missing.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'

    # drop() returns a new frame, so the caller's submission keeps its id column.
    predictions = submission.drop(columns=[row_id_column_name])

    # Side-by-side join aligned on the shared index.
    merged_df = pd.concat([solution, predictions], axis=1)

    metric_list = []
    # Groups come back in sorted key order with rows in their original order.
    for _, merged_df_race in merged_df.groupby('race_group'):
        # Calculate the concordance index
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    return float(np.mean(metric_list) - np.std(metric_list))

# xgboost cox model

In [None]:
import re

# Replace '[', ']' and '<' in column names with '_' on both frames.
# NOTE(review): presumably needed because the boosting libraries used below
# reject these characters in feature names - confirm.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
forbidden_chars = ('[', ']', '<')
for frame in (train_df_split, test_df_split):
    # Names without a forbidden character are kept as-is (including non-string names).
    frame.columns = [
        regex.sub("_", name) if any(ch in str(name) for ch in forbidden_chars) else name
        for name in frame.columns.values
    ]


In [None]:
# Prepare the data: everything except the identifier and the two outcome
# columns is a model input.
non_feature_columns = ['ID', 'efs', 'efs_time']

X_train = train_df_split.drop(columns=non_feature_columns)
y_train_time, y_train_event = train_df_split['efs_time'], train_df_split['efs']

X_test = test_df_split.drop(columns=non_feature_columns)
y_test_time, y_test_event = test_df_split['efs_time'], test_df_split['efs']

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd



# Create XGBoost DMatrix objects.
#
# survival:cox reads censoring from the SIGN of the label: a positive label is
# an observed event time, a negative label is a right-censored time. The event
# flag therefore belongs in the label, not in the instance weight (a weight of
# 0 would simply remove every censored row from the risk sets).
def _cox_dmatrix(features, events):
    """Build a DMatrix whose label is +time for events and -time for censored rows.

    Durations are read from the untransformed `train` frame via the shared
    index: Cox labels must be positive durations, and `efs_time` in the
    transformed frames may have been rescaled by the feature pipeline.
    """
    raw_time = train.loc[features.index, 'efs_time'].to_numpy(dtype=float)
    observed = events.to_numpy() == 1
    return xgb.DMatrix(features, label=np.where(observed, raw_time, -raw_time))


dtrain = _cox_dmatrix(X_train, y_train_event)
dtest = _cox_dmatrix(X_test, y_test_event)

# Set parameters for XGBoost
params = {
    "objective": "survival:cox",
    "eval_metric": "cox-nloglik",
    "tree_method": "hist",
}

# Train the model
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    verbose_eval=10  # Log every 10 iterations
)

# Make predictions. survival:cox predicts on the hazard-ratio scale, so a
# larger value means higher risk - the orientation score() expects.
y_pred_train = pd.DataFrame({'ID': train_df_split['ID'], 'prediction': model.predict(dtrain)})
y_pred_test = pd.DataFrame({'ID': test_df_split['ID'], 'prediction': model.predict(dtest)})

# Calculate final scores
train_score = score(train_df_split, y_pred_train, 'ID')
test_score = score(test_df_split, y_pred_test, 'ID')

print(f"Final Train Score: {train_score}")
print(f"Final Test Score: {test_score}")


# lgbm cox model

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd


# Prepare the data
X_train = train_df_split.drop(['ID', 'efs', 'efs_time'], axis=1)
X_test = test_df_split.drop(['ID', 'efs', 'efs_time'], axis=1)


def _signed_survival_label(frame):
    """Encode (time, event) as one 1-D label: +time for events, -time for censored rows.

    A LightGBM Dataset takes a single label vector, so the event flag is
    folded into the sign. Durations are read from the untransformed `train`
    frame via the shared index because they must be positive and `efs_time` in
    the transformed frames may have been rescaled by the feature pipeline.
    """
    raw_time = train.loc[frame.index, 'efs_time'].to_numpy(dtype=float)
    observed = frame['efs'].to_numpy() == 1
    return np.where(observed, raw_time, -raw_time)


y_train = _signed_survival_label(train_df_split)
y_test = _signed_survival_label(test_df_split)

# Create LightGBM datasets
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)


def cox_objective(preds, dataset):
    """Gradient and Hessian of the negative Cox partial log-likelihood.

    LightGBM has no built-in Cox objective, so it is supplied here. `preds`
    are raw scores f_i (log relative hazards). With
    S_k = sum of exp(f_j) over rows j still at risk at the k-th time:

        grad_i = exp(f_i) * sum_{events k, t_k <= t_i} 1/S_k  -  event_i
        hess_i = exp(f_i) * sum 1/S_k  -  exp(f_i)^2 * sum 1/S_k^2

    Tied times are processed in sorted order (an approximation).
    """
    signed_time = dataset.get_label()
    order = np.argsort(np.abs(signed_time), kind='stable')
    observed = (signed_time[order] > 0).astype(float)

    # Subtracting the max leaves every exp(f_i)/S_k ratio unchanged and avoids overflow.
    exp_f = np.exp(preds[order] - np.max(preds))
    # Reverse cumulative sum = total hazard of everyone with time >= current row.
    risk = np.maximum(np.cumsum(exp_f[::-1])[::-1], np.finfo(float).tiny)

    inv_risk = np.cumsum(observed / risk)
    inv_risk_sq = np.cumsum(observed / risk ** 2)

    grad = np.empty_like(exp_f)
    hess = np.empty_like(exp_f)
    grad[order] = exp_f * inv_risk - observed
    # Floor the Hessian: it is exactly 0 for the last row in the risk ordering.
    hess[order] = np.maximum(exp_f * inv_risk - exp_f ** 2 * inv_risk_sq, 1e-6)
    return grad, hess


# Custom evaluation function
def lgb_score(preds, eval_data):
    """Overall concordance index of the raw scores (higher is better)."""
    signed_time = eval_data.get_label()
    # Higher raw score = higher risk, hence the negation for concordance_index.
    c_index = concordance_index(np.abs(signed_time), -preds, signed_time > 0)
    return 'c_index', c_index, True


# Set parameters for LightGBM
params = {
    "metric": "None",  # no built-in metric; lgb_score is reported instead
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# LightGBM >= 4 takes a custom objective through params; older releases take
# it through the `fobj` argument of train().
train_kwargs = {}
if int(lgb.__version__.split('.')[0]) >= 4:
    params["objective"] = cox_objective
else:
    train_kwargs["fobj"] = cox_objective

# Train the model
model = lgb.train(params,
                  lgb_train,
                  num_boost_round=100,
                  valid_sets=[lgb_train, lgb_eval],
                  feval=lgb_score,
                  callbacks=[lgb.log_evaluation(10)],  # Log every 10 iterations
                  **train_kwargs)

# Make predictions. With a custom objective predict() returns raw scores
# (log relative hazard): larger = higher risk, as score() expects.
y_pred_train = pd.DataFrame({'ID': train_df_split['ID'], 'prediction': model.predict(X_train)})
y_pred_test = pd.DataFrame({'ID': test_df_split['ID'], 'prediction': model.predict(X_test)})

# Calculate final scores with the same stratified metric as the XGBoost model
train_score = score(train_df_split, y_pred_train, 'ID')
test_score = score(test_df_split, y_pred_test, 'ID')

print(f"Final Train Score: {train_score}")
print(f"Final Test Score: {test_score}")
