In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the data dictionary plus the train/test tables from the working directory.
dic, train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('data_dictionary.csv', 'train.csv', 'test.csv')
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

class FeatureTransformer:
    """Fit-once preprocessing for the transplant survival table used in this notebook.

    ``fit`` learns the imputation / scaling statistics and the earliest
    ``year_hct`` from a training frame; ``transform`` applies them to any frame
    with the same columns.

    Column groups:
      * ``categorical_features`` - missing values filled with the string
        ``'Missing'``, then cast to pandas ``category`` dtype.
      * ``numeric_features`` - median-imputed, then standardised. After ``fit``
        this list also contains the derived columns ``age_difference`` and
        ``years_since_first_hct``.
      * ``ordinal_features`` - mapped to numeric ranks through the
        ``<feature>_rank`` dictionaries; values mapped to NaN (or absent from
        the dictionary) stay NaN because these columns are not imputed here.
    """

    # Columns created inside fit/transform that are imputed and scaled together
    # with the raw numeric columns.
    _DERIVED_NUMERIC_FEATURES = ('age_difference', 'years_since_first_hct')

    def __init__(self):
        self.categorical_features = [
            'dri_score', 'psych_disturb', 'diabetes', 'tbi_status', 'arrhythmia',
            'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct',
            'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'ethnicity',
            'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
            'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
            'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
            'cardiac', 'pulm_moderate'
        ]

        self.numeric_features = [
            'hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6',
            'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6',
            'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low',
            'comorbidity_score', 'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high',
            'hla_low_res_10', 'donor_age', 'age_at_hct'
        ]

        self.ordinal_features = ['cyto_score', 'cyto_score_detail', 'conditioning_intensity']

        self.imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        self.numeric_imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()

        # Rank tables are looked up by name as "<feature>_rank" in fit/transform.
        self.cyto_score_rank = {'Favorable': 0, 'Intermediate': 1, 'Poor': 2, 'Other': 1.5, 'TBD': np.nan, 'Not tested': np.nan}
        self.cyto_score_detail_rank = {'Favorable': 0, 'Intermediate': 1, 'Poor': 2, 'TBD': np.nan, 'Not tested': np.nan}
        self.conditioning_intensity_rank = {'NMA': 0, 'RIC': 1, 'MAC': 2, 'TBD': np.nan, 'No drugs reported': np.nan, 'N/A, F(pre-TED) not submitted': np.nan}

        # Smallest year_hct seen during fit; None means "not fitted yet".
        self.earliest_year = None

    def fit(self, df):
        """Learn imputation/scaling statistics and the earliest ``year_hct``.

        Args:
            df: Training frame containing the categorical, numeric and ordinal
                feature columns plus ``year_hct``. It is not modified.

        Returns:
            self, so that ``FeatureTransformer().fit(df)`` can be chained.
        """
        df = df.copy()

        # Handle ordinal features
        for feature in self.ordinal_features:
            df[feature] = df[feature].map(getattr(self, f"{feature}_rank"))

        # Add age difference feature
        df['age_difference'] = df['donor_age'] - df['age_at_hct']

        # Handle year_hct
        self.earliest_year = df['year_hct'].min()
        df['years_since_first_hct'] = df['year_hct'] - self.earliest_year

        # Register the derived columns exactly once: a plain `+=` here would
        # append duplicates every time fit() is called again on this instance.
        for derived in self._DERIVED_NUMERIC_FEATURES:
            if derived not in self.numeric_features:
                self.numeric_features.append(derived)

        # Fit imputers and scaler
        self.imputer.fit(df[self.categorical_features])
        self.numeric_imputer.fit(df[self.numeric_features])
        self.scaler.fit(df[self.numeric_features])

        return self

    def transform(self, df):
        """Apply the fitted preprocessing and return a new frame.

        Steps: map ordinal columns to ranks, add ``age_difference`` and
        ``years_since_first_hct``, drop ``year_hct``, impute and standardise the
        numeric columns, fill and cast the categorical columns. When an ``efs``
        column is present it is cast to int and ``efs_time2`` is added, equal to
        ``efs_time`` with its sign flipped on rows where ``efs == 0``.

        Args:
            df: Frame with the same feature columns used in ``fit``. It is not
                modified.

        Returns:
            The transformed copy of ``df``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.earliest_year is None:
            raise RuntimeError("FeatureTransformer.transform() called before fit().")

        df_transformed = df.copy()

        # Handle ordinal features
        for feature in self.ordinal_features:
            df_transformed[feature] = df_transformed[feature].map(getattr(self, f"{feature}_rank"))

        # Add age difference feature
        df_transformed['age_difference'] = df_transformed['donor_age'] - df_transformed['age_at_hct']

        # Handle year_hct
        df_transformed['years_since_first_hct'] = df_transformed['year_hct'] - self.earliest_year
        df_transformed = df_transformed.drop(columns=['year_hct'])

        # Impute missing values
        df_transformed[self.categorical_features] = self.imputer.transform(df_transformed[self.categorical_features])
        df_transformed[self.numeric_features] = self.numeric_imputer.transform(df_transformed[self.numeric_features])

        # Normalize numeric values
        df_transformed[self.numeric_features] = self.scaler.transform(df_transformed[self.numeric_features])

        # Convert categorical features to 'category' dtype
        for feature in self.categorical_features:
            df_transformed[feature] = df_transformed[feature].astype('category')

        # Handle 'efs' separately as it's binary
        if 'efs' in df_transformed.columns:
            df_transformed['efs'] = df_transformed['efs'].astype(int)
            # Signed time target: positive where efs == 1, negative where efs == 0.
            df_transformed["efs_time2"] = df_transformed.efs_time.copy()
            df_transformed.loc[df_transformed.efs==0,"efs_time2"] *= -1

        return df_transformed


In [None]:
# Hold out 20% of the labelled data, learn the preprocessing statistics on the
# remaining 80% only, then apply the same fitted transformer to both parts.
raw_fit_part, raw_holdout_part = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

transformer = FeatureTransformer()
transformer.fit(raw_fit_part)

train_df_split, test_df_split = (
    transformer.transform(part) for part in (raw_fit_part, raw_holdout_part)
)


# xgboost cox model

In [None]:
import re

# Replace '[', ']' and '<' in column names with '_'
# (presumably because XGBoost rejects these characters in feature names - TODO confirm).
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
_special_chars = set(('[', ']', '<'))


def _clean_column_names(names):
    """Return `names` with every '[', ']' or '<' replaced by '_'; other names pass through unchanged."""
    cleaned = []
    for col in names:
        if any(ch in str(col) for ch in _special_chars):
            cleaned.append(regex.sub("_", col))
        else:
            cleaned.append(col)
    return cleaned


train_df_split.columns = _clean_column_names(train_df_split.columns.values)
test_df_split.columns = _clean_column_names(test_df_split.columns.values)


## cv

In [None]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor, XGBClassifier
import xgboost

In [None]:
# Model inputs: every column except the row id and the target columns.
FEATURES = train_df_split.drop(['ID', 'efs', 'efs_time','efs_time2'], axis=1).columns

# Renumber rows 0..n-1 so the positional indices produced by KFold can be used
# with .loc. drop=True discards the old index instead of inserting it as an
# 'index' column (which would otherwise end up among the model inputs of any
# later cell that builds features by dropping columns), and plain assignment
# keeps this cell safe to re-run.
train_df_split = train_df_split.reset_index(drop=True)
test_df_split = test_df_split.reset_index(drop=True)

In [None]:
%%time
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
oof_xgb = np.zeros(len(train_df_split))
pred_xgb = np.zeros(len(test_df_split))

for i, (train_index, test_index) in enumerate(kf.split(train_df_split)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)
    
    x_train = train_df_split.loc[train_index,FEATURES].copy()
    y_train = train_df_split.loc[train_index,"efs_time2"]
    x_valid = train_df_split.loc[test_index,FEATURES].copy()
    y_valid = train_df_split.loc[test_index,"efs_time2"]
    x_test = test_df_split[FEATURES].copy()

    model_xgb = XGBRegressor(
        max_depth=3,  
        colsample_bytree=0.5, 
        subsample=0.8, 
        n_estimators=10_000,  
        learning_rate=0.1, 
        #eval_metric="mae",
        early_stopping_rounds=25,
        #objective='reg:logistic',
        enable_categorical=True,
        min_child_weight=5,
        objective = "survival:cox",
        eval_metric= "cox-nloglik",
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],  
        verbose=100 
    )

    # INFER OOF
    oof_xgb[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb /= FOLDS

In [None]:
from metric import score

def _score_split(frame, predictions):
    """Score `predictions` (one per row of `frame`) with the competition metric.

    `score` receives fresh copies: the id/outcome/group columns as the solution
    and an ID + prediction table as the submission.
    """
    solution = frame[["ID","efs","efs_time","race_group"]].copy()
    submission = frame[["ID"]].copy()
    submission["prediction"] = predictions  # model output used directly as the risk score
    return score(solution, submission, "ID")


m = _score_split(train_df_split, oof_xgb)   # out-of-fold predictions
n = _score_split(test_df_split, pred_xgb)   # fold-averaged hold-out predictions
print(f"CV: {m} | Test: {n}")

In [None]:
"""
basic model from kaggle: 0.6677549507917714

full feature one hot encoding, imputing, normalizing, target normalizing
CV: 0.435125700420623 | Test: 0.43775461409778854

full feature one hot encoding, imputing, normalizing
CV: 0.6568945009770654 | Test: 0.6611580849857914

categorical values, imputing, normalizing
CV: 0.6624635547259778 | Test: 0.6641153857333534

"""

## no cv

In [None]:
# Prepare the data.
# Select the model inputs through FEATURES, the same column list the CV section
# uses. Building the matrices with .drop([...]) instead would also keep the
# 'index' column that reset_index(inplace=True) inserts into both frames, so the
# model would train on the original row position as a feature.
X_train = train_df_split[FEATURES]
y_train = train_df_split['efs_time2']

X_test = test_df_split[FEATURES]
y_test = test_df_split['efs_time2']

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
from metric import score



# Create XGBoost DMatrix objects.
# The categorical columns were cast to pandas 'category' dtype by
# FeatureTransformer.transform, so DMatrix must be told to accept them.
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Set parameters for XGBoost.
# No "n_estimators" here: the native xgb.train API takes the number of trees
# from num_boost_round and ignores (and warns about) that sklearn-wrapper name.
params = {
    "objective": "survival:cox",
    "eval_metric": "cox-nloglik",
    "max_depth":3,  
    "colsample_bytree":0.5, 
    "subsample":0.8, 
    "learning_rate":0.1, 
    "min_child_weight":5,
}

# Train the model
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    verbose_eval=100  # Log every 100 iterations
)

# Make predictions
y_pred_train = pd.DataFrame({'ID':train_df_split['ID'],'prediction':model.predict(dtrain)})
y_pred_test = pd.DataFrame({'ID':test_df_split['ID'],'prediction':model.predict(dtest)})

# Calculate final scores.
# Hand `score` trimmed copies, the same way the CV scoring cell does, so the
# working dataframes cannot be altered by the metric code.
score_columns = ["ID", "efs", "efs_time", "race_group"]
train_score = score(train_df_split[score_columns].copy(), y_pred_train.copy(), 'ID')
test_score = score(test_df_split[score_columns].copy(), y_pred_test.copy(), 'ID')

print(f"Final Train Score: {train_score}")
print(f"Final Test Score: {test_score}")


In [None]:
from metric import score

# Score the out-of-fold predictions.
# oof_xgb holds one value per row of train_df_split (the 80% training split),
# so it must be paired with that frame; assigning it onto the full `train`
# table fails with a length mismatch.
# NOTE(review): this needs train_df_split to still contain the 'ID' column -
# confirm no earlier `score` call was handed the frame itself and altered it.
assert len(oof_xgb) == len(train_df_split), "oof_xgb must align with train_df_split"

y_true = train_df_split[["ID","efs","efs_time","race_group"]].copy()
y_pred = train_df_split[["ID"]].copy()
y_pred["prediction"] = oof_xgb  # model output used directly as the risk score
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost =",m)