# Hyperparameter Tuning, Final Model + Graphs

## Importing packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sksurv.datasets import load_gbsg2
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sksurv.ensemble import RandomSurvivalForest
from sklearn.inspection import permutation_importance
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.metrics import concordance_index_ipcw
from sklearn.model_selection import GridSearchCV
from sksurv.linear_model import CoxPHSurvivalAnalysis
from matplotlib.lines import Line2D
from sksurv.metrics import concordance_index_censored
import random
set_config(display="text")  # displays text representation of estimators

In [None]:
# Raise pandas' display limits so wide and long frames are printed in full.
for option_name, option_value in (('display.max_columns', 800), ('display.max_rows', 300)):
    pd.set_option(option_name, option_value)

## Loading the table created in the preprocessing notebook

In [None]:
# Read the table written by the preprocessing notebook; the path is relative
# to the current working directory.
kidpan_living = pd.read_csv('csv_data/final_kidpan_living.csv')

## Handling features

#### Columns with highest variable importance calculated in the preprocessing notebook

In [None]:
# Features kept for modelling, followed by the two outcome columns.
best_columns = [
    "CREAT_TRR", "AGE", "AGE_DON", "PREV_TX_ANY",
    "ON_DIALYSIS", "END_BMI_CALC", "DIAB", "WORK_INCOME_TCR",
    "PRI_PAYMENT_TCR_KI", "DAYSWAIT_CHRON_KI", "PRE_TX_TXFUS", "TOT_SERUM_ALBUM",
    "PERIP_VASC", "ETHCAT_DON", "ETHCAT", "FUNC_STAT_TCR",
    "HCV_SEROSTATUS", "HIST_CIG", "HLAMIS", "EBV_SEROSTATUS",
    "DIABETES_DON", "DAYSWAIT_ALLOC", "KI_CREAT_PREOP", "DIAG_KI",
]
target_columns = ["GTIME_KI", "GSTATUS_KI"]
# Cell output: number of selected features.
len(best_columns)

In [None]:
# Keep only the selected features and the two target columns.
# The explicit .copy() makes the result an independent frame, so the in-place
# dtype conversion of the categorical columns further down does not write into
# a slice of the loaded table (which raises pandas' SettingWithCopyWarning).
kidpan_living = kidpan_living[best_columns + target_columns].copy()
kidpan_living

#### Fixing data types - correcting categorical and numerical columns

In [None]:
# Names treated as categorical. Only the names that are also present in
# kidpan_living are used when the lists are intersected with its columns below.
original_categorical_cols = ['WL_ORG', 'GENDER_REC', 'ABO_REC', 'CITIZENSHIP_REC', 'PERM_STATE', 'FUNC_STAT_TCR', 'ETHNICITY',
               'ETHCAT', 'REGION_REC', 'DATA_WAITLIST', 'OPO_CTR_CODE', 'INIT_OPO_CTR_CODE', 'DONATION',
               'ON_DIALYSIS', 'EXH_PERIT_ACCESS', 'EXH_VASC_ACCESS', 'PREV_TX', 'PREV_KI_TX', 'MALIG_TRR',
               'PRE_TX_TXFUS', 'TXKID', 'DON_RETYP', 'HBV_CORE_DON', 'HBV_SUR_ANTIGEN_DON', 'ETHCAT_DON',
               'CITIZENSHIP_DON', 'ABO_DON', 'GENDER_DON', 'HOME_STATE_DON', 'CANCER_SITE_DON', 'HIST_CIG_DON',
               'HIST_HYPERTENS_DON', 'HIST_CANCER_DON', 'DIABETES_DON', 'END_STAT_KI', 'ABO_MAT', 'ORGAN',
               'CMV_IGG_REC', 'CMV_IGM_REC', 'HBV_CORE_REC', 'HBV_SUR_ANTIGEN_REC', 'HCV_SEROSTATUS',
               'HIV_SEROSTATUS', 'PREV_TX_ANY', 'AGE_GROUP', 'MALIG', 'LT_ONE_WEEK_DON', 'EDUCATION_REC', 'DIAB',
               'DRUGTRT_COPD', 'PERIP_VASC', 'MALIG_TCR_KI', 'PRI_PAYMENT_TCR_KI', 'HAPLO_TY_MATCH_DON',
               'CMV_TEST_DON', 'HBV_TEST_DON', 'HCV_TEST_DON', 'HCV_RIBA_DON', 'HCV_ANTIBODY_DON', 'LIV_DON_TY',
               'EBV_SEROSTATUS', 'EBV_TEST_DON', 'HCV_RNA_DON', 'EDUCATION_DON', 'HBV_DNA_DON', 'CMV_NUCLEIC_DON',
               'CMV_IGG_DON', 'CMV_IGM_DON', 'WORK_INCOME_TCR', 'EBV_IGG_DON', 'EBV_IGM_DON', 'DON_ORG',
               'HCV_ANTIBODY', 'HCV_RIBA', 'HOME_STATE', 'REGION_DON', 'VIRUSES_TESTED', 'CMV_NUCLEIC',
               'DIABETES', 'FUNC_STAT', 'HBV_DNA', 'HCV_RNA', 'HEALTH_INS', 'HIST_CANCER', 'HIST_CIG', 'HIST_HYPER',
               'MARITAL_STAT', 'NON_AUTO_BLOOD', 'READMISSION_KI', 'TOBACCO_USE', 'WORK_INCOME', 'EBV_IGG',
               'EBV_IGM', 'DIAG_KI']
# Names treated as numerical; intersected with the available columns in the
# same way as the categorical list.
original_numerical_cols = ['NUM_PREV_TX', 'WGT_KG_TCR', 'HGT_CM_TCR', 'BMI_TCR', 'INIT_WGT_KG', 'INIT_HGT_CM', 'DAYSWAIT_CHRON',
               'END_STAT', 'INIT_DATE', 'INIT_BMI_CALC', 'END_BMI_CALC', 'INIT_STAT', 'TX_DATE', 'AMIS', 'BMIS',
               'DRMIS', 'HLAMIS', 'NPKID', 'HGT_CM_DON_CALC', 'WGT_KG_DON_CALC', 'BMI_DON_CALC', 'CREAT1Y',
               'DIAL_DATE', 'AGE', 'DISTANCE', 'DAYSWAIT_CHRON_KI', 'HGT_CM_CALC', 'WGT_KG_CALC',
               'BMI_CALC', 'CREAT6M', 'TOT_SERUM_ALBUM', 'WT_QUAL_DATE', 'DAYSWAIT_ALLOC', 'AGE_DON', 'DON_DATE',
               'PHYSICAL_CAPACITY', 'PREDON_HGT', 'PREDON_WGT', 'PREOP_URINE_PROTEIN', 'BP_PREOP_DIAST',
               'BP_PREOP_SYST', 'KI_CREAT_PREOP', 'CREAT_TRR']

# Split the columns that are actually present into categorical and numerical
# groups, preserving the frame's column order.
available_columns = kidpan_living.columns.tolist()
categorical_cols = [name for name in available_columns if name in original_categorical_cols]
numerical_cols = [name for name in available_columns if name in original_numerical_cols]

# Store categorical columns as generic objects rather than numbers.
kidpan_living[categorical_cols] = kidpan_living[categorical_cols].astype('object')

## Training the model

#### Splitting values into target and feature variables

In [None]:
# Outcome columns: status first, time second (the order the structured-array
# conversion below relies on). Everything else is a feature.
outcome_names = ["GSTATUS_KI", "GTIME_KI"]
y = kidpan_living[outcome_names]
X = kidpan_living.drop(columns=outcome_names)
# Drop the name of the combined frame; X and y are used from here on.
del kidpan_living

#### Formatting y for RSF

In [None]:
# Build the structured (event, time) array: 'event' is True where the status
# column (first column of y) equals 1, 'time' is the second column of y.
event_flags = y.iloc[:, 0] == 1
observed_times = y.iloc[:, 1]
y_struct = np.zeros(len(y), dtype=[('event', bool), ('time', float)])
y_struct['event'] = event_flags
y_struct['time'] = observed_times

#### Splitting X into train, validation and test data

In [None]:
# Seed shared by every randomised step in this notebook.
random_state = 10

# First split: 70 % for training, 30 % held out for validation + test.
first_split = train_test_split(X, y_struct, test_size=0.3, random_state=random_state)
X_train, X_test_val, y_train, y_test_val = first_split

In [None]:
# Second split: divide the held-out part evenly into test and validation sets.
second_split = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=random_state)
X_test, X_val, y_test, y_val = second_split

In [None]:
# Report (rows, columns) of the training, validation and test features.
for feature_split in (X_train, X_val, X_test):
    print(feature_split.shape)

In [None]:
# Keep an untouched copy of the test features; X_test itself is replaced by
# its imputed and one-hot encoded version further down.
X_test_og = X_test.copy()

#### Imputing numerical columns

In [None]:
# Missing numerical values are replaced by the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

In [None]:
# Fit the imputer on the training split only and reuse it for validation and
# test. The resulting frames get a fresh 0..n-1 index and the original names.
numerical_imputed_X_train = pd.DataFrame(
    numerical_transformer.fit_transform(X_train[numerical_cols]),
    columns=X_train[numerical_cols].columns,
)
numerical_imputed_X_val = pd.DataFrame(
    numerical_transformer.transform(X_val[numerical_cols]),
    columns=X_val[numerical_cols].columns,
)
numerical_imputed_X_test = pd.DataFrame(
    numerical_transformer.transform(X_test[numerical_cols]),
    columns=X_test[numerical_cols].columns,
)
numerical_imputed_X_train

#### Imputing categorical columns

In [None]:
# Missing categorical values are replaced by the most frequent category.
categorical_imputer = SimpleImputer(strategy='most_frequent')

In [None]:
# Learn the most frequent category per column from the TRAINING split only,
# then apply those same fill values to validation and test. The validation
# split must use transform(), not fit_transform(): refitting there would leak
# validation statistics and make the test split be imputed with them as well.
categorical_imputed_X_train = pd.DataFrame(categorical_imputer.fit_transform(X_train[categorical_cols]))
categorical_imputed_X_val = pd.DataFrame(categorical_imputer.transform(X_val[categorical_cols]))
categorical_imputed_X_test = pd.DataFrame(categorical_imputer.transform(X_test[categorical_cols]))

# Restore the column names lost by the array round-trip.
categorical_imputed_X_train.columns = X_train[categorical_cols].columns
categorical_imputed_X_val.columns = X_val[categorical_cols].columns
categorical_imputed_X_test.columns = X_test[categorical_cols].columns

In [None]:
# Cell output: the imputed categorical training columns.
categorical_imputed_X_train

#### One-hot encoding categorical columns

In [None]:
# One-hot encode the categorical columns. Categories unseen during fitting are
# encoded as all zeros (handle_unknown='ignore').
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn
# releases - confirm against the installed version before upgrading.
enc = OneHotEncoder(sparse=False, handle_unknown = 'ignore')

# Training split: fit the encoder and build the encoded frame.
encoded_array_train = enc.fit_transform(categorical_imputed_X_train)
encoded_columns_train = enc.get_feature_names_out(categorical_imputed_X_train.columns)
categorical_encoded_train = pd.DataFrame(encoded_array_train, columns=encoded_columns_train)

# Validation split: reuse the fitted encoder.
encoded_array_val = enc.transform(categorical_imputed_X_val)
encoded_columns_val = enc.get_feature_names_out(categorical_imputed_X_val.columns)
categorical_encoded_val = pd.DataFrame(encoded_array_val, columns=encoded_columns_val)

# Test split: reuse the fitted encoder.
encoded_array_test = enc.transform(categorical_imputed_X_test)
encoded_columns_test = enc.get_feature_names_out(categorical_imputed_X_test.columns)
categorical_encoded_test = pd.DataFrame(encoded_array_test, columns=encoded_columns_test)

#### Merging back numerical and categorical columns

In [None]:
# Join the numerical and the encoded categorical parts row by row; both parts
# of each split carry the same positional index.
X_train = pd.merge(numerical_imputed_X_train, categorical_encoded_train,
                   left_index=True, right_index=True)
X_val = pd.merge(numerical_imputed_X_val, categorical_encoded_val,
                 left_index=True, right_index=True)
X_test = pd.merge(numerical_imputed_X_test, categorical_encoded_test,
                  left_index=True, right_index=True)
X_train

#### Merging training and validation sets for training the final model

In [None]:
# Stack training and validation data (features and targets in the same order)
# for fitting the final model.
X_train_val = pd.concat((X_train, X_val), ignore_index=True)
y_train_val = np.concatenate([y_train, y_val])

#### Hyperparameter tuning

In [None]:
# Candidate hyperparameter values; every combination is evaluated.
param_grid = dict(
    n_estimators=[100, 200, 300],
    min_samples_split=[5, 10],
    min_samples_leaf=[10, 15],
    max_depth=[None, 3],
)

# Base estimator and exhaustive search, run in a single process.
rsf = RandomSurvivalForest(random_state=random_state)
grid_search = GridSearchCV(estimator=rsf, param_grid=param_grid, n_jobs=1)

In [None]:
# Run the search on the training split only.
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated score.
for caption, value in (
    ("Best Hyperparameters: ", grid_search.best_params_),
    ("Best Score: ", grid_search.best_score_),
):
    print(caption, value)

In [None]:
# `best_model` was used without ever being defined. Take the estimator that
# GridSearchCV refitted with the best parameter combination.
best_model = grid_search.best_estimator_

# Score the tuned model on the held-out validation and test splits.
val_score = best_model.score(X_val, y_val)
test_score = best_model.score(X_test, y_test)

In [None]:
# The survival model's score() is a concordance index, not a classification
# accuracy, so label the output accordingly.
print("Validation concordance index:", val_score)
print("Test concordance index:", test_score)

## Final model training

In [None]:
# Hyperparameters of the final model.
# NOTE(review): min_samples_split=15 is not one of the values explored in the
# grid search above ([5, 10]) - confirm this is intentional.
final_params = dict(
    n_estimators=200,
    min_samples_split=15,
    min_samples_leaf=10,
    max_depth=None,
    n_jobs=-1,
    random_state=random_state,
)
rsf = RandomSurvivalForest(**final_params)

In [None]:
# Fit the final model on the combined training + validation data.
rsf.fit(X_train_val, y_train_val)

In [None]:
# Cell output: score of the final model on the untouched test split.
rsf.score(X_test, y_test)

#### Validating permutation importance

In [None]:
# Permutation importance on the test split: every feature is shuffled 15 times
# in a single process, seeded for reproducibility.
importance_options = dict(n_repeats=15, random_state=random_state, n_jobs=1)
result = permutation_importance(rsf, X_test, y_test, **importance_options)

In [None]:
# Tabulate mean and standard deviation of the importances per feature, with
# the most important features first.
importance_table = pd.DataFrame(
    {
        "importances_mean": result["importances_mean"],
        "importances_std": result["importances_std"],
    },
    index=X_test.columns,
)
variable_importance_df = importance_table.sort_values(by="importances_mean", ascending=False)

variable_importance_df

In [None]:
# Turn the feature names from the index into a regular column called 'Feature'.
reindexed_variable_importance_df = variable_importance_df.reset_index()
column_renames = {'index': 'Feature'}
renamed_variable_importance_df = reindexed_variable_importance_df.rename(columns=column_renames)
renamed_variable_importance_df

In [None]:
# Persist the importance table without the row index; the target directory is
# relative to the working directory and is not created here.
renamed_variable_importance_df.to_csv('variable_importances/simpler_livdon_n15_nolocmis.csv', index=False)

## Graphs + other supporting computations

#### Graphs for the model's capabilities

In [None]:
# Pick five distinct row positions of the test set at random (unseeded, so a
# different selection is drawn on every run).
total_rows = X_test.shape[0]
random_indices = random.sample(range(total_rows), k=5)

In [None]:
# Observed outcomes and encoded features of the randomly chosen test rows.
random_targets = y_test[random_indices]
random_features = X_test.iloc[random_indices]
print(random_targets)
random_features

In [None]:
# Predicted risk score for each of the sampled rows (used as legend labels in
# the plots below).
risk = rsf.predict(random_features)
risk

In [None]:
# One colour per sampled patient; indexed by position in the plots below.
colors = ['blue', 'orange', 'green', 'red', 'purple']

In [None]:
# Predicted survival curve per sampled patient, with a vertical marker at the
# observed time: dashed for an event, dotted for a censored observation.
surv = rsf.predict_survival_function(random_features, return_array=True)
labels = risk.astype(int).tolist()
lines = []
for i, s in enumerate(surv):
    patient_color = colors[i]
    observed = random_targets[i]
    line = plt.step(rsf.unique_times_, s, where="post", color=patient_color)
    lines.append(line[0])
    marker_style = '--' if observed['event'] else ':'
    plt.axvline(x=observed['time'], color=patient_color, linestyle=marker_style)

# First legend: one entry per curve, labelled with the truncated risk score.
legend1 = plt.legend(lines, labels, loc='upper right', bbox_to_anchor=(1.0, 1.0))
legend1.set_title('Risk score')

# Second legend: explains the two vertical-line styles via proxy artists.
event_line = Line2D([], [], color='black', linestyle='--', label='Event')
censoring_line = Line2D([], [], color='black', linestyle=':', label='Censored')
legend2 = plt.legend(handles=[event_line, censoring_line], loc='upper right', bbox_to_anchor=(0.79, 1.0))
legend2.set_title('Observed survival time')

# Re-attach both legends as artists so that both are shown.
current_axes = plt.gca()
current_axes.add_artist(legend1)
current_axes.add_artist(legend2)

plt.ylabel("Survival probability")
plt.xlabel("Time in days")
plt.grid(True)
#plt.savefig("plots/surv_example_no_title_obslegend.pdf", format="pdf")

In [None]:
# Predicted cumulative hazard curve per sampled patient, with a vertical marker
# at the observed time: dashed for an event, dotted for a censored observation.
hazard = rsf.predict_cumulative_hazard_function(random_features, return_array=True)
hazard_lines = []
# Legend text: the integer-truncated predicted risk score of each patient.
hazard_labels = [int(score) for score in risk]

for i, s in enumerate(hazard):
    # Draw every curve exactly once (it used to be plotted twice) and keep its
    # handle so the legend refers to the lines of THIS figure.
    hazard_line = plt.step(rsf.unique_times_, s, where="post", color=colors[i])
    hazard_lines.append(hazard_line[0])
    if random_targets[i]['event']:
        plt.axvline(x=random_targets[i]['time'], color=colors[i], linestyle='--')
    else:
        plt.axvline(x=random_targets[i]['time'], color=colors[i], linestyle=':')

# Use the hazard curve handles, not the handles of the survival plot.
legend1 = plt.legend(hazard_lines, hazard_labels, loc='upper left', bbox_to_anchor=(0.0, 1.0))
legend1.set_title('Risk score')

# Proxy artists explaining the two vertical-line styles.
event_line = Line2D([], [], color='black', linestyle='--', label='Event')
censoring_line = Line2D([], [], color='black', linestyle=':', label='Censored')

legend2 = plt.legend(handles=[event_line, censoring_line], loc='upper left', bbox_to_anchor=(0.2, 1.0))
legend2.set_title('Observed survival time')

# Creating the second legend replaces the first one on the axes, so the
# legends are re-attached as plain artists to show both.
plt.gca().add_artist(legend1)
plt.gca().add_artist(legend2)

plt.ylabel("Cumulative hazard")
plt.xlabel("Time in days")
plt.grid(True)
#plt.savefig("plots/hazard_example_no_title_obslegend.pdf", format="pdf")

#### Comparison to Cox Regression

In [None]:
# Baseline Cox proportional hazards model. It is fitted on the same combined
# training + validation data as the final random survival forest, so that the
# test-set comparison between the two models is like for like.
estimator = CoxPHSurvivalAnalysis()
estimator.fit(X_train_val, y_train_val)

In [None]:
# Cell output: fitted Cox coefficients labelled with the feature names.
pd.Series(estimator.coef_, index=X_train.columns)

In [None]:
# Cell output: score of the Cox model on the test split.
estimator.score(X_test, y_test)

#### Graphs for investigating recipient's age

In [None]:
# Convert the structured test targets into a frame with 'event' and 'time'
# columns.
y_test_df = pd.DataFrame(y_test)
y_test_df

In [None]:
# Give the test features a fresh positional index so they line up with
# y_test_df in the join below.
X_test_reindexed = X_test.reset_index(drop=True)
X_test_reindexed

In [None]:
# Test features and observed outcomes side by side, joined on the index.
merged_test = X_test_reindexed.join(y_test_df)
merged_test

In [None]:
# Recipient age groups; each interval includes its lower bound and excludes
# its upper bound (closed='left').
age_intervals = pd.IntervalIndex.from_tuples([(0, 15), (15, 30), (30, 45), (45, 60), (60, 75)], closed='left')
# age_intervals = pd.IntervalIndex.from_tuples([(0, 60), (60, 75)], closed='left')

In [None]:
# Draw 1000 test rows (with replacement) per recipient age group.
samples = {}

for age_group in age_intervals:
    group_name = f'{age_group.left}-{age_group.right - 1}'
    # Select with the interval's own half-open bounds [left, right). The former
    # between(left, right - 1) silently dropped fractional ages such as 44.3,
    # which occur because missing ages are mean-imputed.
    in_group = (merged_test['AGE'] >= age_group.left) & (merged_test['AGE'] < age_group.right)
    group_data = merged_test.loc[in_group]
    if group_data.empty:
        # Sampling from an empty frame raises; report and skip the group.
        print(f"No test rows in age group {group_name}; skipped")
        continue
    # Seeded so the averaged curves below are reproducible between runs.
    samples[group_name] = group_data.sample(
        n=1000, replace=True, random_state=random_state
    ).to_dict(orient='records')

# One frame per age group (fresh 0..999 index), in the same order as the names.
group_names = list(samples)
sample_dfs = [pd.DataFrame.from_dict(records) for records in samples.values()]

group_names

In [None]:
# Separate every sampled group into its outcome columns and its features.
outcome_columns = ['event', 'time']
y_sample_dfs = [group_df[outcome_columns] for group_df in sample_dfs]
X_sample_dfs = [group_df.drop(outcome_columns, axis=1) for group_df in sample_dfs]

y_sample_dfs[0]

In [None]:
# Mean predicted risk score per age group.
averaged_risk_scores = []
for group_df in X_sample_dfs:
    risk = rsf.predict(group_df)
    averaged_risk_scores.append(np.mean(risk, axis=0))

# Difference between the second and the first age group.
second_group_risk = averaged_risk_scores[1]
first_group_risk = averaged_risk_scores[0]
print(f"{second_group_risk}-{first_group_risk}={second_group_risk-first_group_risk}")

In [None]:
# Average the predicted survival curves over all sampled rows of each group.
averaged_survival_functions = []
for group_df in X_sample_dfs:
    surv = rsf.predict_survival_function(group_df, return_array=True)
    averaged_survival_functions.append(surv.mean(axis=0))

averaged_survival_functions

In [None]:
# Averaged RSF survival curve per recipient age group.
fig, ax = plt.subplots()

for index, function in enumerate(averaged_survival_functions):
    curve_label = group_names[index]
    ax.step(rsf.unique_times_, function, where="post", label=curve_label)

ax.set_xlabel("Time in days")
ax.set_ylabel("Survival probability")
ax.set_ylim(0, 1.1)
ax.grid(True)
ax.legend(title='Recipient age')
# fig.savefig("plots/RSF_average_age.pdf", format='pdf')

In [None]:
# Kaplan-Meier estimate per age group, stored as [times, survival probabilities].
kaplan_meier_curves = []
for y_sample in y_sample_dfs:
    km_estimate = kaplan_meier_estimator(y_sample["event"], y_sample["time"])
    time, survival_prob = km_estimate
    kaplan_meier_curves.append([time, survival_prob])


In [None]:
# Kaplan-Meier curve per recipient age group.
fig, ax = plt.subplots()

for index, kaplan_meier_curve in enumerate(kaplan_meier_curves):
    km_times, km_probabilities = kaplan_meier_curve[0], kaplan_meier_curve[1]
    ax.step(km_times, km_probabilities, where="post", label=group_names[index])

ax.set_xlabel("Time in days")
ax.set_ylabel("Survival probability")
ax.set_ylim(0, 1.1)
ax.grid(True)
ax.legend(title='Recipient age')
#fig.savefig("plots/KaplanMeier_test_average_age.pdf", format='pdf')

#### Graphs for ethnic categories

In [None]:
# Pair recipient/donor ethnicity with the observed outcomes of ALL rows.
# The previously referenced `final_kidpan_living` is never defined in this
# notebook; the un-imputed feature frame X holds both columns and has the same
# row order as y_struct. The index is reset so the join aligns by position.
y_df = pd.DataFrame(y_struct)
ethcat_df = X[['ETHCAT', 'ETHCAT_DON']].reset_index(drop=True)
merged = ethcat_df.join(y_df)
merged

In [None]:
# Drop rows where either ethnicity code is 998
# (presumably the code for "unknown" - TODO confirm against the data dictionary).
# Both conditions now read from `merged` itself instead of the undefined
# `final_kidpan_living`, and .copy() makes the filtered frame independent so
# the column assignments below do not write into a slice.
known_ethnicity = (merged['ETHCAT'] != 998) & (merged['ETHCAT_DON'] != 998)
dropped_ethcat = merged[known_ethnicity].copy()

# Numeric ethnicity codes -> readable labels.
replacement_dict = {1: 'white', 2: 'black', 4: 'hispanic', 5: 'asian', 6: 'native american', 
                    7: 'Pacific islander', 9: 'multiracial' }
dropped_ethcat['ETHCAT'] = dropped_ethcat['ETHCAT'].replace(replacement_dict)
dropped_ethcat['ETHCAT_DON'] = dropped_ethcat['ETHCAT_DON'].replace(replacement_dict)
dropped_ethcat

In [None]:
# Group by (recipient, donor) ethnicity and rank the pairs by number of rows.
pair_columns = ['ETHCAT', 'ETHCAT_DON']
subsets = dropped_ethcat.groupby(pair_columns)

subset_sizes = subsets.size()
sorted_subsets = subset_sizes.sort_values(ascending=False)


In [None]:
# The ten most frequent recipient/donor ethnicity pairs.
top_subsets = sorted_subsets.head(10)

In [None]:
# Kaplan-Meier curve for each of the most frequent recipient-donor pairs.
fig, ax = plt.subplots()

for pair in top_subsets.index:
    recipient, donor = pair
    subset = subsets.get_group((recipient, donor))
    time, survival_prob = kaplan_meier_estimator(subset["event"], subset["time"])
    ax.step(time, survival_prob, where="post", label=f"{recipient}-{donor}")

ax.set_xlabel("Time in days")
ax.set_ylabel("Survival probability")
ax.set_xlim(-500, 13000)
ax.grid(True)
ax.legend(title='Ethnicity')
# plt.savefig("plots/kaplan_meier_ethnicity.pdf", format="pdf")

In [None]:
# Test rows where recipient AND donor share the same ethnicity code
# (1, 2, 4, 5 - labelled white, black, hispanic, asian in the plot below).
# The second condition used to repeat the recipient column, so the donor's
# ethnicity was never checked.
subsets = []
for i in [1, 2, 4, 5]:
    same_ethnicity = (X_test[f'ETHCAT_{i}'] == 1) & (X_test[f'ETHCAT_DON_{i}'] == 1)
    subsets.append(X_test.loc[same_ethnicity])

subsets[0]

In [None]:
# Average the predicted survival curves over the rows of every subset.
averaged_survival_functions = []
for subset in subsets:
    surv = rsf.predict_survival_function(subset, return_array=True)
    averaged_survival_functions.append(surv.mean(axis=0))

averaged_survival_functions

In [None]:
# Averaged RSF survival curve per same-ethnicity recipient-donor pair.
fig, ax = plt.subplots()
labels = ["white", "black", "hispanic", "asian"]

for index, function in enumerate(averaged_survival_functions):
    ax.step(rsf.unique_times_, function, where="post", label=f"{labels[index]}-{labels[index]}")

ax.set_ylabel("Survival probability")
ax.set_xlabel("Time in days")
# The curves are ethnicity pairs, so the legend must not be titled 'HLAMIS'.
ax.legend(title='Ethnicity')
ax.set_ylim(0, 1.1)
ax.grid(True)

fig.savefig("plots/RSF_average_ethcat.pdf", format='pdf')

#### Graphs for HLA mismatch

In [None]:
# Pair the HLA mismatch columns with the observed outcomes of ALL rows.
# The previously referenced `final_kidpan_living` is never defined in this
# notebook, and AMIS/BMIS/DRMIS are not among the selected model features, so
# the columns are re-read from the preprocessing output.
# Assumes the CSV contains all four columns and that its row order is the one
# y_struct was built from - read_csv raises if a column is missing.
y_df = pd.DataFrame(y_struct)
hlamis_columns = ['HLAMIS', 'AMIS', 'BMIS', 'DRMIS']
hlamis_df = pd.read_csv('csv_data/final_kidpan_living.csv', usecols=hlamis_columns)[hlamis_columns]
merged = hlamis_df.join(y_df)
merged

In [None]:
# Keep only rows without any missing value.
dropped_hlamis = merged.dropna(axis=0)
dropped_hlamis

In [None]:
# One subset per HLA mismatch level 0..5.
# NOTE(review): range(6) stops at 5 - confirm whether level 6 should be
# included as well.
subsets = [
    dropped_hlamis.loc[dropped_hlamis['HLAMIS'] == i]
    for i in range(6)
]

subsets[0]

In [None]:
# Kaplan-Meier curve per HLA mismatch level.
fig, ax = plt.subplots()

for index, subset in enumerate(subsets):
    print(index)
    km_estimate = kaplan_meier_estimator(subset["event"], subset["time"])
    time, survival_prob = km_estimate
    ax.step(time, survival_prob, where="post", label=index)

ax.set_xlabel("Time in days")
ax.set_ylabel("Survival probability")
ax.set_xlim(-500, 13000)
ax.grid(True)
ax.legend(title='HLAMIS')
# plt.savefig("plots/kaplan_meier_hlamis.pdf", format="pdf")

In [None]:
# Test rows per HLA mismatch level 0..5, used for the averaged RSF curves
# below. The filter used to read the non-existent column 'ETH_CAT' (KeyError);
# this section groups by 'HLAMIS'.
subsets = []
for i in range(0, 6):
    temp = X_test.loc[X_test['HLAMIS'] == i]
    subsets.append(temp)

subsets[0]

In [None]:
# Average the predicted survival curves over the rows of every subset.
averaged_survival_functions = []
for subset in subsets:
    surv = rsf.predict_survival_function(subset, return_array=True)
    averaged_survival_functions.append(surv.mean(axis=0))

averaged_survival_functions

In [None]:
# Averaged RSF survival curve per subset, labelled with the subset's position.
fig, ax = plt.subplots()

for index, function in enumerate(averaged_survival_functions):
    ax.step(rsf.unique_times_, function, label=index, where="post")

ax.set_xlabel("Time in days")
ax.set_ylabel("Survival probability")
ax.set_ylim(0, 1.1)
ax.grid(True)
ax.legend(title='HLAMIS')

# fig.savefig("plots/RSF_average_hlamis.pdf", format='pdf')

#### Rate of censoring

In [None]:
# Split the outcomes by status (0 = censored, 1 = event) and print the share
# of censored rows among the two groups.
status = y['GSTATUS_KI']
censored_data = y[status == 0]
event_data = y[status == 1]
bin_width = 1000
n_censored, n_events = len(censored_data), len(event_data)
print(n_censored / (n_censored + n_events))

In [None]:
# Overlayed histograms of the observed times for censored rows and events.
# The number of bins is the overall time span divided by the bin width.
time_span = y['GTIME_KI'].max() - y['GTIME_KI'].min()
n_bins = int(time_span / bin_width)
histogram_specs = (
    (censored_data, 'red', 'Censored'),
    (event_data, 'green', 'Event Occurrence'),
)
for data, bar_color, bar_label in histogram_specs:
    plt.hist(data['GTIME_KI'], bins=n_bins, color=bar_color, alpha=0.5, label=bar_label)

# Title, axis labels and legend.
plt.title('Censoring Distribution Histogram')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.legend()
plt.show()

#### Computing risk score - example

In [None]:
# Imputed test data with the categorical columns still in their original
# (not one-hot encoded) form; rows are matched on the index.
X_test_before_encoding = numerical_imputed_X_test.merge(categorical_imputed_X_test, left_index=True, right_index=True)

In [None]:
# Position of one randomly chosen test row (unseeded).
random_index = np.random.choice(X_test.shape[0])

In [None]:
# The chosen row as a one-row frame (double brackets keep it a DataFrame).
original_row = X_test_before_encoding.iloc[[random_index]]
original_row

In [None]:
# Columns of the example row that make up the `patient` part; every other
# column is treated as donor-side further down.
patient_columns = [
    'CREAT_TRR', 'AGE', 'END_BMI_CALC', 'DAYSWAIT_CHRON_KI', 'TOT_SERUM_ALBUM',
    'DAYSWAIT_ALLOC', 'PREV_TX_ANY', 'ON_DIALYSIS', 'DIAB', 'WORK_INCOME_TCR',
    'PRI_PAYMENT_TCR_KI', 'PRE_TX_TXFUS', 'PERIP_VASC', 'ETHCAT', 'FUNC_STAT_TCR',
    'HCV_SEROSTATUS', 'HIST_CIG', 'EBV_SEROSTATUS', 'DIAG_KI',
]
row = original_row.copy()
patient = row[patient_columns]
patient

In [None]:
# Reshape the one-row patient frame into a long Column/Value table.
vertical_table = pd.melt(patient, var_name='Column', value_name='Value')
vertical_table

In [None]:
# Print the patient table as LaTeX source, without the row index.
print(vertical_table.to_latex(index=False))

In [None]:
# Number of example donors to draw.
num_rows = 5

# Distinct random row positions within the one-hot encoded test set.
n_test_rows = X_test.shape[0]
random_indices = np.random.choice(n_test_rows, size=num_rows, replace=False)

# The same rows, taken from the imputed but not yet encoded test data.
original_rows = X_test_before_encoding.iloc[random_indices]
original_rows

In [None]:
# Donor-side part of the sampled rows: every column that is not a patient column.
rows = original_rows.copy()
donor_columns = [x for x in rows.columns.tolist() if x not in patient_columns]
donors_og = rows[donor_columns]
donors = donors_og.reset_index(drop=True)

# Show whole-number columns as integers. Only the columns that actually exist
# are cast: DRMIS, AMIS and BMIS are not among the selected features, so
# casting them unconditionally raised a KeyError.
integer_columns = [name for name in ('AGE_DON', 'DRMIS', 'AMIS', 'BMIS', 'HLAMIS')
                   if name in donors.columns]
donors[integer_columns] = donors[integer_columns].astype(int)
donors

In [None]:
# Print the donor table as LaTeX source.
print(donors.to_latex())

In [None]:
# The sampled rows in their one-hot encoded form.
encoded_donors_whole_rows = X_test.iloc[random_indices]
encoded_donors_whole_rows

In [None]:
# Repeat the example patient once per sampled donor. The count follows
# num_rows instead of a hard-coded five copies, and the cell now displays the
# repeated frame it has just built rather than the single patient row.
patient_multiple = pd.concat([patient] * num_rows)
patient_multiple

In [None]:
# Put the repeated patient next to each sampled donor (both re-indexed from 0)
# and restore the column order of the original feature frame X.
patient_part = patient_multiple.reset_index(drop=True)
donor_part = donors_og.reset_index(drop=True)
example_test = pd.concat([patient_part, donor_part], axis=1)
example_test = example_test[X.columns]
example_test

In [None]:
# Categorical column names in the order of the untouched test frame.
right_columns = X_test_og[categorical_cols].columns.tolist()
X_test_og[categorical_cols]

In [None]:
# Categorical part of the constructed example rows.
example_test_categorical = example_test[right_columns]
example_test_categorical

In [None]:
# One-hot encode the example rows with the encoder fitted on the training data.
categorical_example_input = example_test[right_columns]
encoded_example_array = enc.transform(categorical_example_input)
encoded_example_columns = enc.get_feature_names_out(example_test_categorical.columns)
encoded_example = pd.DataFrame(data=encoded_example_array, columns=encoded_example_columns)
encoded_example

In [None]:
# Numerical part of the constructed example rows.
numerical_example = example_test[numerical_cols]
numerical_example

In [None]:
# Numerical and encoded categorical parts joined on the index, mirroring how
# the model's training matrix was assembled.
example = numerical_example.merge(encoded_example, left_index=True, right_index=True)
example

In [None]:
# Predicted risk score of the example patient paired with each sampled donor.
risks_ex = rsf.predict(example)
risks_ex