In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the three input CSVs in order: the column dictionary, then the train
# and test tables.
dic, train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('data_dictionary.csv', 'train.csv', 'test.csv')
)

In [None]:
# Display the data dictionary read from data_dictionary.csv.
dic

In [None]:
# Summary statistics of the training table, transposed so each described
# column becomes a row.
train.describe().transpose()

In [None]:
# Columns present in train but absent from test.
# Index.difference returns the labels sorted, so the displayed list is stable
# across kernel restarts; a plain set difference iterates in hash order, which
# varies between interpreter sessions for strings.
col1 = train.columns
col2 = test.columns
col1.difference(col2).tolist()

In [None]:
# Show the column labels of the test table.
test.columns

# feature investigation

In [None]:
# Partition the training columns by dtype: object/category columns versus
# int64/float64 columns. Every int64/float64 column is selected, whatever its role.
categorical_features, numerical_features = (
    train.select_dtypes(include=dtype_group).columns
    for dtype_group in (['object', 'category'], ['int64', 'float64'])
)

categorical_features, numerical_features

In [None]:
# (rows, columns) of the training table.
train.shape

In [None]:
# Per-feature summary: number of distinct values, missing-value count and
# share, the data-dictionary description, and the distinct values themselves
# for columns with fewer than 30 of them.
for f in train.columns:
    if f == 'ID':
        continue  # the ID column is skipped
    uniqueV = train[f].unique()  # NaN, when present, counts as one distinct value
    n_missing = train[f].isnull().sum()  # computed once, reported as count and share
    # A column may have no row in the dictionary; fall back to a placeholder
    # instead of raising IndexError on an empty match. The first match wins.
    matches = dic.loc[dic['variable'] == f, 'description']
    description = matches.iloc[0] if len(matches) else 'no description available'
    print('feature: ',f)
    print('number of unique values: ', len(uniqueV))
    print('number of missing values: ', n_missing, np.round(n_missing/len(train),2))
    print('description: ', description)
    if len(uniqueV) < 30:
        print(uniqueV)
    print('---------------------------------------------')

In [None]:
# Hand-written feature grouping: only the two age columns are listed as
# numeric; every other listed column is put in the categorical group.
# NOTE(review): neither list is referenced by the other cells visible here —
# confirm whether they are still needed.
numeric = ['donor_age','age_at_hct']
categorical = ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia',
       'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue',
       'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status',
       'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6',
       'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail',
       'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct',
       'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe',
       'prior_tumor', 'hla_match_b_low', 'peptic_ulcer',
       'hla_match_a_low', 'gvhd_proph', 'rheum_issue',
       'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score',
       'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high',
       'pulm_moderate', 'hla_low_res_10']

In [None]:
# One 50-bin histogram per int64/float64 column of the training table,
# each drawn on its own 12x8 figure and titled with the column name.
for f in numerical_features:
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.hist(train[f], bins=50)
    ax.set_title(f)
    plt.show()

In [None]:
# Mean efs_time per dri_score, with one column per efs value.
# The two efs columns are relabelled positionally (first -> 'No Event',
# second -> 'Event'; presumably efs 0 and 1 — verify), dri_score is turned
# back into a regular column, and rows are ordered by the 'No Event' mean,
# largest first.
result = (
    train.groupby(['dri_score', 'efs'])['efs_time']
    .mean()
    .unstack()
    .set_axis(['No Event', 'Event'], axis=1)
    .reset_index()
    .sort_values('No Event', ascending=False)
)
result

In [None]:
# For every dri_score group, overlay the efs_time histogram of rows with
# efs == 1 (red) on that of rows with efs == 0 (blue, semi-transparent).
cat = train['dri_score'].unique()
for c in cat:
    # unique() also yields NaN when values are missing, and `== NaN` never
    # matches any row, so the missing group needs an explicit isna() mask;
    # with a plain equality test its figure would be empty.
    if pd.isna(c):
        in_group = train['dri_score'].isna()
        group_title = 'Missing (NaN)'
    else:
        in_group = train['dri_score'] == c
        group_title = c
    plt.figure(figsize=(12, 8))
    plt.hist(train[in_group & (train['efs'] == 1)]['efs_time'], bins=50, color='red', label='efs = 1')
    plt.hist(train[in_group & (train['efs'] == 0)]['efs_time'], bins=50, color='blue', alpha=0.5, label='efs = 0')
    plt.title(group_title)
    plt.legend()  # the two colours are otherwise unexplained on the figure
    plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

class FeatureTransformer:
    """Preprocessing pipeline that is fitted on one table and applied to others.

    Steps, in the order ``transform`` applies them:

    1. ``cyto_score``, ``cyto_score_detail`` and ``conditioning_intensity`` are
       mapped to numeric ranks through the ``*_rank`` dictionaries. Labels that
       are not keys of the dictionary become NaN (``Series.map`` behaviour).
    2. Two engineered columns are added: ``age_difference``
       (``donor_age - age_at_hct``) and ``years_since_first_hct``
       (``year_hct`` minus the smallest ``year_hct`` seen during ``fit``).
       ``year_hct`` itself is dropped.
    3. Numeric features are coerced with ``pd.to_numeric(errors='coerce')``,
       median-imputed and standardised.
    4. ``dummy_features`` have missing entries replaced by the constant
       ``'Missing'`` and are one-hot encoded (``handle_unknown='ignore'``).

    Columns that belong to neither feature list are passed through unchanged,
    except ``efs``, which is cast to ``int`` when present.
    """

    # Names of the columns derived in fit()/transform(); they are appended to
    # ``numeric_features`` by fit().
    _ENGINEERED_FEATURES = ('age_difference', 'years_since_first_hct')

    def __init__(self):
        # Columns that are one-hot encoded.
        self.dummy_features = [
            'dri_score', 'psych_disturb', 'diabetes', 'tbi_status', 'arrhythmia',
            'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct',
            'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'ethnicity',
            'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
            'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
            'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose',
            'cardiac', 'pulm_moderate'
        ]

        # Columns that are imputed and scaled. fit() replaces 'year_hct' with
        # the engineered columns in this list.
        self.numeric_features = [
            'cyto_score', 'hla_match_c_high', 'hla_high_res_8', 'hla_low_res_6',
            'hla_high_res_6', 'hla_high_res_10', 'hla_match_dqb1_high', 'hla_nmdp_6',
            'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low',
            'cyto_score_detail', 'conditioning_intensity', 'year_hct', 'hla_match_a_high',
            'hla_match_b_low', 'hla_match_a_low', 'hla_match_b_high', 'comorbidity_score',
            'karnofsky_score', 'hla_low_res_8', 'hla_match_drb1_high', 'hla_low_res_10',
            'donor_age', 'age_at_hct'
        ]

        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.imputer = SimpleImputer(strategy='constant', fill_value='Missing')
        self.numeric_imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()

        # Label -> rank tables. Labels mapped to np.nan are treated as missing
        # and later filled by the median imputer.
        self.cyto_score_rank = {'Favorable': 0, 'Intermediate': 1, 'Poor': 2, 'Other': 1.5, 'TBD': np.nan, 'Not tested': np.nan}
        self.cyto_score_detail_rank = {'Favorable': 0, 'Intermediate': 1, 'Poor': 2, 'TBD': np.nan, 'Not tested': np.nan}
        self.conditioning_intensity_rank = {'NMA': 0, 'RIC': 1, 'MAC': 2, 'TBD': np.nan, 'No drugs reported': np.nan, 'N/A, F(pre-TED) not submitted': np.nan}

        # Smallest year_hct seen by fit(); None until fit() has run.
        self.earliest_year = None

    def _prepare(self, df):
        """Return a copy of ``df`` with ranks, engineered columns and numeric coercion applied.

        Shared by ``fit`` and ``transform`` so both see identically prepared
        data. Requires ``self.earliest_year`` and the post-fit
        ``self.numeric_features`` to be set. The input frame is not modified.
        """
        prepared = df.copy()

        # Ordinal encoding; labels missing from a rank table become NaN.
        prepared['cyto_score'] = prepared['cyto_score'].map(self.cyto_score_rank)
        prepared['cyto_score_detail'] = prepared['cyto_score_detail'].map(self.cyto_score_detail_rank)
        prepared['conditioning_intensity'] = prepared['conditioning_intensity'].map(self.conditioning_intensity_rank)

        # Engineered columns.
        prepared['age_difference'] = prepared['donor_age'] - prepared['age_at_hct']
        prepared['years_since_first_hct'] = prepared['year_hct'] - self.earliest_year
        prepared = prepared.drop(columns=['year_hct'])

        # Anything non-numeric left in a numeric column becomes NaN.
        for feature in self.numeric_features:
            if feature in prepared.columns:
                prepared[feature] = pd.to_numeric(prepared[feature], errors='coerce')

        return prepared

    def fit(self, df):
        """Learn the imputation, scaling and encoding parameters from ``df``.

        Calling ``fit`` more than once is safe: the engineered feature names
        are added to ``numeric_features`` only once, and all fitted state is
        replaced.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain every column in ``dummy_features`` and
            ``numeric_features`` plus ``year_hct``. It is not modified.

        Returns
        -------
        FeatureTransformer
            ``self``, so the call can be chained.
        """
        self.earliest_year = df['year_hct'].min()

        # Swap 'year_hct' for the engineered columns. Existing engineered
        # names are filtered out first so a second fit() does not duplicate them.
        engineered = list(self._ENGINEERED_FEATURES)
        self.numeric_features = [
            f for f in self.numeric_features
            if f != 'year_hct' and f not in engineered
        ] + engineered

        prepared = self._prepare(df)

        # Categorical branch: fill missing values, then learn the categories.
        self.encoder.fit(self.imputer.fit_transform(prepared[self.dummy_features]))

        # Numeric branch.
        # NOTE(review): the scaler is fitted on the pre-imputation values
        # (StandardScaler disregards NaN when fitting) but is applied to
        # imputed values in transform(); confirm that this is intended.
        numeric_frame = prepared[self.numeric_features]
        self.numeric_imputer.fit(numeric_frame)
        self.scaler.fit(numeric_frame)

        return self

    def transform(self, df):
        """Apply the fitted preprocessing to ``df`` and return a new frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with the same feature columns that ``fit`` saw. It is not
            modified.

        Returns
        -------
        pandas.DataFrame
            The pass-through columns and the imputed, scaled numeric features,
            followed by the one-hot columns. ``year_hct`` and the original
            ``dummy_features`` columns are removed.
        """
        df_transformed = self._prepare(df)

        # Numeric branch: median imputation, then standardisation.
        df_transformed[self.numeric_features] = self.numeric_imputer.transform(df_transformed[self.numeric_features])
        df_transformed[self.numeric_features] = self.scaler.transform(df_transformed[self.numeric_features])

        # Categorical branch: constant imputation, then one-hot encoding.
        imputed_data = self.imputer.transform(df_transformed[self.dummy_features])
        dummy_encoded = self.encoder.transform(imputed_data)
        dummy_columns = self.encoder.get_feature_names_out(self.dummy_features)
        dummy_df = pd.DataFrame(dummy_encoded, columns=dummy_columns, index=df_transformed.index)

        # Replace the raw categorical columns with their encoded counterparts.
        df_transformed = df_transformed.drop(columns=self.dummy_features)
        df_transformed = pd.concat([df_transformed, dummy_df], axis=1)

        # 'efs' is only present in some frames; cast it to int when it is.
        if 'efs' in df_transformed.columns:
            df_transformed['efs'] = df_transformed['efs'].astype(int)

        return df_transformed



# Usage: fit the transformer on the training table, then transform that same table.
transformer = FeatureTransformer()
transformer.fit(train)
train_transformed = transformer.transform(train)
# Disabled: transforming the test table.
# NOTE(review): `test_df` is not defined in the cells visible here (the test
# table is loaded as `test`) — confirm the name before enabling this line.
#test_transformed = transformer.transform(test_df)


# target investigation

In [None]:
# efs_time histograms (100 bins each): rows with efs == 1 are drawn first,
# then rows with efs == 0.
train.loc[train['efs'] == 1, 'efs_time'].hist(bins=100)
train.loc[train['efs'] == 0, 'efs_time'].hist(bins=100)