In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from scipy.io import arff

In [2]:
data = arff.loadarff('bone-marrow.arff')
df = pd.DataFrame(data[0])
df.drop(columns=['Disease'], inplace=True)

In [7]:
#Convert all columns to numeric, coerce errors to null values
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')
    
#Make sure binary columns are encoded as 0 and 1
for c in df.columns[df.nunique()==2]:
    df[c] = (df[c]==1)*1.0

In [8]:
#Calculate the number of unique values for each column
print('Count of unique values in each column:')
print(df.nunique())

Count of unique values in each column:
Recipientgender           2
Stemcellsource            2
Donorage                187
Donorage35                2
IIIV                      2
Gendermatch               2
DonorABO                  4
RecipientABO              4
RecipientRh               2
ABOmatch                  2
CMVstatus                 4
DonorCMV                  2
RecipientCMV              2
Riskgroup                 2
Txpostrelapse             2
Diseasegroup              2
HLAmatch                  4
HLAmismatch               2
Antigen                   4
Alel                      5
HLAgrI                    7
Recipientage            125
Recipientage10            2
Recipientageint           3
Relapse                   2
aGvHDIIIIV                2
extcGvHD                  2
CD34kgx10d6             183
CD3dCD34                182
CD3dkgx10d8             163
Rbodymass               130
ANCrecovery              18
PLTrecovery              50
time_to_aGvHD_III_IV     28
survival_

In [9]:
#Set target, survival_status,as y; features (dropping survival status and time) as X
y = df.survival_status
X= df.drop(columns=['survival_time','survival_status'])


In [10]:
#Define lists of numeric and categorical columns based on number of unique values
num_cols = X.columns[X.nunique()>7]
cat_cols = X.columns[X.nunique()<=7]

#Print columns with missing values
print('Columns with missing values:')
print(X.columns[X.isnull().sum()>0])

Columns with missing values:
Index(['RecipientABO', 'CMVstatus', 'Antigen', 'Alel', 'CD3dCD34',
       'CD3dkgx10d8', 'Rbodymass'],
      dtype='object')


In [11]:
#Split data into train/test split
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.2)

In [12]:
#Create categorical preprocessing pipeline
#Using mode to fill in missing values and OHE
cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(sparse=False, drop='first', handle_unknown = 'ignore'))])

In [13]:
#Create numerical preprocessing pipeline
# Using mean to fill in missing values and standard scaling of features
num_vals = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

In [14]:
#Create column transformer that will preprocess the numerical and categorical features separately
preprocess = ColumnTransformer(
    transformers=[
        ("cat_process", cat_vals, cat_cols),
        ("num_process", num_vals, num_cols)
    ]
)

In [15]:
#Create a pipeline with preprocess, PCA, and a logistic regresssion model
pipeline = Pipeline([("preprocess",preprocess), 
                     ("pca", PCA()),
                     ("clf",LogisticRegression())])


In [16]:
#Fit the pipeline on the training data
pipeline.fit(x_train, y_train)

#Predict the pipeline on the test data
print('Pipeline Accuracy Test Set:')
print(pipeline.score(x_test,y_test))

Pipeline Accuracy Test Set:
0.7894736842105263


In [17]:
#Define search space of hyperparameters
search_space = [{'clf':[LogisticRegression()],
                     'clf__C': np.logspace(-4, 2, 10),
                'pca__n_components':np.linspace(5,35,7).astype(int)},
                {'clf': [RandomForestClassifier()], # Actual Estimator
                'clf__max_depth': np.linspace(2,20,10).astype(int),
                'pca__n_components':np.linspace(5,35,7).astype(int)}
                   ]

In [None]:
#Search over hyperparameters abolve to optimize pipeline and fit
gs = GridSearchCV(pipeline, search_space, cv=5)
gs.fit(x_train, y_train)

#Save the best estimator from the gridsearch and print attributes and final accuracy on test set
best_model = gs.best_estimator_

In [None]:
#Print attributes of best_model
print('The best classification model is:')
print(best_model.named_steps['clf'])
print('The hyperparameters of the best classification model are:')
print(best_model.named_steps['clf'].get_params())
print('The number of components selected in the PCA step are:')
print(best_model.named_steps['pca'].n_components)

#Print final accuracy score 
print('Best Model Accuracy Test Set:')
print(best_model.score(x_test,y_test))