#### This is a random forest model for predicting the outcome (survival or death) of 312 cirrhosis patients from the Mayo Clinic

In [3]:
# Load required packages
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [133]:
# Load dataset
# The CSV location can be overridden with the CIRRHOSIS_CSV environment variable,
# so the notebook is not tied to one machine; when the variable is unset the
# original local path is used.
cirrhosis_csv_path = os.environ.get(
    "CIRRHOSIS_CSV",
    "C:/Users/minjk/Downloads/cirrhosis+patient+survival+prediction+dataset-1/cirrhosis.csv",
)
cirrhosis = pd.read_csv(cirrhosis_csv_path)

In [53]:
# Number of rows currently in the frame
cirrhosis.shape[0]

418

In [7]:
# Preview the first five rows
cirrhosis.head(n=5)

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


#### Data Manipulation

In [135]:
# Drop rows with a missing treatment assignment (Drug)
# Count missing vs. present values before filtering; kept in a variable so it
# can be shown as the cell's output below.
drug_missing_counts = cirrhosis["Drug"].isna().value_counts()

# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignments in the following cells write to this frame
# without triggering pandas' SettingWithCopyWarning.
cirrhosis = cirrhosis[cirrhosis["Drug"].notna()].copy()

drug_missing_counts

In [65]:
# Frequency of each Status code
cirrhosis.loc[:, "Status"].value_counts()

Status
C     168
D     125
CL     19
Name: count, dtype: int64

In [137]:
# Collapse Status to two classes: "CL" (censored due to liver transplantation)
# is merged into "C" (censored); "D" and "C" map to themselves.
status_recode = {"CL": "C", "D": "D", "C": "C"}
cirrhosis.loc[:, "Status"] = cirrhosis["Status"].map(status_recode)

In [139]:
# Recode Sex variable as following: Female: 0, Male: 1
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them a second run would turn every already-recoded value into NaN, because
# Series.map returns NaN for any value that is not a key of the mapping.
sex_recode = {"F": 0, "M": 1, 0: 0, 1: 1}
# Plain column assignment replaces the column, so it takes the dtype of the
# mapped values.
cirrhosis["Sex"] = cirrhosis["Sex"].map(sex_recode)


In [141]:
# Recode Drug variable as following: D-penicillamine (Treatment): 1, Placebo: 0
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them a second run would turn every already-recoded value into NaN, because
# Series.map returns NaN for any value that is not a key of the mapping.
drug_recode = {"D-penicillamine": 1, "Placebo": 0, 0: 0, 1: 1}
# Plain column assignment replaces the column, so it takes the dtype of the
# mapped values.
cirrhosis["Drug"] = cirrhosis["Drug"].map(drug_recode)

In [145]:
# Display the working frame so the recoded Status, Sex and Drug columns can be
# checked by eye (pandas abbreviates the middle rows of a long frame).
cirrhosis

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,1,21464,0,Y,Y,Y,Y,14.5,261.0,2.60,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,1,20617,0,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,1,25594,1,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.10,55.0,151.0,12.0,4.0
3,4,1925,D,1,19994,0,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,C,0,13918,0,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,308,1153,C,1,22347,0,N,Y,N,N,0.4,246.0,3.58,24.0,797.0,91.00,113.0,288.0,10.4,2.0
308,309,994,C,0,21294,0,N,N,N,N,0.4,260.0,2.75,41.0,1166.0,70.00,82.0,231.0,10.8,2.0
309,310,939,C,1,22767,0,N,N,N,N,1.7,434.0,3.35,39.0,1713.0,171.00,100.0,234.0,10.2,2.0
310,311,839,C,1,13879,0,N,N,N,N,2.0,247.0,3.16,69.0,1050.0,117.00,88.0,335.0,10.5,2.0


In [147]:
# Target vector: Status (death vs. censored)
y = cirrhosis.loc[:, "Status"]

# Feature matrix: the five predictor columns handed to the model.
# NOTE(review): no missing-value handling is applied to these columns in this
# notebook — confirm they are complete for the retained rows.
# NOTE(review): N_Days looks like follow-up time, which would only be known
# once the outcome is — confirm it is a legitimate predictor.
X = cirrhosis.loc[:, ["N_Days", "Sex", "Drug", "Cholesterol", "Stage"]]

In [149]:
# Split the target and features datasets into 80% training dataset and 20% testing dataset
# (train_test_split is imported in the package-loading cell at the top).
# random_state pins the shuffle so the same rows land in each subset on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [151]:
# Create a random forest classifier with 100 trees
# (RandomForestClassifier is imported in the package-loading cell at the top).
# random_state pins the randomness of the forest so results are repeatable.
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [153]:
# Train the random forest model on the training features (X_train) and
# training labels (y_train) produced by the split above.
random_forest_classifier.fit(X_train, y_train)

In [155]:
# Predict a Status label for every row of the held-out test features;
# y_pred is compared against y_test in the evaluation cell.
y_pred = random_forest_classifier.predict(X_test)

In [157]:
# Goodness of fit: check accuracy of the random forest model and print the classification report
# (accuracy_score and classification_report are imported in the package-loading cell at the top).

# Accuracy: compares the predicted labels against the true test labels
accuracy = accuracy_score(y_test, y_pred)

# Classification report (per-class precision / recall / f1-score / support).
# Stored under its own name so the imported classification_report function is
# not overwritten by its own return value and stays callable afterwards.
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)

0.6666666666666666
              precision    recall  f1-score   support

           C       0.70      0.76      0.73        37
           D       0.61      0.54      0.57        26

    accuracy                           0.67        63
   macro avg       0.65      0.65      0.65        63
weighted avg       0.66      0.67      0.66        63

