<a href="https://colab.research.google.com/github/nrwfebriani/cirrhosisprediction/blob/main/decisiontree_cirrhosisprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Tree Classifier for Cirrhosis Prediction


---

Group 14:
*   Elza Natali Wijaya (20/460542/TK/51131)
*   Maria Anastasia Tambunan (20/460551/TK/51140)
*   Nindya Fathul Risya (20/460556/TK/51145)
*   Nur Wulan Febriani (20/460557/TK/51146)

In [1]:
# Import library
import pandas as pd
import numpy as np

In [2]:
# Menampilkan data dari dataset 
url = 'https://raw.githubusercontent.com/nrwfebriani/cirrhosisprediction/main/cirrhosis.csv'
df = pd.read_csv(url)
df.head(424)

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.60,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.10,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,414,681,D,,24472,F,,,,N,1.2,,2.96,,,,,174.0,10.9,3.0
414,415,1103,C,,14245,F,,,,N,0.9,,3.83,,,,,180.0,11.2,4.0
415,416,1055,C,,20819,F,,,,N,1.6,,3.42,,,,,143.0,9.9,3.0
416,417,691,C,,21185,F,,,,N,0.8,,3.75,,,,,269.0,10.4,3.0


In [3]:
# Fungsi untuk menghitung entropy
def calculate_entropy(df_label):
    classes,class_counts = np.unique(df_label,return_counts = True)
    entropy_value = np.sum([(-class_counts[i]/np.sum(class_counts))*np.log2(class_counts[i]/np.sum(class_counts)) 
                        for i in range(len(classes))])
    return entropy_value

In [4]:
# Fungsi untuk menghitung information gain
def calculate_information_gain(dataset,feature,label): 
    # Hitung dataset entropy
    dataset_entropy = calculate_entropy(dataset[label])   
    values,feat_counts= np.unique(dataset[feature],return_counts=True)
    
    # Hitung weighted feature entropy                                       # Memanggil fungsi entropy
    weighted_feature_entropy = np.sum([(feat_counts[i]/np.sum(feat_counts))*calculate_entropy(dataset.where(dataset[feature]
                              ==values[i]).dropna()[label]) for i in range(len(values))])    
    feature_info_gain = dataset_entropy - weighted_feature_entropy
    return feature_info_gain

In [5]:
# Fungsi membentuk decision tree
def create_decision_tree(dataset, df, features, label, parent):

  datum = np.unique(df[label], return_counts=True)
  unique_data = np.unique(dataset[label])

  if len(unique_data) <= 1:
    return unique_data[0]
  elif len(dataset) == 0:
    return unique_data[np.argmax(datum[1])]
  elif len(features) == 0:
    return parent
  else:
    parent = unique_data[np.argmax(datum[1])]

    # Memanggil fungsi calculate_information_gain
    item_values = [calculate_information_gain(dataset, feature, label) for feature in features]

    optimum_feature_index = np.argmax(item_values)
    optimum_feature = features[optimum_feature_index]
    decision_tree = {optimum_feature:{}}
    features = [i for i in features if i != optimum_feature]

    for value in np.unique(dataset[optimum_feature]):
      min_data = dataset.where(dataset[optimum_feature] == value).dropna()

      # Rekursif call fungsi create_decision_tree
      min_tree = create_decision_tree(min_data, df, features, label, parent)

      decision_tree[optimum_feature][value] = min_tree

    return(decision_tree)

In [6]:
# Fungsi untuk memprediksi
def predict_chirrosis(test_data, decision_tree):
  for nodes in decision_tree.keys():
    value = test_data[nodes]
    decision_tree = decision_tree[nodes][value]

    prediction = 0
    if type(decision_tree) is dict:

      # Rekursif call fungsi predict_chirrosis
      prediction = predict_chirrosis(test_data, decision_tree)
    else:
      prediction = decision_tree
      
    return prediction

In [7]:
# Menentukan features dan label
# Features untuk memproses karakteristik dari dataset yang ingin kita tampilkan
# Label untuk memproses apa yang ingin kita tampilkan sebagai output
features = df.columns[:-1]
label = 'Stage' # Stadium yang akan diprediksi
parent=None

In [8]:
# Melatih model decision tree 
decision_tree = create_decision_tree(df.astype(str),df,features,label,parent)

In [9]:
# Prediksi menggunakan sample data dan trained model yang ada pada tabel dataset
sample_data1 = {'ID':'416','N_Days':'1500','Status':'C','Drug':'D-penicillamine','Age':'16425', 'Sex':'F','Ascites': 'N', 'Hepatomegaly':'N','Spiders':'N','Edema':'Y','Bilirubin':'8.5','Cholesterol':'150','Albumin':'2.67','Copper':'120','Alk_Phos':'10001.2','SGOT':'150.3','Triglycerides':'102.5','Platelets':'276','Prothrombin':'10'}
sample_data2 = {'ID':'415','N_Days':'1300','Status':'C','Drug':'Placebo','Age':'15340', 'Sex':'M','Ascites': 'Y', 'Hepatomegaly':'N','Spiders':'Y','Edema':'N','Bilirubin':'10.2','Cholesterol':'90','Albumin':'3.12','Copper':'95','Alk_Phos':'12039.1','SGOT':'112.8','Triglycerides':'91.7','Platelets':'199','Prothrombin':'9'}
test_data1 = pd.Series(sample_data1)
test_data2 = pd.Series(sample_data2)
prediction1 = predict_chirrosis(test_data1,decision_tree)
prediction2 = predict_chirrosis(test_data2,decision_tree)
print("Prediction for sample data 1: Stage", prediction1, "\nPrediction for sample data 2: Stage", prediction2)

Prediction for sample data 1: Stage 3.0 
Prediction for sample data 2: Stage 4.0


In [10]:
# Menghitung akurasi decision tree classifier

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Mengubah value yang masih berupa string ke dalam bentuk float
dataset = df.values.tolist()
for column in range(len(dataset[0])):
  for row in dataset:
    if row[column]=='Y' or row[column]=='C' or row[column]=='CL' or row[column]=='F' or row[column]=='S' or row[column]=='D-penicillamine':
      row[column]=1.0
    elif row[column]=='N' or row[column]=='M' or row[column]=='Placebo' or row[column]=='D':
      row[column]=0.0

df2 = pd.DataFrame(dataset, columns =['ID', 'N_Days', 'Status', 'Drug', 'Age', 'Sex', 'Ascites',
        'Hepatomegaly', 'Spiders', 'Edema', 'Bilirubin', 'Cholesterol',
        'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets',
        'Prothrombin', 'Stage'], dtype = float)

# Menghapus kolom/baris yang memiliki value NA
df2 = df2.dropna()

X = df2[['N_Days', 'Status', 'Drug', 'Age', 'Sex', 'Ascites',
       'Hepatomegaly', 'Spiders', 'Edema', 'Bilirubin', 'Cholesterol',
       'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets',
       'Prothrombin']]
y = df2['Stage']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33,random_state=42)
clf_model = DecisionTreeClassifier(criterion="gini", random_state=42,max_depth=7, min_samples_leaf=5, max_leaf_nodes=3)      
clf_model.fit(X_train,y_train)
y_predict = clf_model.predict(X_test) #Making predictions to test the model on test data
print('Decision Tree Classifier Test accuracy %s' %accuracy_score(y_test,y_predict))

Decision Tree Classifier Test accuracy 0.5652173913043478
