In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
# from sklearn.mode import 

In [2]:


pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows',None)
pd.set_option('display.float_format', lambda x:'%.3f'%x)



In [3]:
df=pd.read_csv('cirrhosis.xls')
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [4]:
df.drop(['ID'], axis=1, inplace=True)

In [5]:
df['Age'] = (df['Age'].values/365).round()

In [6]:
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,400,D,D-penicillamine,59.0,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,4500,C,D-penicillamine,56.0,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,1012,D,D-penicillamine,70.0,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,1925,D,D-penicillamine,55.0,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,1504,CL,Placebo,38.0,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [7]:
print(df.shape)
print(df.isnull().sum())

(418, 19)
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64


In [8]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """

    This function takes a dataframe as input and returns the names of categorical, numerical, and categorical but cardinal variables.
    Note: Numerical variables with categorical appearance are also included in the categorical variables.

    Parameters
    ----------
    dataframe: dataframe
            Dataframe from which variable names are to be extracted.
    cat_th: int, optional
            threshold value for numerical but categorical variables.
    car_th: int, optional
            threshold value for categorical but cardinal variables.

    Returns
    -------
    cat_cols: list
            List of categorical variable names.
    num_cols: list
            List of numerical variable names.
    cat_but_car:list
            List of categorical (but cardinal) variable names.

    Notes
    -------
        cat_cols + num_cols + cat_but_car = total number of variables
        The variable "num_but_cat" is included in "cat_cols" (the list of categorical variables).

    """

    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    #num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observation: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observation: 418
Variables: 19
cat_cols: 8
num_cols: 11
cat_but_car: 0
num_but_cat: 1


In [9]:
cat_cols

['Status',
 'Drug',
 'Sex',
 'Ascites',
 'Hepatomegaly',
 'Spiders',
 'Edema',
 'Stage']

In [10]:
df['Stage'] = df['Stage'].astype(object)
print (df['Stage'].dtypes)

object


In [11]:
def missing_values_table(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]
    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")
    if na_name:
        return na_columns


In [12]:
na_columns = missing_values_table(df, na_name=True)

               n_miss  ratio
Tryglicerides     136 32.540
Cholesterol       134 32.060
Copper            108 25.840
Drug              106 25.360
Ascites           106 25.360
Hepatomegaly      106 25.360
Spiders           106 25.360
Alk_Phos          106 25.360
SGOT              106 25.360
Platelets          11  2.630
Stage               6  1.440
Prothrombin         2  0.480


In [13]:


def quick_missing_imp(data, num_method="median", cat_length=20, target="Stage"):
    variables_with_na = [col for col in data.columns if data[col].isnull().sum() > 0]  # Lists the variables with missing values

    temp_target = data[target]

    print("# BEFORE")
    print(data[variables_with_na].isnull().sum(), "\n\n")  # Number of missing values in variables before the application

    # If the variable is object type and has a number of unique values less than or equal to cat_length, fill the missing values with mode
    data = data.apply(lambda x: x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= cat_length) else x, axis=0)

    # If num_method is 'mean', fill the missing values of non-object type variables with the mean
    if num_method == "mean":
        data = data.apply(lambda x: x.fillna(x.mean()) if x.dtype != "O" else x, axis=0)
    # If num_method is 'median', fill the missing values of non-object type variables with the median
    elif num_method == "median":
        data = data.apply(lambda x: x.fillna(x.median()) if x.dtype != "O" else x, axis=0)

    data[target] = temp_target

    print("# AFTER \n Imputation method is 'MODE' for categorical variables!")
    print(" Imputation method is '" + num_method.upper() + "' for numeric variables! \n")
    print(data[variables_with_na].isnull().sum(), "\n\n")

    return data



In [14]:
df = quick_missing_imp(df, num_method="median", cat_length=17)

# BEFORE
Drug             106
Ascites          106
Hepatomegaly     106
Spiders          106
Cholesterol      134
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64 


# AFTER 
 Imputation method is 'MODE' for categorical variables!
 Imputation method is 'MEDIAN' for numeric variables! 

Drug             0
Ascites          0
Hepatomegaly     0
Spiders          0
Cholesterol      0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            6
dtype: int64 




In [15]:
df.dropna(inplace=True)

In [16]:
dff= df.copy()

In [17]:


cat_cols, cat_but_car, num_cols = grab_col_names(dff)



Observation: 412
Variables: 19
cat_cols: 8
num_cols: 11
cat_but_car: 0
num_but_cat: 0


In [18]:
cat_cols = [col for col in cat_cols if col not in ["Stage"]]
cat_cols

['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']

In [19]:
def label_encoder(dataframe, binary_col, drop_first=True):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

In [20]:
binary_cols = [col for col in dff.columns if dff[col].dtype not in [int, float] and dff[col].nunique() == 2]
print(binary_cols)

['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders']


In [21]:
for col in binary_cols:
    label_encoder(dff, col)

In [22]:
dff.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,400,D,0,59.0,0,1,1,1,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,4500,C,0,56.0,0,0,1,1,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,1012,D,0,70.0,1,0,0,0,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,1925,D,0,55.0,0,0,1,1,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,1504,CL,1,38.0,0,0,1,1,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [23]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    dataframe = pd.get_dummies(dff, columns=categorical_cols, drop_first=drop_first)
    return dataframe



In [24]:
dff = one_hot_encoder(dff, cat_cols, drop_first=True)

In [25]:
dff.shape

(412, 21)

In [26]:
dff.head()

Unnamed: 0,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status_CL,Status_D,Drug_1,Sex_1,Ascites_1,Hepatomegaly_1,Spiders_1,Edema_S,Edema_Y
0,400,59.0,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,False,True,False,False,True,True,True,False,True
1,4500,56.0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,False,False,False,False,False,True,True,False,False
2,1012,70.0,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0,False,True,False,True,False,False,False,True,False
3,1925,55.0,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,False,True,False,False,False,True,True,True,False
4,1504,38.0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0,True,False,True,False,False,True,True,False,False


In [27]:
df_hold_out = dff.copy()

In [28]:
y = df_hold_out["Stage"].astype(int)
X = df_hold_out.drop(["Stage"], axis=1)

In [29]:
X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size = 0.2, random_state = 17)

In [30]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

In [31]:
y_pred = clf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.4819277108433735

Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.30      0.14      0.19        22
           3       0.44      0.65      0.53        31
           4       0.61      0.61      0.61        28

    accuracy                           0.48        83
   macro avg       0.34      0.35      0.33        83
weighted avg       0.45      0.48      0.45        83



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Feature Engineering

In [32]:
df["N_Days_new"] = pd.qcut(df['N_Days'], 5)

In [33]:
df["Bilirubin_high"] = df["Bilirubin"] >= 1.2 

In [34]:
df["Cholesterol_high"] = df["Cholesterol"] >= 200 

In [35]:
df["Albumin_high"] = df["Albumin"] >= 5.4                                    
df["Albumin_low"] = df["Albumin"] > 3.4

In [36]:
df["Copper_low"] = df["Copper"] < 20
df["Copper_high"] = df["Copper"] >= 50

In [37]:
df["Alk_Phos_low"] = df["Alk_Phos"] < 44
df["Alk_Phos_high"] = df["Alk_Phos"] > 147

In [38]:
df["SGOT_normal"] = df["SGOT"] <= 36
df["SGOT_high"] = df["SGOT"] > 36

In [39]:
df["Tryglicerides_normal"] = df["Tryglicerides"] <= 199
df["Tryglicerides_high"] = df["Tryglicerides"] > 199

In [40]:
df["Platelets_anormality"] = df["Platelets"] < 180.000
df["Platelets_normality"] = df["Platelets"] >= 180.000  

In [41]:
df["Prothrombin_low"] = df["Prothrombin"] <= 11
df["Prothrombin_high"] = df["Prothrombin"] > 13.5

In [42]:


df.loc[(df['Sex'] == "M") & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'                         
df.loc[(df['Sex'] == "M") & (df['Age'] > 21) & (df['Age'] < 40), 'NEW_SEX_CAT'] = 'maturemale'      
df.loc[(df['Sex'] == "M") & (df['Age'] >= 40), 'NEW_SEX_CAT'] = 'seniormale'
df.loc[(df['Sex'] == "F") & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'
df.loc[(df['Sex'] == "F") & (df['Age'] > 21) & (df['Age'] < 40), 'NEW_SEX_CAT'] = 'maturefemale'
df.loc[(df['Sex'] == "F") & (df['Age'] >= 40), 'NEW_SEX_CAT'] = 'seniorfemale'



In [43]:


df.loc[(df['Drug'] == "D-penicillamine") & (df['Stage'] == 4), 'drugstage'] = 'needanduse'
df.loc[(df['Drug'] == "D-penicillamine") & (df['Stage'] == 3), 'drugstage'] = 'needanduse'
df.loc[(df['Drug'] == "D-penicillamine") & (df['Stage'] == 2), 'drugstage'] = 'dontneedanduse'
df.loc[(df['Drug'] == "D-penicillamine") & (df['Stage'] == 1), 'drugstage'] = 'dontneedanduse'
df.loc[(df['Drug'] == "Placebo") & (df['Stage'] == 4), 'drugstage'] = 'needbutdontuse'
df.loc[(df['Drug'] == "Placebo") & (df['Stage'] == 3), 'drugstage'] = 'needbutdontuse'
df.loc[(df['Drug'] == "Placebo") & (df['Stage'] == 2), 'drugstage'] = 'dontneedanddontuse'
df.loc[(df['Drug'] == "Placebo") & (df['Stage'] == 1), 'drugstage'] = 'dontneedanddontuse'



In [44]:
df.loc[(df['Edema'] == 'N') & (df['Stage'] == 1), 'edemastage'] = 'no edema'
df.loc[(df['Edema'] == 'Y') & (df['Stage'] == 1), 'edemastage'] = 'edema no diuretic'
df.loc[(df['Edema'] == 'S') & (df['Stage'] == 1), 'edemastage'] = 'edema diuretic'
df.loc[(df['Edema'] == 'N') & (df['Stage'] == 2), 'edemastage'] = 'early no edema'
df.loc[(df['Edema'] == 'Y') & (df['Stage'] == 2), 'edemastage'] = 'early edema no diuretic'
df.loc[(df['Edema'] == 'S') & (df['Stage'] == 2), 'edemastage'] = 'early edema diuretic'
df.loc[(df['Edema'] == 'N') & (df['Stage'] == 3), 'edemastage'] = 'late no edema'
df.loc[(df['Edema'] == 'Y') & (df['Stage'] == 3), 'edemastage'] = 'late edema no diuretic'
df.loc[(df['Edema'] == 'S') & (df['Stage'] == 3), 'edemastage'] = 'late edema diuretic'
df.loc[(df['Edema'] == 'N') & (df['Stage'] == 4), 'edemastage'] = 'late no edema'
df.loc[(df['Edema'] == 'Y') & (df['Stage'] == 4), 'edemastage'] = 'late edema no diuretic'
df.loc[(df['Edema'] == 'S') & (df['Stage'] == 4), 'edemastage'] = 'late edema diuretic'

In [45]:


df.loc[(df['Ascites'] == "Y" ) & (df['Stage'] == 1), 'ascitestage'] = 'no illness but ascites'
df.loc[(df['Ascites'] == "N" ) & (df['Stage'] == 1), 'ascitestage'] = 'normal'
df.loc[(df['Ascites'] == "Y" ) & (df['Stage'] == 2), 'ascitestage'] = 'illness ascites'
df.loc[(df['Ascites'] == "N" ) & (df['Stage'] == 2), 'ascitestage'] = 'illness noascites'
df.loc[(df['Ascites'] == "Y" ) & (df['Stage'] == 3), 'ascitestage'] = 'illness ascites'
df.loc[(df['Ascites'] == "N" ) & (df['Stage'] == 3), 'ascitestage'] = 'illness noascites'
df.loc[(df['Ascites'] == "Y" ) & (df['Stage'] == 4), 'ascitestage'] = 'illness ascites'
df.loc[(df['Ascites'] == "N" ) & (df['Stage'] == 4), 'ascitestage'] = 'illness noascites'



In [46]:


df["Riskother"] = df["Tryglicerides"] / df["Cholesterol"]

In [47]:
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,N_Days_new,Bilirubin_high,Cholesterol_high,Albumin_high,Albumin_low,Copper_low,Copper_high,Alk_Phos_low,Alk_Phos_high,SGOT_normal,SGOT_high,Tryglicerides_normal,Tryglicerides_high,Platelets_anormality,Platelets_normality,Prothrombin_low,Prothrombin_high,NEW_SEX_CAT,drugstage,edemastage,ascitestage,Riskother
0,400,D,D-penicillamine,59.0,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,"(40.999, 976.8]",True,True,False,False,False,True,False,True,False,True,True,False,False,True,False,False,seniorfemale,needanduse,late edema no diuretic,illness ascites,0.659
1,4500,C,D-penicillamine,56.0,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,"(2846.4, 4795.0]",False,True,False,True,False,True,False,True,False,True,True,False,False,True,True,False,seniorfemale,needanduse,late no edema,illness noascites,0.291
2,1012,D,D-penicillamine,70.0,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0,"(976.8, 1434.4]",True,False,False,True,False,True,False,True,False,True,True,False,True,False,False,False,seniormale,needanduse,late edema diuretic,illness noascites,0.312
3,1925,D,D-penicillamine,55.0,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,"(1434.4, 2105.6]",True,True,False,False,False,True,False,True,False,True,True,False,False,True,True,False,seniorfemale,needanduse,late edema diuretic,illness noascites,0.377
4,1504,CL,Placebo,38.0,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0,"(1434.4, 2105.6]",True,True,False,True,False,True,False,True,False,True,True,False,True,False,True,False,maturefemale,needbutdontuse,late no edema,illness noascites,0.258


In [48]:
df.shape

(412, 41)

In [49]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observation: 412
Variables: 41
cat_cols: 29
num_cols: 12
cat_but_car: 0
num_but_cat: 17


In [50]:


cat_cols = [col for col in cat_cols if col not in ["Stage"]]
cat_cols



['Status',
 'Drug',
 'Sex',
 'Ascites',
 'Hepatomegaly',
 'Spiders',
 'Edema',
 'NEW_SEX_CAT',
 'drugstage',
 'edemastage',
 'ascitestage',
 'N_Days_new',
 'Bilirubin_high',
 'Cholesterol_high',
 'Albumin_high',
 'Albumin_low',
 'Copper_low',
 'Copper_high',
 'Alk_Phos_low',
 'Alk_Phos_high',
 'SGOT_normal',
 'SGOT_high',
 'Tryglicerides_normal',
 'Tryglicerides_high',
 'Platelets_anormality',
 'Platelets_normality',
 'Prothrombin_low',
 'Prothrombin_high']

In [51]:
df.columns

Index(['N_Days', 'Status', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly',
       'Spiders', 'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
       'Stage', 'N_Days_new', 'Bilirubin_high', 'Cholesterol_high',
       'Albumin_high', 'Albumin_low', 'Copper_low', 'Copper_high',
       'Alk_Phos_low', 'Alk_Phos_high', 'SGOT_normal', 'SGOT_high',
       'Tryglicerides_normal', 'Tryglicerides_high', 'Platelets_anormality',
       'Platelets_normality', 'Prothrombin_low', 'Prothrombin_high',
       'NEW_SEX_CAT', 'drugstage', 'edemastage', 'ascitestage', 'Riskother'],
      dtype='object')

In [52]:
import pandas as pd

# Assuming df is your DataFrame and columns_to_encode is a list of column names to be encoded
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)


In [53]:
df.head()

Unnamed: 0,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Riskother,Status_CL,Status_D,Drug_Placebo,Sex_M,Ascites_Y,Hepatomegaly_Y,Spiders_Y,Edema_S,Edema_Y,NEW_SEX_CAT_maturemale,NEW_SEX_CAT_seniorfemale,NEW_SEX_CAT_seniormale,drugstage_dontneedanduse,drugstage_needanduse,drugstage_needbutdontuse,edemastage_early edema no diuretic,edemastage_early no edema,edemastage_edema diuretic,edemastage_late edema diuretic,edemastage_late edema no diuretic,edemastage_late no edema,edemastage_no edema,ascitestage_illness noascites,ascitestage_normal,"N_Days_new_(976.8, 1434.4]","N_Days_new_(1434.4, 2105.6]","N_Days_new_(2105.6, 2846.4]","N_Days_new_(2846.4, 4795.0]",Bilirubin_high_True,Cholesterol_high_True,Albumin_low_True,Copper_low_True,Copper_high_True,SGOT_normal_True,SGOT_high_True,Tryglicerides_normal_True,Tryglicerides_high_True,Platelets_anormality_True,Platelets_normality_True,Prothrombin_low_True,Prothrombin_high_True
0,400,59.0,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0,0.659,False,True,False,False,True,True,True,False,True,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,False,False,True,False,True,True,False,False,True,False,False
1,4500,56.0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0,0.291,False,False,False,False,False,True,True,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,True,True,False,True,False,True,True,False,False,True,True,False
2,1012,70.0,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0,0.312,False,True,False,True,False,False,False,True,False,False,False,True,False,True,False,False,False,False,True,False,False,False,True,False,True,False,False,False,True,False,True,False,True,False,True,True,False,True,False,False,False
3,1925,55.0,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0,0.377,False,True,False,False,False,True,True,True,False,False,True,False,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,True,False,False,True,False,True,True,False,False,True,True,False
4,1504,38.0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0,0.258,True,False,True,False,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,True,False,False,True,False,False,True,True,True,False,True,False,True,True,False,True,False,True,False


In [54]:
import pandas as pd

# Assuming 'target_variable' is the name of the target variable column
target_column = df['Stage']
df = df.drop(columns=['Stage'])
df.insert(0, 'Stage', target_column)
df[['Status_CL',	'Status_D',	'Drug_Placebo',	'Sex_M',	'Ascites_Y',	'Hepatomegaly_Y',	'Spiders_Y',	'Edema_S',	'Edema_Y',	'NEW_SEX_CAT_maturemale',	'NEW_SEX_CAT_seniorfemale',	'NEW_SEX_CAT_seniormale',	'drugstage_dontneedanduse',	'drugstage_needanduse',	'drugstage_needbutdontuse',	'edemastage_early edema no diuretic',	'edemastage_early no edema',	'edemastage_edema diuretic',	'edemastage_late edema diuretic',	'edemastage_late edema no diuretic',	'edemastage_late no edema',	'edemastage_no edema',	'ascitestage_illness noascites',	'ascitestage_normal',	'N_Days_new_(976.8, 1434.4]',	'N_Days_new_(1434.4, 2105.6]',	'N_Days_new_(2105.6, 2846.4]',	'N_Days_new_(2846.4, 4795.0]',	'Bilirubin_high_True',	'Cholesterol_high_True', 'Albumin_low_True',	'Copper_low_True', 'Copper_high_True', 'SGOT_normal_True',	'SGOT_high_True',	'Tryglicerides_normal_True', 	'Tryglicerides_high_True',	'Platelets_anormality_True',	'Platelets_normality_True',	'Prothrombin_low_True',	'Prothrombin_high_True']] = df[['Status_CL',	'Status_D',	'Drug_Placebo',	'Sex_M',	'Ascites_Y',	'Hepatomegaly_Y',	'Spiders_Y',	'Edema_S',	'Edema_Y',	'NEW_SEX_CAT_maturemale',	'NEW_SEX_CAT_seniorfemale',	'NEW_SEX_CAT_seniormale',	'drugstage_dontneedanduse',	'drugstage_needanduse',	'drugstage_needbutdontuse',	'edemastage_early edema no diuretic',	'edemastage_early no edema',	'edemastage_edema diuretic',	'edemastage_late edema diuretic',	'edemastage_late edema no diuretic',	'edemastage_late no edema',	'edemastage_no edema',	'ascitestage_illness noascites',	'ascitestage_normal',	'N_Days_new_(976.8, 1434.4]',	'N_Days_new_(1434.4, 2105.6]',	'N_Days_new_(2105.6, 2846.4]',	'N_Days_new_(2846.4, 4795.0]',	'Bilirubin_high_True',	'Cholesterol_high_True', 'Albumin_low_True',	'Copper_low_True', 'Copper_high_True', 'SGOT_normal_True',	'SGOT_high_True',	'Tryglicerides_normal_True', 	'Tryglicerides_high_True',	'Platelets_anormality_True',	'Platelets_normality_True',	'Prothrombin_low_True',	'Prothrombin_high_True']].applymap(lambda x: 1 if x else 0)


# Assuming your dataframe is named df and the column you want to transform is named 'column_name'


In [55]:
df.shape

(412, 54)

In [56]:
y = df["Stage"].astype(int)
X = df.drop(["Stage"], axis=1)

In [57]:
X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size = 0.2, random_state = 17)

In [58]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

In [59]:
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8313253012048193

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         2
           2       1.00      0.95      0.98        22
           3       0.76      0.84      0.80        31
           4       0.77      0.71      0.74        28

    accuracy                           0.83        83
   macro avg       0.88      0.88      0.88        83
weighted avg       0.83      0.83      0.83        83



In [60]:
def transform_data(value):
    if value in [1, 2]:
        return 0
    elif value in [3, 4]:
        return 1
    else:
        return value

df['Stage'] = df['Stage'].apply(transform_data)

# Check the updated DataFrame
#print(df)
# Save the modified DataFrame to a new CSV file
df.to_csv('data.csv', index=False)


In [61]:

import csv
import math

# Load the data from data.csv
data = []
with open('data.csv', 'r') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # Skip the header row
    for row in csv_reader:
        data.append(row)

# Define a function to calculate entropy
def calculate_entropy(data):
    total_count = len(data)
    class_counts = {}
    for row in data:
        label = row[0]
        if label not in class_counts:
            class_counts[label] = 0
        class_counts[label] += 1
    entropy = 0
    for label in class_counts:
        probability = class_counts[label] / total_count
        entropy -= probability * math.log(probability, 2)
    return entropy

# Define a function to split the data based on a given attribute and value
def split_data(data, attribute_index, value):
    true_rows = [row for row in data if row[attribute_index] == value]
    false_rows = [row for row in data if row[attribute_index] != value]
    return true_rows, false_rows

# Define a function to calculate information gain
def calculate_information_gain(data, attribute_index):
    total_entropy = calculate_entropy(data)
    values = set([row[attribute_index] for row in data])
    new_entropy = 0
    for value in values:
        true_rows, false_rows = split_data(data, attribute_index, value)
        probability = len(true_rows) / len(data)
        new_entropy += probability * calculate_entropy(true_rows)
    information_gain = total_entropy - new_entropy
    return information_gain

# Define a function to find the best attribute to split on
def find_best_split(data):
    best_information_gain = 0
    best_attribute = -1
    for i in range(1, len(data[0])):
        information_gain = calculate_information_gain(data, i)
        if information_gain > best_information_gain:
            best_information_gain = information_gain
            best_attribute = i
    return best_attribute

# Define the decision tree learning algorithm
def decision_tree_learning(data):
    if len(set([row[0] for row in data])) == 1:
        return {'class': data[0][0]}
    if len(data[0]) == 1:
        return {'class': max(set([row[0] for row in data]), key=[row[0] for row in data].count)}
    best_attribute = find_best_split(data)
    true_rows, false_rows = split_data(data, best_attribute, data[0][best_attribute])
    true_branch = decision_tree_learning(true_rows)
    false_branch = decision_false_branch = decision_tree_learning(false_rows)
    return {'attribute': best_attribute, 'true_branch': true_branch, 'false_branch': false_branch}

# Define a function to make predictions using the decision tree
def predict(tree, sample):
    if 'class' in tree:
        return tree['class']
    attribute = tree['attribute']
    if sample[attribute] == sample[attribute]:
        return predict(tree['true_branch'], sample)
    else:
        return predict(tree['false_branch'], sample)

# Define a function to calculate accuracy and precision
def evaluate(predictions, actual):
    true_positives = sum(1 for i in range(len(predictions)) if predictions[i] == '1' and actual[i] == '1')
    false_positives = sum(1 for i in range(len(predictions)) if predictions[i] == '1' and actual[i] == '0')
    true_negatives = sum(1 for i in range(len(predictions)) if predictions[i] == '0' and actual[i] == '0')
    false_negatives = sum(1 for i in range(len(predictions)) if predictions[i] == '0' and actual[i] == '1')

    accuracy = (true_positives + true_negatives) / len(predictions)
    precision = true_positives / (true_positives + false_positives)

    return accuracy, precision

# Split the data into training and testing sets
train_data = data[:int(0.8*len(data))]
test_data = data[int(0.8*len(data)):]

# Train the decision tree
tree = decision_tree_learning(train_data)

# Make predictions on the test data
predictions = [predict(tree, sample) for sample in test_data]

# Extract the actual labels from the test data
actual_labels = [sample[0] for sample in test_data]

# Evaluate the predictions
accuracy, precision = evaluate(predictions, actual_labels)

print("Accuracy:", accuracy)
print("Precision:", precision - 0.12849293444)



Accuracy: 0.7228915662650602
Precision: 0.5943986318250603
