# Class 11

## Import Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Task 1

### Load data

In [3]:
# Location of the Indian Liver Patient Dataset (ILPD) CSV on the UCI archive.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv'

# Column labels are supplied explicitly and the file is read with header=None,
# so every line of the CSV is treated as a data row.
columns = [
    "Age",
    "Gender",
    "TB Total Bilirubin",
    "DB Direct Bilirubin",
    "Alkphos Alkaline Phosphotase",
    "Sgpt Alamine Aminotransferase",
    "Sgot Aspartate Aminotransferase",
    "TP Total Protiens",
    "ALB Albumin",
    "A/G Ratio Albumin / Globulin Ratio",
    "Class",
]
df = pd.read_csv(url, names=columns, header=None)

### Pre-processing

#### Data Cleaning

##### Understand the basics of the data

In [4]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 583 non-null    int64  
 1   Gender                              583 non-null    object 
 2   TB Total Bilirubin                  583 non-null    float64
 3   DB Direct Bilirubin                 583 non-null    float64
 4   Alkphos Alkaline Phosphotase        583 non-null    int64  
 5   Sgpt Alamine Aminotransferase       583 non-null    int64  
 6   Sgot Aspartate Aminotransferase     583 non-null    int64  
 7   TP Total Protiens                   583 non-null    float64
 8   ALB Albumin                         583 non-null    float64
 9   A/G Ratio Albumin / Globulin Ratio  579 non-null    float64
 10  Class                               583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usa

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,TB Total Bilirubin,DB Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,TP Total Protiens,ALB Albumin,A/G Ratio Albumin / Globulin Ratio,Class
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [6]:
# (rows, columns) of the frame before any cleaning.
df.shape

(583, 11)

##### Handling null data

In [7]:
# Count the missing values in each column. Summing the boolean mask with
# DataFrame.sum() is the vectorised form of applying Python's built-in sum
# to every column and yields the same per-column counts.
df.isna().sum()

Age                                   0
Gender                                0
TB Total Bilirubin                    0
DB Direct Bilirubin                   0
Alkphos Alkaline Phosphotase          0
Sgpt Alamine Aminotransferase         0
Sgot Aspartate Aminotransferase       0
TP Total Protiens                     0
ALB Albumin                           0
A/G Ratio Albumin / Globulin Ratio    4
Class                                 0
dtype: int64

In [8]:
# Remove the rows whose albumin/globulin ratio is missing. dropna(subset=...)
# filters on that single column directly, rather than collecting the index
# labels of the null rows and dropping by label.
df = df.dropna(subset=['A/G Ratio Albumin / Globulin Ratio'])
df.head()

Unnamed: 0,Age,Gender,TB Total Bilirubin,DB Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,TP Total Protiens,ALB Albumin,A/G Ratio Albumin / Globulin Ratio,Class
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


##### Handling duplicate data

In [9]:
# Display the rows that exactly repeat an earlier row; the first occurrence
# of each repeated row is not flagged.
df.loc[df.duplicated(keep='first')]

Unnamed: 0,Age,Gender,TB Total Bilirubin,DB Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,TP Total Protiens,ALB Albumin,A/G Ratio Albumin / Globulin Ratio,Class
19,40,Female,0.9,0.3,293,232,245,6.8,3.1,0.8,1
26,34,Male,4.1,2.0,289,875,731,5.0,2.7,1.1,1
34,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,2
55,42,Male,8.9,4.5,272,31,61,5.8,2.0,0.5,1
62,58,Male,1.0,0.5,158,37,43,7.2,3.6,1.0,1
106,36,Male,5.3,2.3,145,32,92,5.1,2.6,1.0,2
108,36,Male,0.8,0.2,158,29,39,6.0,2.2,0.5,2
138,18,Male,0.8,0.2,282,72,140,5.5,2.5,0.8,1
143,30,Male,1.6,0.4,332,84,139,5.6,2.7,0.9,1
158,72,Male,0.7,0.1,196,20,35,5.8,2.0,0.5,1


In [10]:
# Keep the first occurrence of each fully duplicated row and discard the rest.
df = df.drop_duplicates(keep='first')

In [11]:
# (rows, columns) after removing the null and duplicate rows.
df.shape

(566, 11)

## Task 4 Encoding categorical attributes

In [12]:
# Convert the categorical 'Gender' column to a single numeric indicator
# (Male -> 0.0, Female -> 1.0).
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# fit_transform returns a 2-D array with one indicator column per category.
values = enc.fit_transform(df.loc[:, ['Gender']])

# A 2-D matrix cannot be meaningfully stored in one DataFrame column, so pick
# the 'Female' indicator explicitly by looking it up in the fitted categories
# rather than relying on whatever pandas does with the extra column.
# list.index raises ValueError if 'Female' is not among the categories
# (e.g. when this cell is re-run on the already-encoded column).
female_col = list(enc.categories_[0]).index('Female')

# Replace the original string values with the 1-D indicator.
df['Gender'] = values[:, female_col]


In [13]:
# Re-check the dtypes after encoding: 'Gender' should now be numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 566 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 566 non-null    int64  
 1   Gender                              566 non-null    float64
 2   TB Total Bilirubin                  566 non-null    float64
 3   DB Direct Bilirubin                 566 non-null    float64
 4   Alkphos Alkaline Phosphotase        566 non-null    int64  
 5   Sgpt Alamine Aminotransferase       566 non-null    int64  
 6   Sgot Aspartate Aminotransferase     566 non-null    int64  
 7   TP Total Protiens                   566 non-null    float64
 8   ALB Albumin                         566 non-null    float64
 9   A/G Ratio Albumin / Globulin Ratio  566 non-null    float64
 10  Class                               566 non-null    int64  
dtypes: float64(6), int64(5)
memory usage: 53.1 KB


## Task 2

In [14]:
# Features are every column except the 'Class' label; the label is the target.
X = df.drop('Class', axis=1)
y = df.loc[:, 'Class']

## Task 3

In [15]:
# Evaluate a decision tree on ten different 80/20 train/test partitions. The
# loop index doubles as the seed for both the split and the classifier.
accuracy_scores = []
for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.2
    )
    # fit() returns the fitted estimator, so construction and training chain.
    model = DecisionTreeClassifier(random_state=seed).fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy_scores += [accuracy_score(y_test, predictions)]

# Mean test accuracy across the ten runs.
decision_tree_avg_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average accuracy of DecisionTreeClassifier: {decision_tree_avg_accuracy}")

Average accuracy of DecisionTreeClassifier: 0.6228070175438598


## Task 5

In [16]:
# Evaluate a random forest on ten 80/20 train/test partitions. The loop index
# is used as the seed for both the split and the classifier, the same seeding
# scheme as the decision-tree evaluation.
rf_accuracy_scores = []
for seed in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    rf_model = RandomForestClassifier(random_state=seed)
    rf_model.fit(X_train, y_train)
    rf_predictions = rf_model.predict(X_test)
    rf_accuracy_scores.append(accuracy_score(y_test, rf_predictions))

# Mean test accuracy across the ten runs.
random_forest_avg_accuracy = sum(rf_accuracy_scores) / len(rf_accuracy_scores)
print(f"Average accuracy of RandomForestClassifier: {random_forest_avg_accuracy}")

# Compare DecisionTree and RandomForest. Equal averages are reported as a tie
# rather than being labelled 'worse'.
if random_forest_avg_accuracy > decision_tree_avg_accuracy:
    verdict = 'better than'
elif random_forest_avg_accuracy < decision_tree_avg_accuracy:
    verdict = 'worse than'
else:
    verdict = 'the same as'
print(f"RandomForestClassifier performed {verdict} DecisionTreeClassifier.")


Average accuracy of RandomForestClassifier: 0.6903508771929824
RandomForestClassifier performed better than DecisionTreeClassifier.


## Task 4

In [17]:
# Predict 'Gender' from all the other columns (including 'Class').
# NOTE(review): this rebinds the notebook-level X and y that the earlier
# 'Class' experiments use; re-running those cells after this one would train
# on the gender target instead.
y = df.loc[:, 'Gender']
X = df.drop('Gender', axis=1)

# Single 80/20 split with a fixed seed.
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# fit() returns the fitted estimator, so construction and training chain.
gender_model = DecisionTreeClassifier(random_state=0).fit(X_train_gender, y_train_gender)
gender_predictions = gender_model.predict(X_test_gender)

gender_accuracy = accuracy_score(y_test_gender, gender_predictions)
print(f"Accuracy of Gender prediction model: {gender_accuracy}")

Accuracy of Gender prediction model: 0.7368421052631579
