In [6]:
#dataset preprocessing

import pandas as pd
import numpy as np
from scipy.stats import norm

df = pd.read_csv('Indian Liver Patient Dataset (ILPD).csv')
 
#add header
headerList = ['age', 'sex', 'total_Bilirubin', 'direct_Bilirubin', 'tp', 'albumin', 'AG_atio', 'SGPT', 'SGOT', 'Alkphos','Target'] 
df.to_csv("trainset.csv", header=headerList, index=False)

In [3]:
# Stats of current dataset
import pandas as pd
import numpy as np
from scipy.stats import norm

df = pd.read_csv('trainset.csv')

# Compute statistics for numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
numeric_stats = df[numeric_columns].describe()

# Compute statistics for gender column
gender_ratio = df['gender'].value_counts(normalize=True)

# Print the statistics
print("Numeric Columns Statistics:")
print(numeric_stats)

print("\nGender Ratio:")
print(gender_ratio)

Numeric Columns Statistics:
              age  total_Bilirubin  direct_Bilirubin           tp  \
count  582.000000       582.000000        582.000000   582.000000   
mean    44.711340         3.303265          1.488488   290.754296   
std     16.181921         6.213926          2.810324   243.108929   
min      4.000000         0.400000          0.100000    63.000000   
25%     33.000000         0.800000          0.200000   175.250000   
50%     45.000000         1.000000          0.300000   208.000000   
75%     57.750000         2.600000          1.300000   298.000000   
max     90.000000        75.000000         19.700000  2110.000000   

           albumin      AG_atio        SGPT        SGOT     Alkphos  \
count   582.000000   582.000000  582.000000  582.000000  578.000000   
mean     80.824742   110.068729    6.482646    3.141581    0.947145   
std     182.757696   289.141876    1.086306    0.796176    0.319863   
min      10.000000    10.000000    2.700000    0.900000    0.30000

In [24]:
# do 80 - 20 split but make sure the gender proportion remain the same in both sets

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neural_network import MLPClassifier

data = pd.read_csv('trainset.csv')
sex_mapping = {"Male": 0, "Female": 1}
data["sex"] = data["sex"].map(sex_mapping)

columns_to_normalize = ['total_Bilirubin','direct_Bilirubin','tp','albumin','AG_atio','SGPT','SGOT','Alkphos']

# Initialize Min-Max scaler and Standard scaler
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Min-Max scaling for selected columns
data[columns_to_normalize] = min_max_scaler.fit_transform(data[columns_to_normalize])

# Z-score normalization for selected columns
data[columns_to_normalize] = standard_scaler.fit_transform(data[columns_to_normalize])

X = data.drop('Target', axis=1)
y = data['Target']
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy on base dataset: {accuracy_score(y_test,y_pred)}")

     age  sex  total_Bilirubin  direct_Bilirubin        tp   albumin  \
0     62    0         1.223586          1.428648  1.680715 -0.092140   
1     62    0         0.643743          0.930056  0.820279 -0.114045   
2     58    0        -0.370981         -0.387651 -0.447733 -0.365961   
3     72    0         0.096115          0.182168 -0.394213 -0.294768   
4     46    0        -0.242127         -0.280810 -0.340693 -0.338579   
..   ...  ...              ...               ...       ...       ...   
577   60    0        -0.451514         -0.494492  0.861448 -0.333103   
578   40    0        -0.435407         -0.494492 -0.793554 -0.250956   
579   52    0        -0.403194         -0.458878 -0.188367 -0.179762   
580   31    0        -0.322660         -0.352037 -0.439499 -0.283815   
581   38    0        -0.370981         -0.423264 -0.307758 -0.327626   

      AG_atio      SGPT      SGOT   Alkphos  Target  
0   -0.034853  0.937332  0.073438 -0.651375       1  
1   -0.145620  0.476661  0.