In [1]:
import pandas as pd

# Location of the ILPD CSV under the Kaggle input directory.
LIVER = '/kaggle/input/liver-disorders/Indian Liver Patient Dataset (ILPD).csv'
# Parse the file with pandas' default options, then preview the first five rows.
df = pd.read_csv(LIVER)
df.head()

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Selector
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [2]:
# Summarise each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        583 non-null    int64  
 1   Gender     583 non-null    object 
 2   TB         583 non-null    float64
 3   DB         583 non-null    float64
 4   Alkphos    583 non-null    int64  
 5   Sgpt       583 non-null    int64  
 6   Sgot       583 non-null    int64  
 7   TP         583 non-null    float64
 8   ALB        583 non-null    float64
 9   A/G Ratio  579 non-null    float64
 10  Selector   583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [3]:
# Count the distinct values in each column; very low counts point to categorical columns.
df.nunique()

Age           72
Gender         2
TB           113
DB            80
Alkphos      263
Sgpt         152
Sgot         177
TP            58
ALB           40
A/G Ratio     69
Selector       2
dtype: int64

In [4]:
from plotly.express import histogram

# Columns whose count axis is drawn on a log scale.
# NOTE(review): presumably chosen because these lab values are long-tailed — confirm.
log_scaled_columns = {'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot'}

# One histogram per column, split by the target; the last column ('Selector') is
# the target itself and is therefore skipped.
for column in df.columns[:-1]:
    use_log_axis = column in log_scaled_columns
    figure = histogram(data_frame=df, x=column, color='Selector', log_y=use_log_axis)
    figure.show()

In [5]:
from plotly.express import imshow

# Pairwise correlations between the numeric columns only; the string-typed
# Gender column is excluded by numeric_only=True.
correlation_matrix = df.corr(numeric_only=True)
imshow(img=correlation_matrix)

This is not good news; at first glance it looks like none of our variables are highly correlated with the target variable.

In [6]:
# Show the column labels of df.
df.columns

Index(['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot', 'TP', 'ALB',
       'A/G Ratio', 'Selector'],
      dtype='object')

In [7]:
from sklearn.manifold import TSNE
from plotly.express import scatter

# Numeric features to embed. Gender (a string column) and the Selector target are
# left out of the embedding and only used to style the scatter plot.
tsne_features = ['Age', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot', 'TP', 'ALB', 'A/G Ratio']

# Impute every column with its OWN mean. Passing a single scalar (the A/G Ratio
# mean) to fillna would write that value into a gap in any other column as well.
tsne_input = df[tsne_features].fillna(value=df[tsne_features].mean())

# NOTE(review): the features are embedded on their raw scales. t-SNE is driven by
# pairwise distances, so wide-ranging columns such as Alkphos will dominate;
# consider standardising the inputs before embedding.
tsne = TSNE(n_components=2, random_state=2024, init='pca', verbose=1)
tsne_df = pd.DataFrame(
    data=tsne.fit_transform(X=tsne_input),
    columns=['tx', 'ty'],
    index=df.index,  # keep df's index so the label columns below align row by row
)
tsne_df['Selector'] = df['Selector']
tsne_df['Gender'] = df['Gender']
scatter(data_frame=tsne_df, x='tx', y='ty', symbol='Selector', color='Gender')

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 583 samples in 0.001s...
[t-SNE] Computed neighbors for 583 samples in 0.015s...
[t-SNE] Computed conditional probabilities for sample 583 / 583
[t-SNE] Mean sigma: 17.669739
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.844849
[t-SNE] KL divergence after 1000 iterations: 0.513042


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Numeric feature columns used by the model (Gender is not included).
columns = ['Age', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot', 'TP', 'ALB', 'A/G Ratio',]

# Boolean target: Selector == 1 becomes False, any other value becomes True.
target = (df['Selector'] - 1).astype(bool)

# Split BEFORE imputing so that no statistic of the held-out rows leaks into
# the preprocessing of the training rows.
X_train, X_test, y_train, y_test = train_test_split(df[columns], target, test_size=0.25, random_state=2024)

# Fill gaps column by column with the training-fold means, and reuse those same
# means for the test fold.
train_means = X_train.mean()
X_train = X_train.fillna(value=train_means)
X_test = X_test.fillna(value=train_means)

# The generous iteration cap lets the solver converge on the unscaled features.
model = LogisticRegression(max_iter=100000)
model.fit(X_train, y_train)
print(accuracy_score(y_test, model.predict(X_test)))

0.7397260273972602


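This is so much better than we might expect from the t-SNE plot above. Keep in mind, though, that the two classes are imbalanced, so accuracy on its own is flattering; the per-class report below gives a fairer picture.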

In [9]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
test_predictions = model.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=test_predictions)
print(report_text)

              precision    recall  f1-score   support

       False       0.78      0.90      0.83       107
        True       0.52      0.31      0.39        39

    accuracy                           0.74       146
   macro avg       0.65      0.60      0.61       146
weighted avg       0.71      0.74      0.72       146



In [10]:
from plotly.express import bar

# One bar per feature showing its fitted logistic-regression coefficient.
# A bar chart plots each value directly, instead of relying on a histogram's
# sum-aggregation over categories that each occur exactly once.
coefficient_df = pd.DataFrame({'feature': columns, 'coefficient': model.coef_[0]})
bar(data_frame=coefficient_df, x='feature', y='coefficient')

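Wow. Virtually all of our signal appears to be in one variable, but it seems like all of the variables contribute to the target. One caveat: the model was fitted on unstandardized features, so the size of each coefficient depends on that feature's units and the bars are not directly comparable as measures of importance.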