# Loading the Dataset

In [64]:
import pandas as pd

In [65]:
dataset = pd.read_csv('HepatitisCdata.csv')

# Understanding the data

In [66]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [67]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
610,611,3=Cirrhosis,62,f,32.0,416.6,5.9,110.3,50.0,5.57,6.3,55.7,650.9,68.5
611,612,3=Cirrhosis,64,f,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3
612,613,3=Cirrhosis,64,f,29.0,87.3,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0
613,614,3=Cirrhosis,46,f,33.0,,39.0,62.0,20.0,3.56,4.2,52.0,50.0,71.0
614,615,3=Cirrhosis,59,f,36.0,,100.0,80.0,12.0,9.07,5.3,67.0,34.0,68.0


In [68]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  615 non-null    int64  
 1   Category    615 non-null    object 
 2   Age         615 non-null    int64  
 3   Sex         615 non-null    object 
 4   ALB         614 non-null    float64
 5   ALP         597 non-null    float64
 6   ALT         614 non-null    float64
 7   AST         615 non-null    float64
 8   BIL         615 non-null    float64
 9   CHE         615 non-null    float64
 10  CHOL        605 non-null    float64
 11  CREA        615 non-null    float64
 12  GGT         615 non-null    float64
 13  PROT        614 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 67.4+ KB


# Handling missing values and outliers

In [69]:
dataset['Category'].unique()

array(['0=Blood Donor', '0s=suspect Blood Donor', '1=Hepatitis',
       '2=Fibrosis', '3=Cirrhosis'], dtype=object)

In [70]:
dataset.Sex = dataset.Sex.map({'m':0,'f':1})
dataset.Category = dataset.Category.map({'0=Blood Donor':0, '0s=suspect Blood Donor':1, 
                                         '1=Hepatitis':1, '2=Fibrosis':2, '3=Cirrhosis':3})

In [71]:
dataset.isna().sum()

Unnamed: 0     0
Category       0
Age            0
Sex            0
ALB            1
ALP           18
ALT            1
AST            0
BIL            0
CHE            0
CHOL          10
CREA           0
GGT            0
PROT           1
dtype: int64

In [72]:
dataset.mean(numeric_only=True)

Unnamed: 0    308.000000
Category        0.265041
Age            47.408130
Sex             0.386992
ALB            41.620195
ALP            68.283920
ALT            28.450814
AST            34.786341
BIL            11.396748
CHE             8.196634
CHOL            5.368099
CREA           81.287805
GGT            39.533171
PROT           72.044137
dtype: float64

In [73]:
dataset.median(numeric_only=True)

Unnamed: 0    308.00
Category        0.00
Age            47.00
Sex             0.00
ALB            41.95
ALP            66.20
ALT            23.00
AST            25.90
BIL             7.30
CHE             8.26
CHOL            5.30
CREA           77.00
GGT            23.30
PROT           72.20
dtype: float64

In [74]:
dataset.mode(numeric_only=True)

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0.0,46.0,0.0,39.0,52.5,16.6,22.0,6.0,7.52,5.07,74.0,13.0,71.9
1,2,,,,,61.2,,23.9,,,5.10,,14.5,
2,3,,,,,,,24.3,,,,,19.1,
3,4,,,,,,,,,,,,24.1,
4,5,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,611,,,,,,,,,,,,,
611,612,,,,,,,,,,,,,
612,613,,,,,,,,,,,,,
613,614,,,,,,,,,,,,,


In [75]:
dataset.fillna(dataset.mean(numeric_only=True), inplace=True)

In [76]:
dataset.isna().sum()

Unnamed: 0    0
Category      0
Age           0
Sex           0
ALB           0
ALP           0
ALT           0
AST           0
BIL           0
CHE           0
CHOL          0
CREA          0
GGT           0
PROT          0
dtype: int64

# Feature selection

In [77]:
from sklearn.feature_selection import SelectKBest, f_regression

X = dataset.drop(['Unnamed: 0', 'Category'], axis=1)
y = dataset['Category']

In [78]:
corr_matrix = dataset.corr()
corr_matrix

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
Unnamed: 0,1.0,0.552971,0.420477,0.598597,-0.309379,0.022402,-0.034996,0.332626,0.181459,-0.270549,-0.085709,-0.025987,0.247781,-0.113735
Category,0.552971,1.0,0.133398,-0.054094,-0.32367,0.06711,0.111076,0.641508,0.492222,-0.36338,-0.306945,0.196711,0.472406,-0.026754
Age,0.420477,0.133398,1.0,0.024544,-0.197005,0.168473,-0.006017,0.088666,0.032492,-0.075093,0.123978,-0.022296,0.153087,-0.153665
Sex,0.598597,-0.054094,0.024544,1.0,-0.146065,0.019547,-0.161785,-0.130891,-0.111177,-0.169111,0.029886,-0.159589,-0.133276,-0.05125
ALB,-0.309379,-0.32367,-0.197005,-0.146065,1.0,-0.138858,0.001606,-0.193397,-0.22165,0.375848,0.204803,-0.001573,-0.155555,0.550033
ALP,0.022402,0.06711,0.168473,0.019547,-0.138858,1.0,0.172461,0.062202,0.0486,0.032971,0.121914,0.149554,0.442262,-0.053633
ALT,-0.034996,0.111076,-0.006017,-0.161785,0.001606,0.172461,1.0,0.273325,-0.038468,0.146953,0.068023,-0.043025,0.248086,0.094373
AST,0.332626,0.641508,0.088666,-0.130891,-0.193397,0.062202,0.273325,1.0,0.312231,-0.208536,-0.207507,-0.021387,0.491263,0.039888
BIL,0.181459,0.492222,0.032492,-0.111177,-0.22165,0.0486,-0.038468,0.312231,1.0,-0.333172,-0.156261,0.031224,0.217024,-0.041309
CHE,-0.270549,-0.36338,-0.075093,-0.169111,0.375848,0.032971,0.146953,-0.208536,-0.333172,1.0,0.420163,-0.011157,-0.110345,0.293184


In [79]:
selector = SelectKBest(score_func=f_regression, k=6)
X_new = selector.fit_transform(X, y)

In [80]:
selected_features = X.columns[selector.get_support()]
selected_features

Index(['ALB', 'AST', 'BIL', 'CHE', 'CHOL', 'GGT'], dtype='object')

# Normalizing the data

In [90]:
from sklearn.preprocessing import StandardScaler

In [91]:
data_frame = pd.read_csv('HepatitisCdata.csv')
data_frame.head()

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [92]:
data_frame.Sex = data_frame.Sex.map({'m':0,'f':1})
data_frame.Category = data_frame.Category.map({'0=Blood Donor':0, '0s=suspect Blood Donor':1, 
                                               '1=Hepatitis':1, '2=Fibrosis':2, '3=Cirrhosis':3})

In [93]:
X = data_frame[['ALB', 'AST', 'BIL', 'CHE', 'CHOL', 'GGT']]
y = data_frame['Category']

In [94]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [95]:
normalized_df = pd.DataFrame(X, columns=['ALB', 'AST', 'BIL', 'CHE', 'CHOL', 'GGT'])
normalized_df['target_variable'] = y

In [96]:
normalized_df.isna().sum()

ALB                 1
AST                 0
BIL                 0
CHE                 0
CHOL               10
GGT                 0
target_variable     0
dtype: int64

In [97]:
normalized_df.dropna(inplace=True)

In [99]:
normalized_df.to_csv('normalized_data.csv', index=False)