In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read hcvdat0.csv, using its first column as the row index, then preview
# the first rows.
data = pd.read_csv("hcvdat0.csv", index_col=[0])
data.head()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
2,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
3,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
4,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
5,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


In [3]:
# Count the missing values in each column of the raw table.
data.isna().sum()

Category     0
Age          0
Sex          0
ALB          1
ALP         18
ALT          1
AST          0
BIL          0
CHE          0
CHOL        10
CREA         0
GGT          0
PROT         1
dtype: int64

In [4]:
# Split the table into the feature columns (X) and the label column (y).
X = data.drop(columns=["Category"])
y = data["Category"]

In [5]:
# Group the feature columns by dtype family rather than by exact dtype name,
# so every numeric width (e.g. int32/float32) and pandas "category" columns
# are classified as well. Column order follows X.columns.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = X.select_dtypes(include="number").columns.tolist()

In [6]:
# NOTE(review): this encoder instance is not used by the cells visible here;
# it is kept so that any other cell referencing the name keeps working.
one_hot_encoder = OneHotEncoder()

# One-hot encode every categorical feature detected earlier and replace the
# original columns with their dummy columns, instead of hard-coding "Sex".
# Filtering on the current columns makes the cell a no-op when it is re-run
# after the originals have already been replaced.
cols_to_encode = [cname for cname in categorical_cols if cname in X.columns]
if cols_to_encode:
    data_split_catagories = pd.get_dummies(X[cols_to_encode])
    X = pd.concat([X.drop(columns=cols_to_encode), data_split_catagories], axis=1)

In [7]:
# Impute missing values with IterativeImputer, then standardise every column.
numerical_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=0)),
    ('scale', StandardScaler()),
])

# fit_transform returns a plain array by default, so re-attach both the column
# names and the original row labels. Without `index=X.index` the new frame gets
# a fresh 0..n-1 index and no longer lines up with `y` (which keeps the labels
# read from the CSV) when the two are joined or compared by label.
# NOTE(review): the transformer is fitted on all rows; if a train/test split
# is intended afterwards, fit on the training rows only to avoid leakage.
imputed_data = pd.DataFrame(
    numerical_transformer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [11]:
# Preview the first five rows of the imputed and scaled features.
imputed_data.head(5)

Unnamed: 0,Age,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,Sex_f,Sex_m
0,-1.533616,-0.541157,-0.630399,-0.815916,-0.383693,-0.198236,-0.574734,-1.896684,0.49707,-0.502286,-0.561267,-0.794544,0.794544
1,-1.533616,-0.541157,0.054676,-0.410857,-0.305057,-0.381375,1.349161,-0.504285,-0.14659,-0.438203,0.827068,-0.794544,0.794544
2,-1.533616,0.9142,0.22402,0.304879,0.538767,-0.269457,0.291926,-0.149534,0.094783,-0.115957,1.34538,-0.794544,0.794544
3,-1.533616,0.27315,-0.649642,0.084652,-0.368571,0.381706,-0.393234,-0.557498,-0.025903,-0.104971,0.678979,-0.794544,0.794544
4,-1.533616,-0.419877,0.200927,0.163305,-0.302033,-0.091404,0.432588,-0.929987,-0.106361,-0.176378,-0.6168,-0.794544,0.794544


In [9]:
# Count the missing values left in each column after imputation.
imputed_data.isna().sum()

Age      0
ALB      0
ALP      0
ALT      0
AST      0
BIL      0
CHE      0
CHOL     0
CREA     0
GGT      0
PROT     0
Sex_f    0
Sex_m    0
dtype: int64