## Modules and libraries

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## Data

In [2]:
# Load the ILPD liver-patient CSV. Column names are supplied explicitly via
# `names`, so pandas treats every line of the file as data.
ilpd_columns = [
    'age', 'gender', 't_bilirubin', 'd_bilirubin', 'alkphos', 'sgpt',
    'sgot', 't_protein', 'albumin', 'a_g_ratio', 'result',
]
ilpd_csv_path = "/content/drive/MyDrive/0_Hackathon_SBSSU/liver cd/ilpd.csv"
df = pd.read_csv(ilpd_csv_path, names=ilpd_columns)

In [3]:
# Peek at 5 randomly chosen rows (a random sample, not the first rows).
# random_state pins the draw so re-running the notebook shows the same rows.
df.sample(n=5, random_state=42)

Unnamed: 0,age,gender,t_bilirubin,d_bilirubin,alkphos,sgpt,sgot,t_protein,albumin,a_g_ratio,result
209,45,Female,0.9,0.3,189,23,33,6.6,3.9,,1
419,55,Female,8.2,3.9,1350,52,65,6.7,2.9,0.7,1
232,50,Male,2.7,1.6,157,149,156,7.9,3.1,0.6,1
514,49,Male,1.0,0.3,230,48,58,8.4,4.2,1.0,1
253,35,Female,0.6,0.2,180,12,15,5.2,2.7,,2


In [5]:
# Smallest value recorded in the 'sgot' column
df['sgot'].min()

10

**In the data sample above:**
*   `result = 1` means the patient has liver disease
*   `result = 2` means the patient does NOT have liver disease




In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          583 non-null    int64  
 1   gender       583 non-null    object 
 2   t_bilirubin  583 non-null    float64
 3   d_bilirubin  583 non-null    float64
 4   alkphos      583 non-null    int64  
 5   sgpt         583 non-null    int64  
 6   sgot         583 non-null    int64  
 7   t_protein    583 non-null    float64
 8   albumin      583 non-null    float64
 9   a_g_ratio    579 non-null    float64
 10  result       583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


# Handling Duplicate Values

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

13

In [None]:
# Keep only the first occurrence of each fully duplicated row
df = df.drop_duplicates()

In [None]:
# (rows, columns) of df following the drop_duplicates step above
df.shape

(570, 11)

# Handling Missing Values & Categorical Values with Column transformer

In [None]:
# Count of missing entries in each column
df.isna().sum()

age            0
gender         0
t_bilirubin    0
d_bilirubin    0
alkphos        0
sgpt           0
sgot           0
t_protein      0
albumin        0
a_g_ratio      4
result         0
dtype: int64

There are **4** null values in **a_g_ratio** among the 570 data points that remain after dropping duplicates (583 originally).

In [None]:
# Impute missing 'a_g_ratio' values with the column median and one-hot encode
# 'gender' in a single ColumnTransformer pass; every other column is passed
# through unchanged.
imputed_cols = ['a_g_ratio']
encoded_cols = ['gender']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new keyword first and fall back to the
# old one so the cell runs on both older and current installations.
try:
    gender_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    gender_encoder = OneHotEncoder(drop='first', sparse=False)

transformations = [('simple imputation', SimpleImputer(strategy='median'), imputed_cols),
                   ('one_hot_encoding', gender_encoder, encoded_cols)]

transformer = ColumnTransformer(transformations, remainder='passthrough')

# ColumnTransformer emits the transformed columns first (in the order listed
# above) followed by the passthrough columns in their original order, so the
# header is derived from df rather than hard-coded.
# Assumes 'gender' has exactly two categories, so drop='first' leaves a single
# indicator column that can keep the name 'gender'.
passthrough_cols = [col for col in df.columns
                    if col not in imputed_cols + encoded_cols]

new_df = pd.DataFrame(transformer.fit_transform(df),
                      columns=imputed_cols + encoded_cols + passthrough_cols)

new_df.head()

Unnamed: 0,a_g_ratio,gender,age,t_bilirubin,d_bilirubin,alkphos,sgpt,sgot,t_protein,albumin,result
0,0.9,0.0,65.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,1.0
1,0.74,1.0,62.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,1.0
2,0.89,1.0,62.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,1.0
3,1.0,1.0,58.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0
4,0.4,1.0,72.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,1.0


**gender = 0.0 is for female**

**gender = 1.0 is for male**

In [None]:
# Cast the 'gender' and 'result' columns of the transformed frame to int64
int_cols = ['gender', 'result']
new_df = new_df.astype(dict.fromkeys(int_cols, 'int64'))

# To CSV

After dropping duplicate rows and imputing the missing values, **570** of the original 583 observations remain (only the duplicate removal reduced the row count).

---


In [None]:
# Write the cleaned frame to a CSV file (relative path), leaving out the index column
output_csv = 'ilpd_md1.csv'
new_df.to_csv(output_csv, index=False)

# Summary

1.   Handled Duplicates (Dropped)
2.   Handled Missing Values (SimpleImputer() with median)
3.   Handled Categorical Data (OHE)



