## Data Preprocessing
In this notebook, we explain some pre-processing techniques:
- Removing and imputing missing values.
- Normalization and standardization of numerical data .
- Encoding non-numerical data. 

<div class="alert alert-info">Read the dataset.</div>

<div class="alert alert-danger">Exercise 1: Write a code snippet to read the  titanic dataset from local drive?</div>

In [66]:
import pandas as pd
file = 'hepatitis.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,1.0,200.0,4.0,,False,live,


<div class="alert alert-info">Removing the missing values.</div>

In [44]:
missing_percentage = (df.isna().sum() / len(df)) * 100
missing_percentage

age                 0.0
sex                 0.0
steroid             0.0
antivirals          0.0
fatigue             0.0
malaise             0.0
anorexia            0.0
liver_big           0.0
liver_firm          0.0
spleen_palpable     0.0
spiders             0.0
ascites             0.0
bilirubin          10.0
alk_phosphate      20.0
sgot               20.0
albumin            50.0
protime            50.0
histology           0.0
class              50.0
dtype: float64

<div class="alert alert-danger">Exercise 2: Which column has the highest number of missing values?</div>

In [45]:
df1=df.dropna(axis=1)
print(df.shape)
print(df1.shape)
df1.head()

(10, 19)
(10, 13)


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,histology
0,30,male,False,False,False,False,False,False,False,False,False,False,False
1,50,female,False,False,True,False,False,False,False,False,False,False,False
2,78,female,True,False,True,False,False,True,False,False,False,False,False
3,31,female,True,False,False,False,True,False,False,False,False,False,False
4,34,female,True,False,False,False,False,True,False,False,False,False,live


In [46]:
df1=df.dropna(axis=0)
print(df.shape)
print(df1.shape)

(10, 19)
(2, 19)


In [47]:
df1=df.copy()
df1.dropna(subset=['protime'],axis=0,inplace=True)
df1.shape

(5, 19)

In [48]:
df1=df.copy()
df1.dropna(thresh=0.5*len(df),axis=1,inplace=True)
df1.shape

(10, 19)

In [49]:
df2=df.copy()
df2.dropna(thresh=0.95*len(df2),axis=1,inplace=True)

In [50]:
df2.shape

(10, 13)

<div class="alert alert-info">Impute the missing values for numeric data.</div>

In [56]:
import numpy as np

# Replace False values with NaN
df1['protime'] = df1['protime'].replace(False, np.nan)

# Convert to numeric, forcing non-numeric values to NaN
protime_c = pd.to_numeric(df['protime'], errors='coerce')

# Drop NaN values
protime_c_clean = protime_c.dropna()

# Calculate the mean
mean_value = protime_c_clean.mean()

print(mean_value)

77.5


In [70]:
#Mean Imputation for numerical data
df1=df.copy()
df1['protime'] = df1['protime'].replace(False, np.nan)
df1['protime'] = pd.to_numeric(df1['protime'], errors='coerce')
df1['protime'] = df1['protime'].dropna()
missing_count = df1['protime'].isna().sum()# check the original number of missing values
print(missing_count)
mean_value = protime_c_clean.mean()
df1['protime'].fillna(mean_value, inplace=True)
missing_count = df1['protime'].isna().sum()# check the number of missing values after imputation
print(missing_count)
protime_c_clean

8
0


3    80.0
5    75.0
Name: protime, dtype: float64

In [69]:
df1=df.copy()
median_value = df1['protime'].median()
df1['protime'].fillna(median_value, inplace=True)
missing_count = df1['protime'].isna().sum()
print(missing_count)

TypeError: could not convert string to float: 'False'

In [71]:
median_value = df1['protime'].median()

<div class="alert alert-info">Impute the missing values for categorical data.</div>

In [72]:
#Mode Imputation for categorical data
mode_value = df['steroid'].mode()[0] # why [0]? check the notion
df['steroid'].fillna(mode_value, inplace=True)


<div class="alert alert-info">Normalization of numerical features</div>

In [73]:
# First replce the missing values with mean of the column 
mean_value = df['protime'].mean()
df1['protime'].fillna(mean_value, inplace=True)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [80]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df['protime'] = scaler.fit_transform(df[['protime']])

<div class="alert alert-danger">Exercise 4: Given the code below, the value "a" and "b" are equal to...?(use print to see the values)</div>

In [81]:
a=(df['protime'].min())
b=(df['protime'].max())

In [82]:
print(a)
print(b)

0.0
1.0


<div class="alert alert-info">Standardization of numerical features</div>

In [104]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df['protime'] = scaler.fit_transform(df[['protime']])
print(type(scaler.fit_transform(df[['protime']])))
# check the mean and standard deviation
print(round(df['protime'].mean(),2))
import numpy as np
std = np.std(df['protime'])
print(round(std,2))

<class 'numpy.ndarray'>
0.0
1.0


<div class="alert alert-info">Handling categorical data (creating dummy variables)</div>

In [86]:
dummy_df = pd.get_dummies(df, columns=['steroid'], drop_first=True)
dummy_df.head()
df.shape
dummy_df.shape

(155, 20)

<div class="alert alert-info">Handling categorical data (label encoding)</div>

In [88]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['sex'] = encoder.fit_transform(df['sex'])
df.head()

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,1,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,0,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,0,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,0,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,0,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live


In [102]:
# Sample ordinal data
Cancer_risk = ['Low', 'Medium', 'High', 'Low', 'High', 'Medium', 'Low']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the ordinal data using label encoding
encoded_data = label_encoder.fit_transform(Cancer_risk)

print("Cancer_risk:", Cancer_risk)
print("Cancer_risk:", encoded_data)

Cancer_risk: ['Low', 'Medium', 'High', 'Low', 'High', 'Medium', 'Low']
Cancer_risk: [1 2 0 1 0 2 1]
