## 1. Load the Dataset

In [2]:
# Import the pandas library (for data structures and data analysis) and alias it as "pd" for convenience
import pandas as pd
import numpy as np
from Univariate import Univariate

In [3]:
 # Load the CSV file "Placement.csv" into a DataFrame named `dataset` using pandas' read_csv function
dataset=pd.read_csv("kidney_disease.csv")

In [5]:
print("Shape of dataset:", dataset.shape)

Shape of dataset: (400, 26)


In [6]:
# Show first 5 rows
print("\nFirst 5 rows:")
print(dataset.head())


First 5 rows:
   id   age    bp     sg   al   su     rbc        pc         pcc          ba  \
0   0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent  notpresent   
1   1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent  notpresent   
2   2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent  notpresent   
3   3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present  notpresent   
4   4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent  notpresent   

   ...  pcv    wc   rc  htn   dm  cad appet   pe  ane classification  
0  ...   44  7800  5.2  yes  yes   no  good   no   no            ckd  
1  ...   38  6000  NaN   no   no   no  good   no   no            ckd  
2  ...   31  7500  NaN   no  yes   no  poor   no  yes            ckd  
3  ...   32  6700  3.9  yes   no   no  poor  yes  yes            ckd  
4  ...   35  7300  4.6   no   no   no  good   no   no            ckd  

[5 rows x 26 columns]


In [7]:
# Show column names
print("\nColumns:")
print(dataset.columns.tolist())


Columns:
['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']


In [8]:
# Show info (types + missing values)
print("\nInfo:")
print(dataset.info())


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null

In [10]:
# 👉 Print missing values per column
print("\nMissing values per column:")
print(dataset.isnull().sum())


Missing values per column:
id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64


## 2. Replace missing values with Mean / Median / Mode

In [13]:
# Replace missing numeric values with mean
df_mean = dataset.fillna(dataset.mean(numeric_only=True))

# Replace missing numeric values with median
df_median = dataset.fillna(dataset.median(numeric_only=True))

# Replace missing values (both numeric & categorical) with mode
df_mode = dataset.fillna(dataset.mode().iloc[0])

print("Done: Mean, Median, Mode replacement")

Done: Mean, Median, Mode replacement


In [15]:
# 👉 Replace missing numeric values with Mean
df_mean = dataset.fillna(dataset.mean(numeric_only=True))
print("✅ Missing values handled with MEAN")
display(df_mean.head())
print("\nRemaining missing values:")
display(df_mean.isnull().sum())

✅ Missing values handled with MEAN


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd



Remaining missing values:


id                  0
age                 0
bp                  0
sg                  0
al                  0
su                  0
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                 0
bu                  0
sc                  0
sod                 0
pot                 0
hemo                0
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [16]:
# 👉 Replace missing numeric values with Median
df_median = dataset.fillna(dataset.median(numeric_only=True))
print("✅ Missing values handled with MEDIAN")
display(df_median.head())
print("\nRemaining missing values:")
display(df_median.isnull().sum())

✅ Missing values handled with MEDIAN


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd



Remaining missing values:


id                  0
age                 0
bp                  0
sg                  0
al                  0
su                  0
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                 0
bu                  0
sc                  0
sod                 0
pot                 0
hemo                0
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [19]:
# 👉 Replace missing values (both numeric & categorical) with Mode
df_mode = dataset.fillna(dataset.mode().iloc[0])
print("✅ Missing values handled with MODE")
display(df_mode.head())
print("\nRemaining missing values:")
display(df_mode.isnull().sum())

✅ Missing values handled with MODE


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd



Remaining missing values:


id                0
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

## 3. Delete entire rows with missing values

In [21]:
df_drop = dataset.dropna()
print("Shape after row delete:", df_drop.shape)

Shape after row delete: (158, 26)


In [23]:
# 👉 Drop all rows that have any missing value
df_drop = dataset.dropna()

print("✅ Rows with missing values deleted")
print(f"Original shape: {dataset.shape}")
print(f"New shape after dropping: {df_drop.shape}")

# Show first 5 rows after dropping
display(df_drop.head())

# Check if any missing values remain
print("\n📌 Remaining missing values per column:")
display(df_drop.isnull().sum())


✅ Rows with missing values deleted
Original shape: (400, 26)
New shape after dropping: (158, 26)


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
9,9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,...,29,12100,3.7,yes,yes,no,poor,no,yes,ckd
11,11,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,...,32,4500,3.8,yes,yes,no,poor,yes,no,ckd
14,14,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,...,16,11000,2.6,yes,yes,yes,poor,yes,no,ckd
20,20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,...,24,9200,3.2,yes,yes,yes,poor,yes,yes,ckd



📌 Remaining missing values per column:


id                0
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

## 4. Problem-specific replacement (example: fill with 0 or "no")

In [24]:
df_problem = dataset.copy()

# Example: fill missing blood pressure (bp) with 0
df_problem["bp"] = df_problem["bp"].fillna(0)

# Example: fill missing categorical column (like htn) with "no"
if "htn" in df_problem.columns:
    df_problem["htn"] = df_problem["htn"].fillna("no")

print("Done: Problem-specific replacement")


Done: Problem-specific replacement


## 5. Predict missing values (simple model approach)

In [31]:
from sklearn.impute import KNNImputer
import pandas as pd

# If df is not already loaded, uncomment:
# df = pd.read_csv("/mnt/data/kidney_disease.csv")

# 1) Split numeric vs categorical
num_cols = dataset.select_dtypes(include=["number"]).columns.tolist()
cat_cols = [c for c in dataset.columns if c not in num_cols]

print("Numeric columns:", len(num_cols))
print("Categorical columns:", len(cat_cols))

# 2) KNN impute only numeric columns
imputer = KNNImputer(n_neighbors=5, weights="distance")
num_imputed = imputer.fit_transform(dataset[num_cols])

# 3) Put back into a DataFrame
df_num_imputed = pd.DataFrame(num_imputed, columns=num_cols, index=dataset.index)

# 4) Combine numeric (imputed) + categorical (as-is for now)
df_knn = pd.concat([df_num_imputed, dataset[cat_cols]], axis=1)

# 5) Fill remaining categorical NaNs with mode (most frequent)
for c in cat_cols:
    mode_val = df_knn[c].mode(dropna=True)
    if not mode_val.empty:
        df_knn[c] = df_knn[c].fillna(mode_val.iloc[0])

print("✅ KNN imputation complete.")
print("Shape:", df_knn.shape)

# 6) Quick check – all missing values now?
print("\n📌 Remaining missing values per column:")
print(df_knn.isnull().sum())


Numeric columns: 12
Categorical columns: 14
✅ KNN imputation complete.
Shape: (400, 26)

📌 Remaining missing values per column:
id                0
age               0
bp                0
sg                0
al                0
su                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
rbc               0
pc                0
pcc               0
ba                0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64
