# **Importing Data**
---

- **Reading `CSV` file**

In [1]:
import pandas as pd
import numpy as np

# Load the raw CKD table; the path is relative to the notebook's working directory.
csv_path = 'data/ckd-dataset-v2.csv'
df = pd.read_csv(csv_path)

# Preview the top of the file (the first two rows shown are not patient records).
df.head(n=20)

Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,...,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete,discrete
1,,,,,,,,,,,...,,,,,,,,,class,meta
2,0,0,1.019 - 1.021,1 - 1,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
3,0,0,1.009 - 1.011,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
4,0,0,1.009 - 1.011,≥ 4,ckd,1,< 0,1,0,1,...,0,0,0,1,0,0,127.281 - 152.446,s1,1,< 12
5,1,1,1.009 - 1.011,3 - 3,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,127.281 - 152.446,s1,1,< 12
6,0,0,1.015 - 1.017,< 0,ckd,0,< 0,0,0,0,...,0,1,0,1,1,0,127.281 - 152.446,s1,1,12 - 20
7,1,1,≥ 1.023,< 0,notckd,0,< 0,0,0,0,...,0,0,0,0,0,0,102.115 - 127.281,s1,0,12 - 20
8,0,0,1.019 - 1.021,3 - 3,ckd,0,< 0,0,0,0,...,1,1,0,0,0,0,177.612 - 202.778,s1,1,12 - 20
9,0,0,1.019 - 1.021,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,26.6175 - 51.7832,s4,1,12 - 20


- **Removing unnecessary rows**

In [2]:
# The first two rows of the raw file are not patient records: in the preview,
# row 0 holds 'discrete' in every column and row 1 holds only the markers
# 'class' / 'meta'.
#
# Select them by content instead of by position. A positional, in-place
# `df.drop([0, 1])` followed by `reset_index` removes two *real* records every
# time the cell is re-run; this version is a no-op on an already-cleaned frame.
# NOTE(review): assumes no genuine record contains the literal strings below.
METADATA_MARKERS = ["discrete", "meta"]

is_metadata_row = df.isin(METADATA_MARKERS).any(axis=1)
df = df.loc[~is_metadata_row].reset_index(drop=True)
df.head()

Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0,0,1.019 - 1.021,1 - 1,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
1,0,0,1.009 - 1.011,< 0,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,≥ 227.944,s1,1,< 12
2,0,0,1.009 - 1.011,≥ 4,ckd,1,< 0,1,0,1,...,0,0,0,1,0,0,127.281 - 152.446,s1,1,< 12
3,1,1,1.009 - 1.011,3 - 3,ckd,0,< 0,0,0,0,...,0,0,0,0,0,0,127.281 - 152.446,s1,1,< 12
4,0,0,1.015 - 1.017,< 0,ckd,0,< 0,0,0,0,...,0,1,0,1,1,0,127.281 - 152.446,s1,1,12 - 20


# **Data Preprocessing**
---

## **Handling `NULL` Values**

In [7]:
# Missing entries per column (`isnull` is the pandas alias of `isna`).
df.isnull().sum()

bp (Diastolic)    0
bp limit          0
sg                0
al                0
class             0
rbc               0
su                0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sod               0
sc                0
pot               0
hemo              0
pcv               0
rbcc              0
wbcc              0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
grf               0
stage             0
affected          0
age               0
dtype: int64

## **Handling duplicate values**

In [8]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

0

## **Ordinal Encoding**

In [6]:
# List the distinct raw labels of every column; these are the labels the
# ordinal mappings below have to cover.
for name, values in df.items():
    print(f'{name} -> {values.unique()}')
    print('--------------------------------------')

bp (Diastolic) -> ['0' '1']
--------------------------------------
bp limit -> ['0' '1' '2']
--------------------------------------
sg -> ['1.019 - 1.021' '1.009 - 1.011' '1.015 - 1.017' '≥ 1.023' '< 1.007']
--------------------------------------
al -> ['1 - 1' '< 0' '≥ 4' '3 - 3' '2 - 2']
--------------------------------------
class -> ['ckd' 'notckd']
--------------------------------------
rbc -> ['0' '1']
--------------------------------------
su -> ['< 0' '4 - 4' '2 - 2' '3 - 4' '1 - 2' '≥ 4']
--------------------------------------
pc -> ['0' '1']
--------------------------------------
pcc -> ['0' '1']
--------------------------------------
ba -> ['0' '1']
--------------------------------------
bgr -> ['< 112' '112 - 154' '154 - 196' '406 - 448' '238 - 280' '196 - 238'
 '≥ 448' '280 - 322' '364 - 406' '322 - 364']
--------------------------------------
bu -> ['< 48.1' '48.1 - 86.2' '200.5 - 238.6' '124.3 - 162.4' '86.2 - 124.3'
 '162.4 - 200.5' '≥ 352.9' '238.6 - 276.7']
----------

In [None]:
# Ordered labels for every column, listed from lowest to highest.
# A label's integer code is its position in the list.
_BINARY = ("0", "1")

_ordered_labels = {
    "bp (Diastolic)": _BINARY,
    "bp limit": ("0", "1", "2"),
    "sg": ("< 1.007", "1.009 - 1.011", "1.015 - 1.017", "1.019 - 1.021", "≥ 1.023"),
    "al": ("< 0", "1 - 1", "2 - 2", "3 - 3", "≥ 4"),
    "class": ("notckd", "ckd"),
    "rbc": _BINARY,
    "su": ("< 0", "1 - 2", "2 - 2", "3 - 4", "4 - 4", "≥ 4"),
    "pc": _BINARY,
    "pcc": _BINARY,
    "ba": _BINARY,
    "bgr": (
        "< 112", "112 - 154", "154 - 196", "196 - 238", "238 - 280",
        "280 - 322", "322 - 364", "364 - 406", "406 - 448", "≥ 448",
    ),
    "bu": (
        "< 48.1", "48.1 - 86.2", "86.2 - 124.3", "124.3 - 162.4",
        "162.4 - 200.5", "200.5 - 238.6", "238.6 - 276.7", "≥ 352.9",
    ),
    "sod": (
        "< 118", "118 - 123", "123 - 128", "128 - 133", "133 - 138",
        "138 - 143", "143 - 148", "148 - 153", "≥ 158",
    ),
    "sc": (
        "< 3.65", "3.65 - 6.8", "6.8 - 9.95", "9.95 - 13.1",
        "13.1 - 16.25", "16.25 - 19.4", "≥ 28.85",
    ),
    "pot": ("< 7.31", "7.31 - 11.72", "38.18 - 42.59", "≥ 42.59"),
    "hemo": (
        "< 6.1", "6.1 - 7.4", "7.4 - 8.7", "8.7 - 10", "10 - 11.3",
        "11.3 - 12.6", "12.6 - 13.9", "13.9 - 15.2", "15.2 - 16.5", "≥ 16.5",
    ),
    "pcv": (
        "< 17.9", "17.9 - 21.8", "21.8 - 25.7", "25.7 - 29.6", "29.6 - 33.5",
        "33.5 - 37.4", "37.4 - 41.3", "41.3 - 45.2", "45.2 - 49.1", "≥ 49.1",
    ),
    "rbcc": (
        "< 2.69", "2.69 - 3.28", "3.28 - 3.87", "3.87 - 4.46", "4.46 - 5.05",
        "5.05 - 5.64", "5.64 - 6.23", "6.23 - 6.82", "≥ 7.41",
    ),
    "wbcc": (
        "< 4980", "4980 - 7360", "7360 - 9740", "9740 - 12120", "12120 - 14500",
        "14500 - 16880", "16880 - 19260", "19260 - 21640", "≥ 24020",
    ),
    "htn": _BINARY,
    "dm": _BINARY,
    "cad": _BINARY,
    "appet": _BINARY,
    "pe": _BINARY,
    "ane": _BINARY,
    "grf": (
        "< 26.6175", "26.6175 - 51.7832", "51.7832 - 76.949", "76.949 - 102.115",
        "102.115 - 127.281", "127.281 - 152.446", "152.446 - 177.612", "177.612 - 202.778",
        "202.778 - 227.944", "≥ 227.944",
    ),
    "stage": ("s1", "s2", "s3", "s4", "s5"),
    "affected": _BINARY,
    "age": (
        "< 12", "12 - 20", "20 - 27", "27 - 35", "35 - 43", "43 - 51",
        "51 - 59", "59 - 66", "66 - 74", "≥ 74",
    ),
}

# Codes start at 0 for every column except `stage`, whose codes run s1 -> 1 ... s5 -> 5.
_first_code = {"stage": 1}

# {column: {raw label: integer code}}; each column gets its own dict object.
mappings = {
    column: {
        label: code
        for code, label in enumerate(labels, start=_first_code.get(column, 0))
    }
    for column, labels in _ordered_labels.items()
}

# Function to apply ordinal encoding
def encode_columns(df, mappings):
    """Replace the raw labels of each mapped column with their integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    mappings : dict
        ``{column name: {raw label: code}}``. Entries whose column is not
        present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every mapped column replaced by its codes.

    Warns
    -----
    UserWarning
        When a column holds non-null values that have no entry in its mapping.
        ``Series.map`` converts such values to NaN without any error, so the
        warning is the only signal that data was lost during encoding.
    """
    import warnings  # local import keeps the function usable from any cell

    for col, mapping in mappings.items():
        if col not in df.columns:
            continue

        raw = df[col]
        encoded = raw.map(mapping)

        # Values that were present before mapping but are NaN afterwards had
        # no code in `mapping`.
        lost = raw[raw.notna() & encoded.isna()].unique().tolist()
        if lost:
            warnings.warn(
                f"Column {col!r}: {len(lost)} distinct value(s) have no mapping "
                f"and were encoded as NaN: {lost}",
                UserWarning,
                stacklevel=2,
            )

        df[col] = encoded
    return df

# Encode every mapped column and rebind `df` to the returned frame.
# NOTE(review): the mappings are keyed by the raw string labels, so this cell
# is meant to run once on freshly loaded data.
df = encode_columns(df=df, mappings=mappings)
df.head(n=5)

Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0,0,3,1,1,0,0,0,0,0,...,0,0,0,0,0,0,9.0,1,1,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,9.0,1,1,0
2,0,0,1,4,1,1,0,1,0,1,...,0,0,0,1,0,0,5.0,1,1,0
3,1,1,1,3,1,0,0,0,0,0,...,0,0,0,0,0,0,5.0,1,1,0
4,0,0,2,0,1,0,0,0,0,0,...,0,1,0,1,1,0,5.0,1,1,1


- **Separating Numerical and Categorical Features**

In [11]:
# Columns treated as categorical flags / labels.
cat_cols = [
    "bp (Diastolic)", "bp limit", "class",
    "rbc", "pc", "pcc", "ba",
    "htn", "dm", "cad", "appet", "pe", "ane",
    "affected",
]

# Columns treated as numerical (binned measurements encoded as ordinal codes).
# NOTE(review): "sg", "al", "su" and "stage" are encoded above but appear in
# neither list — confirm whether leaving them out is intentional.
num_cols = [
    "bgr", "bu", "sod", "sc", "pot",
    "hemo", "pcv", "rbcc", "wbcc",
    "grf", "age",
]

## **Scaling Numerical Features**

In [12]:
from sklearn.preprocessing import StandardScaler

# Standardise the numerical columns and write the scaled values back into `df`.
# NOTE(review): this runs before the later `dropna()` cell, so rows with
# missing codes are still present when the scaler is fitted — confirm that
# ordering is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

df.head()

Unnamed: 0,bp (Diastolic),bp limit,sg,al,class,rbc,su,pc,pcc,ba,...,htn,dm,cad,appet,pe,ane,grf,stage,affected,age
0,0,0,3,1,1,0,0,0,0,0,...,0,0,0,0,0,0,3.038536,1,1,-2.781904
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,3.038536,1,1,-2.781904
2,0,0,1,4,1,1,0,1,0,1,...,0,0,0,1,0,0,1.284602,1,1,-2.781904
3,1,1,1,3,1,0,0,0,0,0,...,0,0,0,0,0,0,1.284602,1,1,-2.781904
4,0,0,2,0,1,0,0,0,0,0,...,0,1,0,1,1,0,1.284602,1,1,-2.304324


In [15]:
# Keep only rows without any missing value.
# NOTE(review): the earlier null check reported zero NaNs, so the rows removed
# here presumably hold labels that had no code in the ordinal mappings (the
# `grf` preview is float-typed after encoding) — confirm.
df = df.dropna(axis=0, how='any')

# **Feature Selection**
---

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Features to score and the label they are scored against
num_cols = [
    "bgr", "bu", "sod", "sc", "pot", "hemo",
    "pcv", "rbcc", "wbcc", "grf", "age",
]
target_col = "affected"

# `df` is expected to be encoded and scaled by the cells above
y = df[target_col]
X_num = df[num_cols]

# ANOVA F-test on every numerical feature; k='all' keeps all of them so the
# selector is used only to obtain the per-feature scores
num_selector = SelectKBest(score_func=f_classif, k='all').fit(X_num, y)

# One row per feature, highest F-score first
score_table = pd.DataFrame({'Feature': num_cols, 'Score': num_selector.scores_})
num_scores = score_table.sort_values(by='Score', ascending=False)

print("Top Numerical Features:\n", num_scores)

Top Numerical Features:
    Feature       Score
5     hemo  278.111645
6      pcv  188.777899
9      grf  134.579622
7     rbcc  114.685729
2      sod   64.106276
1       bu   43.344534
0      bgr   30.548803
3       sc   19.769726
8     wbcc   11.534945
10     age   11.293577
4      pot    1.472991


In [20]:
from functools import partial

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# One seed for every randomized scorer, so Restart & Run All reproduces the table
SEED = 42

# Define numerical columns
num_cols = ["bgr", "bu", "sod", "sc", "pot", "hemo", "pcv", "rbcc", "wbcc", "grf", "age"]
target_col = "affected"

# Extract numerical features and target
X_num = df[num_cols]
y = df[target_col]

### 🔹 1. ANOVA F-test
anova_selector = SelectKBest(score_func=f_classif, k='all')
anova_selector.fit(X_num, y)
anova_scores = pd.Series(anova_selector.scores_, index=num_cols)

### 🔹 2. Mutual Information
# mutual_info_classif is randomized unless `random_state` is set; without a
# seed this column changes between runs.
mi_selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=SEED), k='all')
mi_selector.fit(X_num, y)
mi_scores = pd.Series(mi_selector.scores_, index=num_cols)

### 🔹 3. Pearson Correlation
corr_scores = X_num.corrwith(y).abs()  # Use absolute values

### 🔹 4. Recursive Feature Elimination (RFE) with Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
rfe_selector = RFE(log_reg, n_features_to_select=5)  # Selecting top 5
rfe_selector.fit(X_num, y)
rfe_scores = pd.Series(rfe_selector.ranking_, index=num_cols)  # Lower rank = more important
rfe_scores = (1 / rfe_scores)  # Invert rankings so higher values mean more importance

### 🔹 5. Feature Importance from Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf_model.fit(X_num, y)
rf_scores = pd.Series(rf_model.feature_importances_, index=num_cols)

### 🔹 Combine Results into a DataFrame
feature_scores = pd.DataFrame({
    "ANOVA": anova_scores,
    "Mutual_Info": mi_scores,
    "Pearson_Corr": corr_scores,
    "RFE (LogReg)": rfe_scores,
    "Random_Forest": rf_scores
})


def _min_max(scores):
    """Rescale one method's scores to [0, 1]; a constant column becomes all zeros."""
    lowest = scores.min()
    span = scores.max() - lowest
    if span == 0:
        # Every feature scored the same: avoid a 0/0 division that would yield NaN
        return scores - lowest
    return (scores - lowest) / span


# Normalize each method's scores to [0, 1] so the methods are comparable
feature_scores = feature_scores.apply(_min_max)

# Mean of the normalized scores across all methods (higher = more important)
feature_scores["Mean_Score"] = feature_scores.mean(axis=1)

# Sort features by their mean normalized score, best first
feature_scores = feature_scores.sort_values(by="Mean_Score", ascending=False)

# Display final ranked feature importance table
feature_scores

Unnamed: 0,ANOVA,Mutual_Info,Pearson_Corr,RFE (LogReg),Random_Forest,Mean_Score
hemo,1.0,1.0,1.0,1.0,1.0,1.0
pcv,0.677074,0.935677,0.903441,1.0,0.880721,0.879383
rbcc,0.409244,0.716275,0.766552,1.0,0.466474,0.671709
grf,0.481157,0.619607,0.811462,0.222222,0.550154,0.536921
sod,0.226408,0.404346,0.602924,1.0,0.278446,0.502425
bgr,0.105104,0.344208,0.412783,1.0,0.20625,0.413669
sc,0.066139,0.329751,0.317918,0.416667,0.021007,0.230296
bu,0.151358,0.382884,0.4986,0.0,0.111861,0.228941
age,0.0355,0.144004,0.216075,0.066667,0.15393,0.123235
wbcc,0.036372,0.105298,0.21952,0.125,0.070632,0.111365
