## Libraries & Environment Setup

In [1]:
!pip -q install lazypredict==0.2.13

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyClassifier

## Data Loading

In [3]:
data_disease=pd.read_csv("../data/ckd_dataset.csv")
data_disease.head()

Mounted at /content/drive/


Unnamed: 0,Bp,Sg,Al,Su,Rbc,Bu,Sc,Sod,Pot,Hemo,Wbcc,Rbcc,Htn,Class
0,80.0,1.02,1.0,0.0,1.0,36.0,1.2,137.53,4.63,15.4,7800.0,5.2,1.0,1
1,50.0,1.02,4.0,0.0,1.0,18.0,0.8,137.53,4.63,11.3,6000.0,4.71,0.0,1
2,80.0,1.01,2.0,3.0,1.0,53.0,1.8,137.53,4.63,9.6,7500.0,4.71,0.0,1
3,70.0,1.0,4.0,0.0,1.0,56.0,3.8,111.0,2.5,11.2,6700.0,3.9,1.0,1
4,80.0,1.01,2.0,0.0,1.0,26.0,1.4,137.53,4.63,11.6,7300.0,4.6,0.0,1


## Feature Selection

Based on the exploratory data analysis, the following variables showed stronger relationships with the target variable:

- Hypertension
- Specific Gravity
- Red Blood Cell Count
- Hemoglobin
- Albumin

These were selected for initial modeling.

In [4]:
X= data_disease[['Htn', 'Sg', 'Rbcc', 'Hemo', 'Al']]

In [5]:
Y= data_disease['Class']

## Handling Class Imbalance

Since the dataset presents class imbalance, NearMiss undersampling is applied to balance the training data and evaluate its impact on model performance.


In [6]:
us=NearMiss(sampling_strategy='auto', n_neighbors=3, version=2)

# Case 1: First, the data is separated into training and testing data, then undersampling is performed

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=.2,random_state =42,stratify=Y)

#Undersampling
X_train_res,Y_train_res =us.fit_resample(X_train,Y_train)
X_test_res, Y_test_res= us.fit_resample(X_test,Y_test)
print('Para entrenamiento:')
print('Antea de la depuración:{}'.format(Counter(Y_train)))
print('Después de la depuración:{}'.format(Counter(Y_train_res)))
print('Antea de la depuración:{}'.format(Counter(X_train)))
print('Después de la depuración:{}'.format(Counter(X_train_res)))

print('Para evaluación:')
print('Antea de la depuración:{}'.format(Counter(Y_test)))
print('Después de la depuración:{}'.format(Counter(Y_test_res)))
print('Antea de la depuración:{}'.format(Counter(X_test)))
print('Después de la depuración:{}'.format(Counter(X_test_res)))

Para entrenamiento:
Antea de la depuración:Counter({1: 200, 0: 120})
Después de la depuración:Counter({0: 120, 1: 120})
Antea de la depuración:Counter({'Htn': 1, 'Sg': 1, 'Rbcc': 1, 'Hemo': 1, 'Al': 1})
Después de la depuración:Counter({'Htn': 1, 'Sg': 1, 'Rbcc': 1, 'Hemo': 1, 'Al': 1})
Para evaluación:
Antea de la depuración:Counter({1: 50, 0: 30})
Después de la depuración:Counter({0: 30, 1: 30})
Antea de la depuración:Counter({'Htn': 1, 'Sg': 1, 'Rbcc': 1, 'Hemo': 1, 'Al': 1})
Después de la depuración:Counter({'Htn': 1, 'Sg': 1, 'Rbcc': 1, 'Hemo': 1, 'Al': 1})


# Second Case: Undersampling is done first, and then the data is separated into testing and training.

In [8]:
Xu, Yu =us.fit_resample(X,Y)
print('Antea de la depuración:{}'.format(Counter(Y)))
print('Después de la depuración:{}'.format(Counter(Yu)))
print('Antea de la depuración:{}'.format(Counter(X)))
print('Después de la depuración:{}'.format(Counter(Xu)))

Antea de la depuración:Counter({1: 250, 0: 150})
Después de la depuración:Counter({0: 150, 1: 150})
Antea de la depuración:Counter({'Htn': 1, 'Sg': 1, 'Rbcc': 1, 'Hemo': 1, 'Al': 1})
Después de la depuración:Counter({'Htn': 1, 'Sg': 1, 'Rbcc': 1, 'Hemo': 1, 'Al': 1})


In [9]:
X_train_ru, X_test_ru, Y_train_ru, Y_test_ru = train_test_split(Xu, Yu, test_size=0.2, random_state=42, stratify=Yu)

In [10]:
print('Se usarán de la siguiente manera los datos')
print('Para entrenamiento:{}'.format(Counter(Y_train_ru)))
print('Para testeo:{}'.format(Counter(Y_test_ru)))

Se usarán de la siguiente manera los datos
Para entrenamiento:Counter({1: 120, 0: 120})
Para testeo:Counter({0: 30, 1: 30})


# Lazy Classifier para ambos modelos:

In [11]:
#Primero modelo: primero se divide los datos y se hace undersampling_
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train_res, X_test_res, Y_train_res, Y_test_res)

print(models)

100%|██████████| 32/32 [00:01<00:00, 19.03it/s]

[LightGBM] [Info] Number of positive: 120, number of negative: 120
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 90
[LightGBM] [Info] Number of data points in the train set: 240, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 0.98               0.98     0.98      0.98   
BaggingClassifier                  0.98               0.98     0.98      0.98   
DecisionTreeClassifier             0.98               0.98     0.98      0.98   
LabelPropagation                   0.98               0.98     0.98      0.98   
ExtraTreesClassifier               0.98               0.98     0.98      0.98   
XGB




In [12]:
#Segundo modelo: primero se hace undersampling y luego se dividen los datos
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models_2,predictions_2 = clf.fit(X_train_ru,X_test_ru, Y_train_ru, Y_test_ru)

print(models_2)

100%|██████████| 32/32 [00:01<00:00, 31.46it/s]

[LightGBM] [Info] Number of positive: 120, number of negative: 120
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 240, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
NuSVC                              0.98               0.98     0.98      0.98   
LinearSVC                          0.98               0.98     0.98      0.98   
LogisticRegression                 0.98               0.98     0.98      0.98   
SVC                                0.98               0.98     0.98      0.98   
SGDClassifier                      0.98               0.98     0.98      0.98   
Ext




In [13]:
#Con los datos originales, divididos:
models_3,predictions_3 = clf.fit(X_train, X_test, Y_train, Y_test )
print(models_3)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 32/32 [00:01<00:00, 23.21it/s]

[LightGBM] [Info] Number of positive: 200, number of negative: 120
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 320, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.625000 -> initscore=0.510826
[LightGBM] [Info] Start training from score 0.510826
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 0.99               0.99     0.99      0.99   
ExtraTreesClassifier               0.99               0.99     0.99      0.99   
LabelPropagation                   0.99               0.99     0.99      0.99   
RandomForestClassifier             0.99               0.99     0.99      0.99   
LGBMClassifier                




# Algorithm Optimization Test

**Bernulli**

Primarily used for binary classification problems, where each feature is a Boolean variable (0 or 1), it may not be the best choice for all data types, especially if the features do not follow a Bernoulli distribution or if there are significant dependencies between the features.

In [14]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X_train_res, Y_train_res)
predictions = bernoulli_nb.predict(X_test_res)

# Evaluar la precisión del modelo
accuracy = accuracy_score(Y_test_res, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

print("Classification Report:")
print(classification_report(Y_test_res, predictions))

Accuracy: 91.67%
Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92        30
           1       1.00      0.83      0.91        30

    accuracy                           0.92        60
   macro avg       0.93      0.92      0.92        60
weighted avg       0.93      0.92      0.92        60



**Ridge Classifier**

Ridge regression is a regularization technique that adds a penalty to the model coefficients to prevent overfitting. Here are some key features of RidgeClassifier:

- Hyperparameter Tuning: You can adjust the regularization hyperparameter alpha to control the balance between fitting the training data and avoiding overfitting. It is viewed on a logarithmic scale (0.1, 1.0, 10.0).
The default alpha value in RidgeClassifier is 1.0.

Effect of Alpha:

- A smaller alpha allows for larger model coefficients, which can result in a model that fits the training data better.

- A larger alpha imposes a stronger penalty on the coefficients, which can help prevent overfitting.

In [17]:
from sklearn.linear_model import RidgeClassifierCV
ridge_classifier = RidgeClassifierCV(alphas=[0.1, 1.0, 10.0], cv=3)  # Puedes ajustar los valores de alpha según sea necesario
ridge_classifier.fit(X_train_res, Y_train_res)

# Obtener el mejor valor de alpha encontrado durante la búsqueda
best_alpha = ridge_classifier.alpha_
print("Mejor valor de alpha:", best_alpha)

Mejor valor de alpha: 0.1


In [16]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

ridge_classifier = RidgeClassifier(alpha=0.1)  # Puedes ajustar el valor de alpha según sea necesario
ridge_classifier.fit(X_train_res, Y_train_res)
predictions = ridge_classifier.predict(X_test_res)

# Evaluar la precisión del modelo
accuracy = accuracy_score(Y_test_res, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 90.00%


**DummyClassifier:**

`strategy="uniform"` indicates that the classifier will generate random predictions.

This code uses the `most_frequent` strategy, which always predicts the most frequent class in the training set.

In [18]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

dummy_classifier = DummyClassifier()
dummy_classifier.fit(X_train_res, Y_train_res)
predictions_random = dummy_classifier.predict(X_test_res)

# Evaluar la precisión del modelo
accuracy_random = accuracy_score(Y_test_res, predictions_random)
print("Accuracy con estrategia aleatoria: {:.2f}%".format(accuracy_random * 100))

Accuracy con estrategia aleatoria: 50.00%


In [19]:
from sklearn.model_selection import GridSearchCV
# Definición conjuntp de hiperparámetros que se van a probar
parametro_dummy = {'strategy': ["stratified", "most_frequent", "uniform", "constant"]}


grid_search_dummy = GridSearchCV(dummy_classifier, parametro_dummy, cv=5, scoring='accuracy') #cv=5 significa 5-fold cross-validation
grid_search_dummy.fit(X_train_res, Y_train_res)

print("Mejores hiperparámetros:", grid_search_dummy.best_params_)

# Predicción para el X test
y_pred_dummy= grid_search_dummy.predict(X_test_res)
accuracy_dummy=accuracy_score(Y_test_res, y_pred_dummy)
print(f"Precisión del modelo: {accuracy_dummy}")

Mejores hiperparámetros: {'strategy': 'stratified'}
Precisión del modelo: 0.48333333333333334


In [20]:
# Estrategia de predecir la clase mayoritaria
dummy_classifier_majority = DummyClassifier(strategy="most_frequent")
dummy_classifier_majority.fit(X_train_res, Y_train_res)
predictions_majority = dummy_classifier_majority.predict(X_test_res)
accuracy_majority = accuracy_score(Y_test_res, predictions_majority)
print("Accuracy con estrategia de clase mayoritaria: {:.2f}%".format(accuracy_majority * 100))


Accuracy con estrategia de clase mayoritaria: 50.00%


In [21]:
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

ridge_classifier_cv = RidgeClassifierCV(alphas=np.arange(0.1, 10, 0.1), cv=5)
ridge_classifier_cv.fit(X_train_res, Y_train_res)
y_pred = ridge_classifier_cv.predict(X_test_res)

# Evaluar la precisión del modelo
accuracy = accuracy_score(Y_test_res, y_pred)
print("Precisión del modelo: {:.2f}%".format(accuracy * 100))
print("Classification Report:")
print(classification_report(Y_test_res, y_pred))

# Optimizar hiperparámetros si es necesario
param_grid = {'alphas': [np.arange(0.1, 1, 0.1), np.arange(1, 10, 1)]}
grid_search = GridSearchCV(RidgeClassifierCV(cv=5), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_res, Y_train_res)
print("Mejores hiperparámetros:", grid_search.best_params_)

# Realizar predicciones con el modelo optimizado
y_pred_optimized = grid_search.predict(X_test_res)

# Evaluar la precisión del modelo optimizado
accuracy_optimized = accuracy_score(Y_test_res, y_pred_optimized)
print("Precisión del modelo optimizado: {:.2f}%".format(accuracy_optimized * 100))


Precisión del modelo: 90.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.97      0.91        30
           1       0.96      0.83      0.89        30

    accuracy                           0.90        60
   macro avg       0.91      0.90      0.90        60
weighted avg       0.91      0.90      0.90        60

Mejores hiperparámetros: {'alphas': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])}
Precisión del modelo optimizado: 90.00%


**Linear Discriminant Analysis**
The Linear Discriminant Analysis (LDA) algorithm is a classification method that seeks to find the directions (linear discriminants) that maximize the separation between classes in the data.

- solver (default='svd'): Specifies the algorithm to use for optimization. Options include 'svd' for singular value decomposition, 'lsqr' for least squares, and 'eigen' for eigenvalue decomposition. 'eigen' is only valid if shrinkage is set to 'auto'.

- shrinkage (default=None): Controls the application of Shrinkage regularization. It can be None (no regularization), 'auto' (uses a heuristic to automatically choose between 'lsqr' or 'eigen'), or a value between 0 and 1 that controls the amount of regularization.

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

lda = LinearDiscriminantAnalysis()
lda.fit(X_train_res, Y_train_res)
y_pred = lda.predict(X_test_res)

# Evaluar la precisión del modelo
accuracy = accuracy_score(Y_test_res, y_pred)
print("Precisión del modelo: {:.2f}%".format(accuracy * 100))
print("Classification Report:")
print(classification_report(Y_test_res, y_pred))

# Optimizar hiperparámetros si es necesario
param_grid = {'solver': ['svd', 'lsqr', 'eigen']}
grid_search = GridSearchCV(LinearDiscriminantAnalysis(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_res, Y_train_res)
print("Mejores hiperparámetros:", grid_search.best_params_)

# Realizar predicciones con el modelo optimizado
y_pred_optimized = grid_search.predict(X_test_res)

# Evaluar la precisión del modelo optimizado
accuracy_optimized = accuracy_score(Y_test_res, y_pred_optimized)
print("Precisión del modelo optimizado: {:.2f}%".format(accuracy_optimized * 100))


Precisión del modelo: 95.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95        30
           1       0.97      0.93      0.95        30

    accuracy                           0.95        60
   macro avg       0.95      0.95      0.95        60
weighted avg       0.95      0.95      0.95        60

Mejores hiperparámetros: {'solver': 'svd'}
Precisión del modelo optimizado: 95.00%


**Nearest Centroid**
The Nearest Centroid algorithm is a classifier that assigns samples to the class whose centroid (mean) is closest.
shrink_threshold-->controls the minimum magnitude of the inverse of the square of the Euclidean distance that centroids must reach before regularization is performed.
- shrink_threshold: float or None, default=None
If None, no regularization is applied.
If a float value, regularization will be applied for those centroids whose squared distances are smaller than shrink_threshold.
Higher values of shrink_threshold lead to greater regularization.

- The metric parameter allows you to specify the metric used to calculate distances between points. Some common options include:
‘euclidean’: The standard Euclidean distance.
‘manhattan’ or ‘l1’: The Manhattan distance.
‘chebyshev’ or ‘linf’: The Chebyshev distance (maximum difference across all dimensions).
‘cosine’: The cosine similarity.

In [24]:
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

nearest_centroid = NearestCentroid(metric='euclidean')
nearest_centroid.fit(X_train_res, Y_train_res)
y_pred = nearest_centroid.predict(X_test_res)

# Evaluar la precisión del modelo
accuracy = accuracy_score(Y_test_res, y_pred)
print("Precisión del modelo: {:.2f}%".format(accuracy * 100))
print("Classification Report:")
print(classification_report(Y_test_res, y_pred))

# Optimizar hiperparámetros si es necesario
param_grid = {'shrink_threshold': [None, 0.1, 0.5, 1.0]}
grid_search = GridSearchCV(NearestCentroid(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_res, Y_train_res)
print("Mejores hiperparámetros:", grid_search.best_params_)

# Realizar predicciones con el modelo optimizado
y_pred_optimized = grid_search.predict(X_test_res)

# Evaluar la precisión del modelo optimizado
accuracy_optimized = accuracy_score(Y_test_res, y_pred_optimized)
print("Precisión del modelo optimizado: {:.2f}%".format(accuracy_optimized * 100))

Precisión del modelo: 86.67%
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        30
           1       0.89      0.83      0.86        30

    accuracy                           0.87        60
   macro avg       0.87      0.87      0.87        60
weighted avg       0.87      0.87      0.87        60

Mejores hiperparámetros: {'shrink_threshold': None}
Precisión del modelo optimizado: 86.67%
