In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import matplotlib as plt
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df4 = pd.read_csv('../df_cleaned.csv')

In [3]:
df4.drop(columns='Unnamed: 0',inplace=True)

In [4]:
df4

Unnamed: 0,index,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,0,51,1,1,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0.0
1,1,54,1,3,120.0,237.0,0.0,0.0,150.0,1.0,1.5,,,7.0,2.0
2,2,63,1,4,140.0,0.0,,2.0,149.0,0.0,2.0,1.0,,,2.0
3,3,52,0,2,140.0,,0.0,0.0,140.0,0.0,0.0,,,,0.0
4,4,55,1,4,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,179,54,1,4,200.0,198.0,0.0,0.0,142.0,1.0,2.0,2.0,,,
912,180,55,1,2,110.0,214.0,1.0,1.0,180.0,0.0,,,,,
913,181,67,1,3,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0,
914,182,59,1,1,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0,


In [5]:
df4.drop('index', axis=1, inplace=True)

In [6]:
df4.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,51,1,1,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0.0
1,54,1,3,120.0,237.0,0.0,0.0,150.0,1.0,1.5,,,7.0,2.0
2,63,1,4,140.0,0.0,,2.0,149.0,0.0,2.0,1.0,,,2.0
3,52,0,2,140.0,,0.0,0.0,140.0,0.0,0.0,,,,0.0
4,55,1,4,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3.0


In [7]:
media_thalach = df4['thalach'].mean()

# Rellenar los NaN con la media calculada
df4['thalach'].fillna(value=media_thalach, inplace=True)

# Mostrar los primeros registros para verificar el resultado
print(df4[['thalach']].head())

   thalach
0    125.0
1    150.0
2    149.0
3    140.0
4    111.0


In [8]:
# Suponiendo que 'df' es tu DataFrame y 'df' es tu DataFrame final
# Convertir '<NA>' a NaN
df4.replace('<NA>', np.nan, inplace=True)

# Obtener columnas numéricas y categóricas
numeric_cols4 = ['age', 'oldpeak']
categorical_cols4 = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'label']

# Imputación para variables numéricas (usando la mediana)
numeric_imputer3 = SimpleImputer(strategy='mean')
df4[numeric_cols4] = numeric_imputer3.fit_transform(df4[numeric_cols4])

# Imputación para variables categóricas (usando la moda)
categorical_imputer3 = SimpleImputer(strategy='most_frequent')
df4[categorical_cols4] = categorical_imputer3.fit_transform(df4[categorical_cols4])

# Verificar que ya no hay valores faltantes
print(df4.isnull().sum())

# El DataFrame 'df' ahora contiene los valores imputados


age           0
sex           0
cp            0
trestbps     59
chol         30
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal        483
label         0
dtype: int64


In [9]:
# Función para categorizar los valores de chol
def categorizar_chol(valor):
    if valor == 0 or pd.isnull(valor):  
        return 0
    elif valor < 200:
        return 1
    elif 200 <= valor <= 239:
        return 2
    else:  # valor >= 240
        return 3

# Aplicar la función al DataFrame
df4['chol'] = df4['chol'].apply(categorizar_chol)

# Mostrar los primeros registros para verificar el resultado
print(df4['chol'].head())


0    2
1    2
2    0
3    0
4    2
Name: chol, dtype: int64


In [10]:
def sust_thal(valor):
    if pd.isnull(valor):
        return 0
    else:
        return valor

df4['thal'] = df4['thal'].apply(sust_thal)
print(df4['thal'].head())

0    3.0
1    7.0
2    0.0
3    0.0
4    7.0
Name: thal, dtype: float64


In [11]:
def categorizar_trestbps(valor):
    if pd.isnull(valor):
        return 0
    elif valor < 120:
        return 1
    elif 120 <= valor <= 129:
        return 2
    else:  # valor >= 130
        return 3


df4['trestbps'] = df4['trestbps'].apply(categorizar_trestbps)

# Mostrar los primeros registros para verificar el resultado
print(df4['trestbps'].head())


0    2
1    2
2    3
3    3
4    3
Name: trestbps, dtype: int64


In [12]:
df4.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
label       0
dtype: int64

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Crear un objeto StandardScaler
scaler3 = MinMaxScaler()

# Lista de columnas a estandarizar
columns_to_scale3 = ['age','oldpeak', 'thalach']

# Ajustar el escalador solo a las columnas especificadas y transformarlas
df4[columns_to_scale3] = scaler3.fit_transform(df4[columns_to_scale3])


In [14]:
from sklearn.preprocessing import OrdinalEncoder

# Inicializar el codificador ordinal
ordinal_encoder = OrdinalEncoder()

# Variables para codificación ordinal
ordinal_vars = ["slope", "ca", "thal", "restecg"]

# Aplicar codificación ordinal
df4[ordinal_vars] = ordinal_encoder.fit_transform(df4[ordinal_vars])

df4.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,0.469388,1.0,1.0,2,2,0.0,2.0,0.457746,1.0,0.454545,0.0,1.0,1.0,0.0
1,0.530612,1.0,3.0,2,2,0.0,0.0,0.633803,1.0,0.465909,1.0,0.0,3.0,2.0
2,0.714286,1.0,4.0,3,0,0.0,2.0,0.626761,0.0,0.522727,0.0,0.0,0.0,2.0
3,0.489796,0.0,2.0,3,0,0.0,0.0,0.56338,0.0,0.295455,1.0,0.0,0.0,0.0
4,0.55102,1.0,4.0,3,2,0.0,0.0,0.359155,1.0,0.931818,2.0,0.0,3.0,3.0


In [15]:
# Aplicar codificación de conteo
for var in ["chol","trestbps", "cp"]:
    df4[var + '_count'] = df4[var].map(df4[var].value_counts())

df4.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label,chol_count,trestbps_count,cp_count
0,0.469388,1.0,1.0,2,2,0.0,2.0,0.457746,1.0,0.454545,0.0,1.0,1.0,0.0,230,211,44
1,0.530612,1.0,3.0,2,2,0.0,0.0,0.633803,1.0,0.465909,1.0,0.0,3.0,2.0,230,211,204
2,0.714286,1.0,4.0,3,0,0.0,2.0,0.626761,0.0,0.522727,0.0,0.0,0.0,2.0,201,485,495
3,0.489796,0.0,2.0,3,0,0.0,0.0,0.56338,0.0,0.295455,1.0,0.0,0.0,0.0,201,485,173
4,0.55102,1.0,4.0,3,2,0.0,0.0,0.359155,1.0,0.931818,2.0,0.0,3.0,3.0,230,485,495


In [16]:
df4

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label,chol_count,trestbps_count,cp_count
0,0.469388,1.0,1.0,2,2,0.0,2.0,0.457746,1.0,0.454545,0.0,1.0,1.0,0.0,230,211,44
1,0.530612,1.0,3.0,2,2,0.0,0.0,0.633803,1.0,0.465909,1.0,0.0,3.0,2.0,230,211,204
2,0.714286,1.0,4.0,3,0,0.0,2.0,0.626761,0.0,0.522727,0.0,0.0,0.0,2.0,201,485,495
3,0.489796,0.0,2.0,3,0,0.0,0.0,0.563380,0.0,0.295455,1.0,0.0,0.0,0.0,201,485,173
4,0.551020,1.0,4.0,3,2,0.0,0.0,0.359155,1.0,0.931818,2.0,0.0,3.0,3.0,230,485,495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,0.530612,1.0,4.0,3,1,0.0,0.0,0.577465,1.0,0.522727,1.0,0.0,0.0,0.0,128,485,495
912,0.551020,1.0,2.0,1,2,1.0,1.0,0.845070,0.0,0.394986,1.0,0.0,0.0,0.0,230,161,173
913,0.795918,1.0,3.0,3,2,0.0,2.0,0.633803,0.0,0.386364,1.0,0.0,3.0,0.0,230,485,204
914,0.632653,1.0,1.0,3,3,0.0,2.0,0.697183,0.0,0.318182,1.0,0.0,3.0,0.0,357,485,44


In [17]:
train_end4 = df4[0:732]
test_end4 = df4[(916-184):]

In [18]:
train_end4.isna().sum()

age               0
sex               0
cp                0
trestbps          0
chol              0
fbs               0
restecg           0
thalach           0
exang             0
oldpeak           0
slope             0
ca                0
thal              0
label             0
chol_count        0
trestbps_count    0
cp_count          0
dtype: int64

In [19]:
test_end4.isna().sum()

age               0
sex               0
cp                0
trestbps          0
chol              0
fbs               0
restecg           0
thalach           0
exang             0
oldpeak           0
slope             0
ca                0
thal              0
label             0
chol_count        0
trestbps_count    0
cp_count          0
dtype: int64

In [20]:
train_end4['label'].value_counts()

0.0    327
1.0    156
2.0    108
3.0    107
4.0     34
Name: label, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

X4 = train_end4.drop('label', axis=1)  # Todas las columnas excepto la columna objetivo
Y4 = train_end4['label']




In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Instanciar LDA
lda = LDA()

# Ajustar el modelo: Esto encuentra la dirección de proyección que maximiza la separación entre las clases
X_lda = lda.fit_transform(X4, Y4)  # Asegúrate de que 'X' son tus características y 'Y' tus etiquetas

# Dividimos el conjunto de entrenamiento en nuevo entrenamiento y validación (85% entrenamiento, 15% validación del conjunto de entrenamiento original)
X4_train, X4_val, Y4_train, Y4_val = train_test_split(X_lda, Y4, test_size=0.125, random_state=42)



In [23]:
# from catboost import CatBoostClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score

# # Define los hiperparámetros que deseas ajustar
# param_grid = {
#     'depth': [4, 6, 8, 10],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'iterations': [30, 50, 100],
#     'l2_leaf_reg': [1, 3, 5, 7, 9]
# }

# # Inicializa el modelo CatBoost
# cb_model = CatBoostClassifier(verbose=False, loss_function='MultiClass')

# # Inicializa GridSearchCV
# grid_search = GridSearchCV(estimator=cb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# # Ajusta GridSearchCV en los datos de entrenamiento
# grid_search.fit(X4_train, Y4_train)

# # Obtén los mejores hiperparámetros
# best_params = grid_search.best_params_
# print(f"Mejores hiperparámetros: {best_params}")

# # Entrena un nuevo modelo con los mejores hiperparámetros
# best_cb_model = CatBoostClassifier(**best_params)
# best_cb_model.fit(X4_train, Y4_train)

# # Evalúa el modelo
# predictions = best_cb_model.predict(X4_val)
# accuracy = accuracy_score(Y4_val, predictions)
# print(f"La precisión del modelo CatBoost en el conjunto de validación con los mejores hiperparámetros es: {accuracy}")


In [24]:
from catboost import CatBoostClassifier

# Definir los mejores hiperparámetros obtenidos de la búsqueda en cuadrícula
best_hyperparams = {'depth': 6, 'iterations': 30, 'l2_leaf_reg': 7, 'learning_rate': 0.01}

# Inicializar el modelo CatBoost con los mejores hiperparámetros
best_cb_model = CatBoostClassifier(**best_hyperparams)

# Entrenar el modelo con los datos de entrenamiento
best_cb_model.fit(X4_train, Y4_train, verbose=False)



<catboost.core.CatBoostClassifier at 0x22fa4c7d7d0>

In [25]:
from sklearn.metrics import accuracy_score

# Hacer predicciones en el conjunto de validación
Y4_val_pred = best_cb_model.predict(X4_val)

# Calcular la precisión
accuracy = accuracy_score(Y4_val, Y4_val_pred)

print(f"La precisión del modelo en el conjunto de validación es: {accuracy}")


La precisión del modelo en el conjunto de validación es: 0.5760869565217391


In [26]:
# Aplica la misma transformación LDA a los datos de prueba
test_end4_sin_label = test_end4.drop('label', axis=1)
X_test_lda = lda.transform(test_end4_sin_label)  # Transformar características del conjunto de prueb

# Ahora puedes hacer predicciones con el conjunto de datos preparado
predicciones4 = best_cb_model.predict(X_test_lda)

# Y también obtener las probabilidades de cada clase si lo necesitas
probabilidades4 = best_cb_model.predict_proba(X_test_lda)



In [27]:
# Copiar el DataFrame 'test_end' a 'predicted_df'
predicted_df4 = test_end4.copy()

# Actualizar la columna 'label' con las predicciones
predicted_df4['label'] = predicciones4

# Mostrar las primeras filas del nuevo DataFrame para verificar
print(predicted_df4.head())



          age  sex   cp  trestbps  chol  fbs  restecg   thalach  exang  \
732  0.591837  1.0  4.0         3     1  0.0      2.0  0.415493    1.0   
733  0.489796  1.0  2.0         3     1  0.0      0.0  0.739437    0.0   
734  0.408163  1.0  2.0         1     0  0.0      0.0  0.281690    0.0   
735  0.693878  1.0  4.0         1     0  0.0      0.0  0.478873    1.0   
736  0.469388  1.0  3.0         1     1  0.0      0.0  0.443662    0.0   

      oldpeak  slope   ca  thal  label  chol_count  trestbps_count  cp_count  
732  0.636364    2.0  0.0   0.0    2.0         128             485       495  
733  0.295455    1.0  0.0   0.0    0.0         128             485       173  
734  0.295455    1.0  0.0   0.0    1.0         201             161       173  
735  0.579545    2.0  0.0   0.0    2.0         201             161       495  
736  0.363636    0.0  0.0   1.0    0.0         128             161       204  


In [28]:
predicted_df4

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label,chol_count,trestbps_count,cp_count
732,0.591837,1.0,4.0,3,1,0.0,2.0,0.415493,1.0,0.636364,2.0,0.0,0.0,2.0,128,485,495
733,0.489796,1.0,2.0,3,1,0.0,0.0,0.739437,0.0,0.295455,1.0,0.0,0.0,0.0,128,485,173
734,0.408163,1.0,2.0,1,0,0.0,0.0,0.281690,0.0,0.295455,1.0,0.0,0.0,1.0,201,161,173
735,0.693878,1.0,4.0,1,0,0.0,0.0,0.478873,1.0,0.579545,2.0,0.0,0.0,2.0,201,161,495
736,0.469388,1.0,3.0,1,1,0.0,0.0,0.443662,0.0,0.363636,0.0,0.0,1.0,0.0,128,161,204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,0.530612,1.0,4.0,3,1,0.0,0.0,0.577465,1.0,0.522727,1.0,0.0,0.0,2.0,128,485,495
912,0.551020,1.0,2.0,1,2,1.0,1.0,0.845070,0.0,0.394986,1.0,0.0,0.0,0.0,230,161,173
913,0.795918,1.0,3.0,3,2,0.0,2.0,0.633803,0.0,0.386364,1.0,0.0,3.0,0.0,230,485,204
914,0.632653,1.0,1.0,3,3,0.0,2.0,0.697183,0.0,0.318182,1.0,0.0,3.0,0.0,357,485,44


In [29]:
predicted_df4['label'].value_counts()

0.0    90
1.0    51
2.0    33
3.0    10
Name: label, dtype: int64

In [30]:
# Restablecer el índice de predicted_df para que comience en 0 y luego sumar 1 para comenzar desde 1
predicted_df4.reset_index(drop=True, inplace=True)
predicted_df4['ID'] = predicted_df4.index 

# Seleccionar las columnas 'ID' y 'label' para el nuevo DataFrame df_randomf
df_randomf3 = predicted_df4[['ID', 'label']]

# Guardar df_randomf en un archivo CSV sin el índice de pandas
df_randomf3.to_csv('tryX_data.csv', index=False)


In [31]:
df55best = pd.read_csv('try56best_data.csv')
dfactualtry = pd.read_csv('tryX_data.csv')

In [32]:
difference = df55best['label'].compare(dfactualtry['label'])
difference = difference[difference['self']!= difference ['other']]

print(difference)

     self  other
2     0.0    1.0
3     1.0    2.0
7     0.0    1.0
11    2.0    1.0
14    1.0    0.0
18    0.0    2.0
19    3.0    2.0
21    1.0    2.0
22    1.0    0.0
27    3.0    2.0
29    0.0    1.0
30    0.0    1.0
47    0.0    3.0
49    0.0    1.0
51    1.0    2.0
59    0.0    1.0
61    1.0    3.0
69    0.0    1.0
70    1.0    3.0
73    3.0    2.0
83    0.0    1.0
88    0.0    1.0
90    3.0    2.0
95    3.0    2.0
98    0.0    1.0
104   3.0    2.0
106   2.0    0.0
110   3.0    2.0
112   0.0    1.0
113   0.0    3.0
114   0.0    2.0
115   0.0    1.0
117   1.0    0.0
119   0.0    3.0
126   2.0    1.0
127   0.0    3.0
135   2.0    1.0
136   1.0    0.0
138   2.0    3.0
148   1.0    2.0
153   1.0    3.0
170   0.0    1.0
173   0.0    2.0
178   0.0    1.0


In [33]:
total_diferencias = len(difference)

print(f"Total de entradas diferentes: {total_diferencias}")

Total de entradas diferentes: 44
