In [142]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [143]:
df = pd.read_csv('../df_cleaned.csv')

In [144]:
df.drop('index', axis=1, inplace=True)


In [145]:
df.replace({'<NA>':np.nan}, inplace=True)


In [146]:
df

Unnamed: 0.1,Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,0,51,1,1,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0.0
1,1,54,1,3,120.0,237.0,0.0,0.0,150.0,1.0,1.5,,,7.0,2.0
2,2,63,1,4,140.0,0.0,,2.0,149.0,0.0,2.0,1.0,,,2.0
3,3,52,0,2,140.0,,0.0,0.0,140.0,0.0,0.0,,,,0.0
4,4,55,1,4,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,911,54,1,4,200.0,198.0,0.0,0.0,142.0,1.0,2.0,2.0,,,
912,912,55,1,2,110.0,214.0,1.0,1.0,180.0,0.0,,,,,
913,913,67,1,3,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0,
914,914,59,1,1,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0,


In [147]:
df.dtypes


Unnamed: 0      int64
age             int64
sex             int64
cp              int64
trestbps      float64
chol          float64
fbs           float64
restecg       float64
thalach       float64
exang         float64
oldpeak       float64
slope         float64
ca            float64
thal          float64
label         float64
dtype: object

In [148]:
float_cols = ['trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
# Convertir columnas numéricas de nuevo a tipo numérico
df[float_cols] = df[float_cols].astype(float)

# Convertir columnas categóricas de nuevo a tipo categórico
df[categorical_cols] = df[categorical_cols].astype('category')

# Verificar que los dtypes han cambiado
print(df.dtypes)

Unnamed: 0       int64
age              int64
sex           category
cp            category
trestbps       float64
chol           float64
fbs           category
restecg       category
thalach        float64
exang         category
oldpeak        float64
slope         category
ca            category
thal          category
label          float64
dtype: object


In [149]:
df.isnull().sum()


Unnamed: 0      0
age             0
sex             0
cp              0
trestbps       59
chol           30
fbs            89
restecg         2
thalach        55
exang          55
oldpeak        62
slope         308
ca            608
thal          483
label         184
dtype: int64

In [150]:
# Suponiendo que 'df' es tu DataFrame y 'df' es tu DataFrame final
# Convertir '<NA>' a NaN
df.replace('<NA>', np.nan, inplace=True)

# Obtener columnas numéricas y categóricas
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'label']

# Imputación para variables numéricas (usando la mediana)
numeric_imputer = SimpleImputer(strategy='median')
df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

# Imputación para variables categóricas (usando la moda)
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

# Verificar que ya no hay valores faltantes
print(df.isnull().sum())

# El DataFrame 'df' ahora contiene los valores imputados


Unnamed: 0    0
age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal          0
label         0
dtype: int64


In [151]:
train_end = df[0:732]
test_end = df[(916-184):]

In [152]:
train_end.isna().sum()

Unnamed: 0    0
age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal          0
label         0
dtype: int64

In [153]:
test_end.isna().sum()

Unnamed: 0    0
age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal          0
label         0
dtype: int64

In [154]:
train_end['label'].value_counts()

0.0    327
1.0    156
2.0    108
3.0    107
4.0     34
Name: label, dtype: int64

In [155]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd

# Asumiendo que ya tienes cargado tu DataFrame df_train
# Primero, vamos a separar las características (X) de las etiquetas (y)
X = train_end.drop('label', axis=1)  # Eliminar la columna 'label' para obtener las características
y = train_end['label']  # Las etiquetas son los valores de 'label'

# Aplicar SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Ahora, X_res y y_res son tus características y etiquetas balanceadas, respectivamente


In [156]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imblearnPipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Definir explícitamente la tubería
pipeline = imblearnPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', DecisionTreeClassifier())  # Usar 'classifier' como placeholder
])

# Definir el espacio de parámetros para los modelos
param_grid = [
    {
        'classifier': [DecisionTreeClassifier()],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_leaf': [1, 2, 4]
    },
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 50, 100],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_leaf': [1, 2, 4]
    }
]

# Configurar GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Asumiendo que X_res y y_res son tus características y etiquetas balanceadas
grid_search.fit(X_res, y_res)

print("Mejores parámetros:", grid_search.best_params_)
print("Mejor score:", grid_search.best_score_)


Mejores parámetros: {'classifier': RandomForestClassifier(), 'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__n_estimators': 100}
Mejor score: 0.7896024464831805


In [157]:
from sklearn.ensemble import RandomForestClassifier

# Entrenar el modelo con los mejores parámetros encontrados
mejor_modelo = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=1,
    random_state=42  # Fijar una semilla para reproducibilidad
)

# Asumiendo que X_res y y_res son tus características y etiquetas balanceadas
mejor_modelo.fit(X_res, y_res)




In [158]:
# Excluyendo la columna 'label'
test_end_sin_label = test_end.drop('label', axis=1)

# Ahora puedes hacer predicciones con el conjunto de datos preparado
predicciones = mejor_modelo.predict(test_end_sin_label)

# Y también obtener las probabilidades de cada clase si lo necesitas
probabilidades = mejor_modelo.predict_proba(test_end_sin_label)



In [159]:
# Copiar el DataFrame 'test_end' a 'predicted_df'
predicted_df = test_end.copy()

# Actualizar la columna 'label' con las predicciones
predicted_df['label'] = predicciones

# Mostrar las primeras filas del nuevo DataFrame para verificar
print(predicted_df.head())



     Unnamed: 0   age  sex   cp  trestbps   chol  fbs  restecg  thalach  \
732         732  57.0  1.0  4.0     156.0  173.0  0.0      2.0    119.0   
733         733  52.0  1.0  2.0     160.0  196.0  0.0      0.0    165.0   
734         734  48.0  1.0  2.0     100.0  223.0  0.0      0.0    100.0   
735         735  62.0  1.0  4.0     115.0    0.0  0.0      0.0    128.0   
736         736  51.0  1.0  3.0     110.0  175.0  0.0      0.0    123.0   

     exang  oldpeak  slope   ca  thal  label  
732    1.0      3.0    3.0  0.0   3.0    4.0  
733    0.0      0.0    2.0  0.0   3.0    0.0  
734    0.0      0.0    2.0  0.0   3.0    0.0  
735    1.0      2.5    3.0  0.0   3.0    2.0  
736    0.0      0.6    1.0  0.0   3.0    0.0  


In [169]:
predicted_df

Unnamed: 0.1,ID,Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label,index
0,732,732,57.0,1.0,4.0,156.0,173.0,0.0,2.0,119.0,1.0,3.0,3.0,0.0,3.0,4.0,0
1,733,733,52.0,1.0,2.0,160.0,196.0,0.0,0.0,165.0,0.0,0.0,2.0,0.0,3.0,0.0,1
2,734,734,48.0,1.0,2.0,100.0,223.0,0.0,0.0,100.0,0.0,0.0,2.0,0.0,3.0,0.0,2
3,735,735,62.0,1.0,4.0,115.0,0.0,0.0,0.0,128.0,1.0,2.5,3.0,0.0,3.0,2.0,3
4,736,736,51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,911,911,54.0,1.0,4.0,200.0,198.0,0.0,0.0,142.0,1.0,2.0,2.0,0.0,3.0,3.0,179
180,912,912,55.0,1.0,2.0,110.0,214.0,1.0,1.0,180.0,0.0,0.5,2.0,0.0,3.0,0.0,180
181,913,913,67.0,1.0,3.0,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0,0.0,181
182,914,914,59.0,1.0,1.0,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0,0.0,182


In [172]:
# Restablecer el índice de predicted_df para que comience en 0 y luego sumar 1 para comenzar desde 1
predicted_df.reset_index(drop=True, inplace=True)
predicted_df['ID'] = predicted_df.index 

# Seleccionar las columnas 'ID' y 'label' para el nuevo DataFrame df_randomf
df_randomf = predicted_df[['ID', 'label']]

# Guardar df_randomf en un archivo CSV sin el índice de pandas
df_randomf.to_csv('try6_data.csv', index=False)


In [170]:
predicted_df.drop(['Unnamed: 0', 'index'], axis=1)
predicted_df.rename(columns={'level_0': 'ID'}, inplace=True)
df_randomf = predicted_df[['ID', 'label']]
df_randomf.to_csv('try6_data.csv', index=False)


In [167]:
try3_df = pd.read_csv('try6_data.csv')