## MIGUEL

In [180]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [181]:
# Load the cleaned dataset from a CSV located one directory above the notebook.
df = pd.read_csv('../df_cleaned.csv')

In [182]:
# Remove the 'Unnamed: 0' column (presumably a row counter written by an
# earlier to_csv call -- verify against the script that produced the file).
df = df.drop(columns=['Unnamed: 0'])

In [183]:
# Remove the stored 'index' column; it is not used as a feature.
df = df.drop(columns=['index'])

In [184]:
# Turn literal '<NA>' strings into real missing values.
# NOTE(review): pandas.read_csv already treats '<NA>' as a missing-value marker
# by default, so this is likely a no-op here -- confirm before removing.
df = df.replace('<NA>', np.nan)

In [185]:
# Column groups used for dtype casting.
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# Cast both groups in one pass: continuous measurements become float,
# coded variables become pandas categoricals. Columns not listed
# (e.g. 'label') keep the dtype they were read with.
df = df.astype({
    **dict.fromkeys(numeric_cols, float),
    **dict.fromkeys(categorical_cols, 'category'),
})

# Check the resulting dtypes.
print(df.dtypes)


age          float64
sex         category
cp          category
trestbps     float64
chol         float64
fbs         category
restecg     category
thalach      float64
exang       category
oldpeak      float64
slope       category
ca          category
thal        category
label        float64
dtype: object


In [186]:
# Turn literal '<NA>' strings into NaN (same replacement as the earlier cell).
df = df.replace('<NA>', np.nan)

# Column groups for imputation.
# NOTE(review): 'label' (the prediction target used later) is part of
# categorical_cols, so missing targets are filled with the most frequent
# class -- confirm this is intended.
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'label', 'ca']

# Numeric columns: fill gaps with the column mean (mean rather than median).
# NOTE(review): both imputers are fitted on every row of df, including the rows
# that are later sliced off as test_end.
numeric_imputer = SimpleImputer(strategy='mean')
numeric_imputer.fit(df[numeric_cols])
df[numeric_cols] = numeric_imputer.transform(df[numeric_cols])

# Categorical columns: fill gaps with the most frequent value (mode).
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_imputer.fit(df[categorical_cols])
df[categorical_cols] = categorical_imputer.transform(df[categorical_cols])

# Confirm that no missing values remain.
print(df.isnull().sum())


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
label       0
dtype: int64


In [187]:
# Show the imputed frame (the notebook renders a truncated preview).
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,51.0,1.0,1.0,125.000000,213.000000,0.0,2.0,125.000000,1.0,1.400000,1.0,1.0,3.0,0.0
1,54.0,1.0,3.0,120.000000,237.000000,0.0,0.0,150.000000,1.0,1.500000,2.0,0.0,7.0,2.0
2,63.0,1.0,4.0,140.000000,0.000000,0.0,2.0,149.000000,0.0,2.000000,1.0,0.0,3.0,2.0
3,52.0,0.0,2.0,140.000000,199.146727,0.0,0.0,140.000000,0.0,0.000000,2.0,0.0,3.0,0.0
4,55.0,1.0,4.0,140.000000,217.000000,0.0,0.0,111.000000,1.0,5.600000,3.0,0.0,7.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,54.0,1.0,4.0,200.000000,198.000000,0.0,0.0,142.000000,1.0,2.000000,2.0,0.0,3.0,0.0
912,55.0,1.0,2.0,110.000000,214.000000,1.0,1.0,180.000000,0.0,0.875878,2.0,0.0,3.0,0.0
913,67.0,1.0,3.0,152.000000,212.000000,0.0,2.0,150.000000,0.0,0.800000,2.0,0.0,7.0,0.0
914,59.0,1.0,1.0,170.000000,288.000000,0.0,2.0,159.000000,0.0,0.200000,2.0,0.0,7.0,0.0


In [188]:
# Positional split at row 732 (== 916 - 184): the first 732 rows are used to
# fit the models, the remaining rows are the ones to predict.
train_end, test_end = df.iloc[:732], df.iloc[732:]

In [189]:
# Show the rows held out for prediction (row labels start at 732).
test_end

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
732,57.0,1.0,4.0,156.000000,173.000000,0.0,2.0,119.000000,1.0,3.000000,3.0,0.0,3.0,0.0
733,52.0,1.0,2.0,160.000000,196.000000,0.0,0.0,165.000000,0.0,0.000000,2.0,0.0,3.0,0.0
734,48.0,1.0,2.0,100.000000,199.146727,0.0,0.0,100.000000,0.0,0.000000,2.0,0.0,3.0,0.0
735,62.0,1.0,4.0,115.000000,0.000000,0.0,0.0,128.000000,1.0,2.500000,3.0,0.0,3.0,0.0
736,51.0,1.0,3.0,110.000000,175.000000,0.0,0.0,123.000000,0.0,0.600000,1.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,54.0,1.0,4.0,200.000000,198.000000,0.0,0.0,142.000000,1.0,2.000000,2.0,0.0,3.0,0.0
912,55.0,1.0,2.0,110.000000,214.000000,1.0,1.0,180.000000,0.0,0.875878,2.0,0.0,3.0,0.0
913,67.0,1.0,3.0,152.000000,212.000000,0.0,2.0,150.000000,0.0,0.800000,2.0,0.0,7.0,0.0
914,59.0,1.0,1.0,170.000000,288.000000,0.0,2.0,159.000000,0.0,0.200000,2.0,0.0,7.0,0.0


## 4 TRY - Random Forest

In [190]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Name of the target column.
target_column = 'label'

# Training inputs/target come from train_end; prediction inputs from test_end.
target_train = train_end[target_column]
features_train = train_end.drop(columns=[target_column])
features_test = test_end.drop(columns=[target_column])

# Fit a 100-tree Random Forest classifier with a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    features_train, target_train
)

# Predict the target class for every row of the test slice.
predicted_classes = model.predict(features_test)

# Copy of the test slice whose 'label' column holds the predictions.
predicted_df = test_end.assign(**{target_column: predicted_classes})

# Print the frame with the predicted classes.
print(predicted_df)


      age  sex   cp    trestbps        chol  fbs  restecg     thalach  exang  \
732  57.0  1.0  4.0  156.000000  173.000000  0.0      2.0  119.000000    1.0   
733  52.0  1.0  2.0  160.000000  196.000000  0.0      0.0  165.000000    0.0   
734  48.0  1.0  2.0  100.000000  199.146727  0.0      0.0  100.000000    0.0   
735  62.0  1.0  4.0  115.000000    0.000000  0.0      0.0  128.000000    1.0   
736  51.0  1.0  3.0  110.000000  175.000000  0.0      0.0  123.000000    0.0   
..    ...  ...  ...         ...         ...  ...      ...         ...    ...   
911  54.0  1.0  4.0  200.000000  198.000000  0.0      0.0  142.000000    1.0   
912  55.0  1.0  2.0  110.000000  214.000000  1.0      1.0  180.000000    0.0   
913  67.0  1.0  3.0  152.000000  212.000000  0.0      2.0  150.000000    0.0   
914  59.0  1.0  1.0  170.000000  288.000000  0.0      2.0  159.000000    0.0   
915  58.0  1.0  4.0  132.142357  203.000000  1.0      0.0  137.533101    0.0   

      oldpeak  slope   ca  thal  label 

In [191]:
# Store the row labels in an 'index' column. Because that column name is then
# taken, the following reset_index() emits the old index as 'level_0'.
predicted_df['index'] = predicted_df.index

In [192]:
# Move the row labels into a regular column and renumber the rows from 0.
predicted_df = predicted_df.reset_index()

In [193]:
# Drop the helper 'index' column, then promote 'level_0' (the original row
# labels produced by reset_index) to be the 'index' column.
predicted_df = (
    predicted_df
    .drop(columns=['index'])
    .rename(columns={'level_0': 'index'})
)

# Sequential row identifier, 0 .. len-1.
predicted_df['ID'] = range(len(predicted_df))

# Write only the 'ID' and 'label' columns, without the DataFrame index.
predicted_df[['ID', 'label']].to_csv('try4_data.csv', index=False)

In [194]:
# Show the prediction frame, now carrying the 'index' and 'ID' columns.
predicted_df

Unnamed: 0,index,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label,ID
0,732,57.0,1.0,4.0,156.000000,173.000000,0.0,2.0,119.000000,1.0,3.000000,3.0,0.0,3.0,2.0,0
1,733,52.0,1.0,2.0,160.000000,196.000000,0.0,0.0,165.000000,0.0,0.000000,2.0,0.0,3.0,0.0,1
2,734,48.0,1.0,2.0,100.000000,199.146727,0.0,0.0,100.000000,0.0,0.000000,2.0,0.0,3.0,0.0,2
3,735,62.0,1.0,4.0,115.000000,0.000000,0.0,0.0,128.000000,1.0,2.500000,3.0,0.0,3.0,1.0,3
4,736,51.0,1.0,3.0,110.000000,175.000000,0.0,0.0,123.000000,0.0,0.600000,1.0,0.0,3.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,911,54.0,1.0,4.0,200.000000,198.000000,0.0,0.0,142.000000,1.0,2.000000,2.0,0.0,3.0,3.0,179
180,912,55.0,1.0,2.0,110.000000,214.000000,1.0,1.0,180.000000,0.0,0.875878,2.0,0.0,3.0,0.0,180
181,913,67.0,1.0,3.0,152.000000,212.000000,0.0,2.0,150.000000,0.0,0.800000,2.0,0.0,7.0,0.0,181
182,914,59.0,1.0,1.0,170.000000,288.000000,0.0,2.0,159.000000,0.0,0.200000,2.0,0.0,7.0,0.0,182


In [195]:
# Read the exported file back to check what was written.
try4_df = pd.read_csv('try4_data.csv')

In [196]:
# Show the exported 'ID' / 'label' pairs.
try4_df

Unnamed: 0,ID,label
0,0,2.0
1,1,0.0
2,2,0.0
3,3,1.0
4,4,0.0
...,...,...
179,179,3.0
180,180,0.0
181,181,0.0
182,182,0.0


In [197]:
# Number of exported rows assigned to each predicted class.
try4_df['label'].value_counts()

0.0    96
1.0    37
3.0    27
2.0    24
Name: label, dtype: int64

## 5 TRY - SVM

In [198]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Pipeline: standardise the features, then a linear-kernel SVM (C=1.0, fixed
# seed). It is fitted on features_train / target_train, which were defined by
# the preceding Random Forest cell.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', C=1.0, random_state=42),
).fit(features_train, target_train)

# Predict the target class for every row of the test slice.
predicted_classes_svm = svm_model.predict(features_test)

# Copy of the test slice whose 'label' column holds the SVM predictions.
predicted_df_svm = test_end.assign(**{target_column: predicted_classes_svm})

# Print the frame with the SVM predictions.
print(predicted_df_svm)


      age  sex   cp    trestbps        chol  fbs  restecg     thalach  exang  \
732  57.0  1.0  4.0  156.000000  173.000000  0.0      2.0  119.000000    1.0   
733  52.0  1.0  2.0  160.000000  196.000000  0.0      0.0  165.000000    0.0   
734  48.0  1.0  2.0  100.000000  199.146727  0.0      0.0  100.000000    0.0   
735  62.0  1.0  4.0  115.000000    0.000000  0.0      0.0  128.000000    1.0   
736  51.0  1.0  3.0  110.000000  175.000000  0.0      0.0  123.000000    0.0   
..    ...  ...  ...         ...         ...  ...      ...         ...    ...   
911  54.0  1.0  4.0  200.000000  198.000000  0.0      0.0  142.000000    1.0   
912  55.0  1.0  2.0  110.000000  214.000000  1.0      1.0  180.000000    0.0   
913  67.0  1.0  3.0  152.000000  212.000000  0.0      2.0  150.000000    0.0   
914  59.0  1.0  1.0  170.000000  288.000000  0.0      2.0  159.000000    0.0   
915  58.0  1.0  4.0  132.142357  203.000000  1.0      0.0  137.533101    0.0   

      oldpeak  slope   ca  thal  label 

In [199]:
# Store this frame's own row labels in an 'index' column. Because that column
# name is then taken, reset_index() emits the old index as 'level_0', which the
# next cell renames back to 'index'.
# Fix: the labels previously came from predicted_df (the Random Forest frame),
# which only worked because both frames happen to have the same length.
predicted_df_svm['index'] = predicted_df_svm.index
predicted_df_svm.reset_index(inplace=True)

In [200]:
# Drop the helper 'index' column, then promote 'level_0' (the column emitted by
# reset_index) to be the 'index' column.
predicted_df_svm = (
    predicted_df_svm
    .drop(columns=['index'])
    .rename(columns={'level_0': 'index'})
)

# Sequential row identifier, 0 .. len-1.
predicted_df_svm['ID'] = range(len(predicted_df_svm))

# Write only the 'ID' and 'label' columns, without the DataFrame index.
predicted_df_svm[['ID', 'label']].to_csv('try5_data.csv', index=False)

In [201]:
# Read the exported SVM file back to check what was written.
try5_df = pd.read_csv('try5_data.csv')

In [202]:
# Show the exported 'ID' / 'label' pairs.
try5_df

Unnamed: 0,ID,label
0,0,2.0
1,1,0.0
2,2,0.0
3,3,2.0
4,4,0.0
...,...,...
179,179,2.0
180,180,0.0
181,181,0.0
182,182,0.0


In [203]:
# Number of exported rows assigned to each predicted class by the SVM.
try5_df['label'].value_counts()

0.0    101
1.0     44
3.0     25
2.0     14
Name: label, dtype: int64

## 9 TRY - Random Forest

### Intento descartado: fue un error, se me olvidó escalar la edad (`age`).

## 10 TRY - Random Forest

In [204]:
# Variables tratadas con one-hot encoding
categorical_variables = ['cp', 'restecg', 'slope', 'thal', 'ca']

# Aplicar one-hot encoding a las variables categóricas
df = pd.get_dummies(df, columns=categorical_variables, drop_first=True)

# Mostrar el DataFrame con las variables codificadas
print(df.head())

    age  sex  trestbps        chol  fbs  thalach  exang  oldpeak  label  \
0  51.0  1.0     125.0  213.000000  0.0    125.0    1.0      1.4    0.0   
1  54.0  1.0     120.0  237.000000  0.0    150.0    1.0      1.5    2.0   
2  63.0  1.0     140.0    0.000000  0.0    149.0    0.0      2.0    2.0   
3  52.0  0.0     140.0  199.146727  0.0    140.0    0.0      0.0    0.0   
4  55.0  1.0     140.0  217.000000  0.0    111.0    1.0      5.6    3.0   

   cp_2.0  ...  cp_4.0  restecg_1.0  restecg_2.0  slope_2.0  slope_3.0  \
0       0  ...       0            0            1          0          0   
1       0  ...       0            0            0          1          0   
2       0  ...       1            0            1          0          0   
3       1  ...       0            0            0          1          0   
4       0  ...       1            0            0          0          1   

   thal_6.0  thal_7.0  ca_1.0  ca_2.0  ca_3.0  
0         0         0       1       0       0  
1       

In [205]:
# Selecciona solo las características numéricas
numeric_data = df[['trestbps', 'chol', 'thalach', 'oldpeak']]  # Las características deben estar dentro de una lista []

# Inicializa el escalador Min-Max
scaler = MinMaxScaler()

# Escala las características numéricas
scaled_numeric_data = scaler.fit_transform(numeric_data)

# Crea un nuevo DataFrame con las características escaladas
scaled_data = pd.DataFrame(scaled_numeric_data, columns=['trestbps', 'chol', 'thalach', 'oldpeak'])

# Reemplaza las características originales con las características escaladas en tu DataFrame
df[['trestbps', 'chol', 'thalach', 'oldpeak']] = scaled_data

print(df.head())

    age  sex  trestbps      chol  fbs   thalach  exang   oldpeak  label  \
0  51.0  1.0     0.625  0.353234  0.0  0.457746    1.0  0.454545    0.0   
1  54.0  1.0     0.600  0.393035  0.0  0.633803    1.0  0.465909    2.0   
2  63.0  1.0     0.700  0.000000  0.0  0.626761    0.0  0.522727    2.0   
3  52.0  0.0     0.700  0.330260  0.0  0.563380    0.0  0.295455    0.0   
4  55.0  1.0     0.700  0.359867  0.0  0.359155    1.0  0.931818    3.0   

   cp_2.0  ...  cp_4.0  restecg_1.0  restecg_2.0  slope_2.0  slope_3.0  \
0       0  ...       0            0            1          0          0   
1       0  ...       0            0            0          1          0   
2       0  ...       1            0            1          0          0   
3       1  ...       0            0            0          1          0   
4       0  ...       1            0            0          0          1   

   thal_6.0  thal_7.0  ca_1.0  ca_2.0  ca_3.0  
0         0         0       1       0       0  
1       

In [206]:
# Redo the positional split at row 732 (== 916 - 184) on the encoded and scaled
# frame: the first 732 rows for fitting, the remaining rows for prediction.
train_end, test_end = df.iloc[:732], df.iloc[732:]

In [207]:
# Show the encoded and scaled rows held out for prediction.
test_end

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,label,cp_2.0,...,cp_4.0,restecg_1.0,restecg_2.0,slope_2.0,slope_3.0,thal_6.0,thal_7.0,ca_1.0,ca_2.0,ca_3.0
732,57.0,1.0,0.780000,0.286899,0.0,0.415493,1.0,0.636364,0.0,0,...,1,0,1,0,1,0,0,0,0,0
733,52.0,1.0,0.800000,0.325041,0.0,0.739437,0.0,0.295455,0.0,1,...,0,0,0,1,0,0,0,0,0,0
734,48.0,1.0,0.500000,0.330260,0.0,0.281690,0.0,0.295455,0.0,1,...,0,0,0,1,0,0,0,0,0,0
735,62.0,1.0,0.575000,0.000000,0.0,0.478873,1.0,0.579545,0.0,0,...,1,0,0,0,1,0,0,0,0,0
736,51.0,1.0,0.550000,0.290216,0.0,0.443662,0.0,0.363636,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,54.0,1.0,1.000000,0.328358,0.0,0.577465,1.0,0.522727,0.0,0,...,1,0,0,1,0,0,0,0,0,0
912,55.0,1.0,0.550000,0.354892,1.0,0.845070,0.0,0.394986,0.0,1,...,0,1,0,1,0,0,0,0,0,0
913,67.0,1.0,0.760000,0.351575,0.0,0.633803,0.0,0.386364,0.0,0,...,0,0,1,1,0,0,1,0,0,0
914,59.0,1.0,0.850000,0.477612,0.0,0.697183,0.0,0.318182,0.0,0,...,0,0,1,1,0,0,1,0,0,0


In [208]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Name of the target column.
target_column = 'label'

# Training inputs/target come from train_end; prediction inputs from test_end
# (both now one-hot encoded and Min-Max scaled).
target_train = train_end[target_column]
features_train = train_end.drop(columns=[target_column])
features_test = test_end.drop(columns=[target_column])

# Fit a 100-tree Random Forest classifier with a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    features_train, target_train
)

# Predict the target class for every row of the test slice.
predicted_classes = model.predict(features_test)

# Copy of the test slice whose 'label' column holds the predictions.
predicted_df = test_end.assign(**{target_column: predicted_classes})

# Print the frame with the predicted classes.
print(predicted_df)

      age  sex  trestbps      chol  fbs   thalach  exang   oldpeak  label  \
732  57.0  1.0  0.780000  0.286899  0.0  0.415493    1.0  0.636364    2.0   
733  52.0  1.0  0.800000  0.325041  0.0  0.739437    0.0  0.295455    0.0   
734  48.0  1.0  0.500000  0.330260  0.0  0.281690    0.0  0.295455    0.0   
735  62.0  1.0  0.575000  0.000000  0.0  0.478873    1.0  0.579545    1.0   
736  51.0  1.0  0.550000  0.290216  0.0  0.443662    0.0  0.363636    0.0   
..    ...  ...       ...       ...  ...       ...    ...       ...    ...   
911  54.0  1.0  1.000000  0.328358  0.0  0.577465    1.0  0.522727    2.0   
912  55.0  1.0  0.550000  0.354892  1.0  0.845070    0.0  0.394986    0.0   
913  67.0  1.0  0.760000  0.351575  0.0  0.633803    0.0  0.386364    0.0   
914  59.0  1.0  0.850000  0.477612  0.0  0.697183    0.0  0.318182    0.0   
915  58.0  1.0  0.660712  0.336650  1.0  0.546008    0.0  0.394986    1.0   

     cp_2.0  ...  cp_4.0  restecg_1.0  restecg_2.0  slope_2.0  slope_3.0  \

In [209]:
# Store the row labels in an 'index' column. Because that column name is then
# taken, the following reset_index() emits the old index as 'level_0'.
predicted_df['index'] = predicted_df.index

In [210]:
# Move the row labels into a regular column and renumber the rows from 0.
predicted_df = predicted_df.reset_index()

In [211]:
# Drop the helper 'index' column, then promote 'level_0' (the column emitted by
# reset_index) to be the 'index' column.
predicted_df = (
    predicted_df
    .drop(columns=['index'])
    .rename(columns={'level_0': 'index'})
)

# Sequential row identifier, 0 .. len-1.
predicted_df['ID'] = range(len(predicted_df))

# Write only the 'ID' and 'label' columns, without the DataFrame index.
predicted_df[['ID', 'label']].to_csv('try10_data.csv', index=False)

In [213]:
# Read the exported file back to check what was written.
try10_df = pd.read_csv('try10_data.csv')

In [214]:
# Show the exported 'ID' / 'label' pairs.
try10_df

Unnamed: 0,ID,label
0,0,2.0
1,1,0.0
2,2,0.0
3,3,1.0
4,4,0.0
...,...,...
179,179,2.0
180,180,0.0
181,181,0.0
182,182,0.0


In [215]:
# Number of exported rows assigned to each predicted class.
try10_df['label'].value_counts()

0.0    98
1.0    39
3.0    24
2.0    22
4.0     1
Name: label, dtype: int64