In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Funciones axuiliares

In [2]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [None]:
def remove_labels(df, label_name):
    X = df.drop(label_name, axis=1)
    y = df[label_name].copy()
    return (X, y)

In [None]:
def evaluate_result(y_pred, y, y_prep_pred, y_prep, metric):
    print(metric.__name__, "WITHOUT preparation:", metric(y_pred, y, average='weighted'))
    print(metric.__name__, "WITH preparation:", metric(y_prep_pred, y_prep, average='weighted'))

In [23]:
df = pd.read_csv(r"data.csv")
df = df.drop("Unnamed: 32", axis=1)
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [25]:
df.isnull().sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [26]:
count = df.groupby("diagnosis")["diagnosis"].count()
count

diagnosis
B    357
M    212
Name: diagnosis, dtype: int64

# Division del conjunto

In [27]:
train_set, val_set, test_set = train_val_test_split(df)

In [28]:
X_train, y_train = remove_labels(train_set, 'diagnosis')
X_val, y_val = remove_labels(val_set, 'diagnosis')
X_test, y_test = remove_labels(test_set, 'diagnosis')


In [29]:
print("Longitud del Training Set:", len(train_set))
print("Longitud del Validation Set:", len(val_set))
print("Longitud del Test Set:", len(test_set))

Longitud del Training Set: 341
Longitud del Validation Set: 114
Longitud del Test Set: 114


# Escalado y procesado

## Imputando

In [31]:
# class ImputeMeanMode(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self.quantitative_imputer = SimpleImputer(strategy='mean')
#         self.categorical_imputer = SimpleImputer(strategy='most_frequent')
#     def fit(self, X, y=None):
#         quantitative_cols = X.select_dtypes(include='number').columns
#         categorical_cols = X.select_dtypes(include='object').columns
#         self.quantitative_imputer.fit(X[quantitative_cols])
#         self.categorical_imputer.fit(X[categorical_cols])
#         return self
#     def transform(self, X, y=None):
#         X_copy = X.copy()
#         quantitative_cols = X_copy.select_dtypes(include='number').columns
#         categorical_cols = X_copy.select_dtypes(include='object').columns
#         X_copy[quantitative_cols] = self.quantitative_imputer.transform(X_copy[quantitative_cols])
#         X_copy[categorical_cols] = self.categorical_imputer.transform(X_copy[categorical_cols])
#         return X_copy

In [30]:
# imputador = ImputeMeanMode()
# X_train_imp = imputador.fit_transform(X_train)
# X_val_imp = imputador.transform(X_val)
# X_test_imp = imputador.transform(X_test)


## Dummificar

In [34]:
# # Transormador para codificar únicamente las columnas categoricas y devolver un DataFrame
# class CustomOneHotEncoding(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self._oh = OneHotEncoder(sparse=False)
#         self._columns = None
#     def fit(self, X, y=None):
#         X_cat = X.select_dtypes(include=['object'])
#         self._columns = pd.get_dummies(X_cat).columns
#         self._oh.fit(X_cat)
#         return self
#     def transform(self, X, y=None):
#         X_copy = X.copy()
#         X_cat = X_copy.select_dtypes(include=['object'])
#         X_num = X_copy.select_dtypes(exclude=['object'])
#         X_cat_oh = self._oh.transform(X_cat)
#         X_cat_oh = pd.DataFrame(X_cat_oh, 
#                                 columns=self._columns, 
#                                 index=X_copy.index)
#         X_copy.drop(list(X_cat), axis=1, inplace=True)
#         return X_copy.join(X_cat_oh)

In [35]:
# # Crear una instancia del transformador
# custom_encoding = CustomOneHotEncoding()

# X_train_dum = custom_encoding.fit_transform(X_train_imp)
# X_val_dum = custom_encoding.fit_transform(X_val_imp)
# X_test_dum = custom_encoding.fit_transform(X_test_imp)

## Escalado

In [37]:
# Transofrmador diseñado para escalar de manera sencilla únicamente unas columnas seleccionadas
class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, attributes):
        self.attributes = attributes
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        X_copy = X.copy()
        scale_attrs = X_copy[self.attributes]
        robust_scaler = RobustScaler()
        X_scaled = robust_scaler.fit_transform(scale_attrs)
        X_scaled = pd.DataFrame(X_scaled, columns=self.attributes, index=X_copy.index)
        for attr in self.attributes:
            X_copy[attr] = X_scaled[attr]
        return X_copy

In [41]:
column_to_exclude = "diagnosis"

# Obtener todas las columnas excepto la columna a eliminar
columns_to_scale = df.columns.drop(column_to_exclude)

# Pasar las columnas al CustomScaler
custom_scaler = CustomScaler(list(columns_to_scale))

X_train_pro = custom_scaler.fit_transform(X_train)
X_val_pro = custom_scaler.fit_transform(X_val)
X_test_pro = custom_scaler.fit_transform(X_test)

In [44]:
X_train_pro.head(5)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
296,-0.001648,-0.535147,-1.191529,-0.547464,-0.483563,-0.522013,-0.717837,-0.520628,-0.39856,-0.994135,...,-0.614991,-1.297924,-0.619452,-0.528719,-1.216613,-0.692199,-0.721227,-0.71839,-0.960756,-0.686996
490,11.249809,-0.231293,0.666667,-0.253576,-0.211029,-0.692872,-0.645676,-0.474008,-0.418497,-0.715543,...,-0.13799,0.798535,-0.126607,-0.117196,-0.178914,-0.158561,-0.385301,-0.392594,0.430233,0.099294
519,0.001479,-0.117914,-0.390424,-0.112809,-0.138653,0.909853,0.253691,-0.262752,-0.098394,0.973607,...,-0.09029,-0.452991,-0.105021,-0.115055,0.520767,-0.069909,-0.316447,-0.215227,0.388081,0.277722
513,0.001337,0.297052,-0.950276,0.270156,0.29878,0.166667,-0.085568,0.160733,0.15156,-0.143695,...,0.303237,-1.002442,0.255639,0.309311,-0.284345,-0.095745,0.06493,-0.096878,-0.258721,-0.482863
473,1.020717,-0.226757,2.053407,-0.278283,-0.213945,-0.951258,-0.917144,-0.641178,-0.651283,-0.255132,...,-0.260647,1.538462,-0.312394,-0.231359,-1.18147,-0.808359,-0.824117,-1.049684,-0.574128,-0.636593


In [42]:
X_train_pro.head(5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
570,0.237885,-1.0,2.726304,0.0,0.0,10.5,0.0,1.0,0.0,0.0,1.0
787,0.715859,0.0,-1.773696,4.0,1.0,29.125,0.0,1.0,0.0,1.0,0.0
74,-0.854626,0.0,0.226304,0.0,0.0,56.4958,0.0,1.0,0.0,0.0,1.0
113,-0.768722,0.0,-0.773696,1.0,0.0,9.825,1.0,0.0,0.0,0.0,1.0
635,0.381057,-1.0,-0.10703,0.0,0.0,13.0,1.0,0.0,0.0,0.0,1.0


# Aplicar algoritmo

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Modelo entrenado con el conjunto de datos sin escalar
clf_rnd = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [52]:
# Modelo entrenado con el conjunto de datos escalado
clf_rnd_scaled = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd_scaled.fit(X_train_pro, y_train)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [53]:
# Predecimos con el conjunto de datos de entrenamiento
y_train_pred = clf_rnd.predict(X_train)
y_train_prep_pred = clf_rnd_scaled.predict(X_train_pro)

In [54]:
# Comparamos resultados entre escalado y sin escalar
evaluate_result(y_train_pred, y_train, y_train_prep_pred, y_train, f1_score)

f1_score WITHOUT preparation: 1.0
f1_score WITH preparation: 1.0


In [55]:
# Predecimos con el conjunto de datos de validación
y_pred = clf_rnd.predict(X_val)
y_prep_pred = clf_rnd_scaled.predict(X_val_pro)

In [56]:
# Comparamos resultados entre escalado y sin escalar
evaluate_result(y_pred, y_val, y_prep_pred, y_val, f1_score)

f1_score WITHOUT preparation: 0.9737519952769336
f1_score WITH preparation: 0.9824561403508771


# Random forest para seleccion de caracteristicas

In [57]:
clf_rnd.feature_importances_

array([0.00554534, 0.02161659, 0.01362335, 0.03658899, 0.04430244,
       0.00646133, 0.01794752, 0.06008778, 0.1168853 , 0.00394096,
       0.00550564, 0.00708118, 0.00315612, 0.02762352, 0.03198856,
       0.00664561, 0.00659977, 0.01254746, 0.00528416, 0.00343699,
       0.00806163, 0.08787078, 0.02084592, 0.12586435, 0.09972345,
       0.01274512, 0.0137689 , 0.0603438 , 0.1096755 , 0.0156118 ,
       0.00862016])

In [58]:
# Podemos extraer que características son más importantes para la correcta clasificación de los datos
feature_importances = {name: score for name, score in zip(list(df), clf_rnd.feature_importances_)}

In [60]:
feature_importances_sorted = pd.Series(feature_importances).sort_values(ascending=False)
feature_importances_sorted.head(20)

texture_worst           0.125864
concavity_mean          0.116885
concavity_worst         0.109675
perimeter_worst         0.099723
fractal_dimension_se    0.087871
compactness_worst       0.060344
compactness_mean        0.060088
perimeter_mean          0.044302
texture_mean            0.036589
perimeter_se            0.031989
texture_se              0.027624
diagnosis               0.021617
radius_worst            0.020846
smoothness_mean         0.017948
concave points_worst    0.015612
smoothness_worst        0.013769
radius_mean             0.013623
area_worst              0.012745
compactness_se          0.012547
symmetry_worst          0.008620
dtype: float64