# Algoritmos para clasificar el tipo de tumor en cáncer de mama

## Librerías

In [2]:
# Manejo y análisis de estructuras de datos
import pandas as pd

# Estandarización de datos
from sklearn.preprocessing import StandardScaler

# Crear conjuntos de entrenamiento y prueba
from sklearn.model_selection import train_test_split

# Importar los algoritmos de los análisis discriminantes
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Medir la precisión del modelo
from sklearn.metrics import accuracy_score

## Preparación del conjunto de datos

In [3]:
# Importar archivo CSV de la actividad 2
data_csv = "data.csv"

# Cargar el archivo CSV en un DataFrame e indicar que la columna 0 sea el identificador
data = pd.read_csv(data_csv, index_col=0)

# Ver DataFrame
data.head(10)

Unnamed: 0_level_0,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [4]:
# Ver el tamaño del DataFrame
data.shape

(569, 31)

In [5]:
# Ver si hay valores nulos
valores_nulos = data.isna().sum().sum()
print(f"Número total de valores nulos: {valores_nulos}")

Número total de valores nulos: 0


In [6]:
# Ver los tipos de datos
data.dtypes

Diagnosis              object
radius1               float64
texture1              float64
perimeter1            float64
area1                 float64
smoothness1           float64
compactness1          float64
concavity1            float64
concave_points1       float64
symmetry1             float64
fractal_dimension1    float64
radius2               float64
texture2              float64
perimeter2            float64
area2                 float64
smoothness2           float64
compactness2          float64
concavity2            float64
concave_points2       float64
\tsymmetry2           float64
fractal_dimension2    float64
radius3               float64
texture3              float64
perimeter3            float64
area3                 float64
smoothness3           float64
compactness3          float64
concavity3            float64
concave_points3       float64
symmetry3             float64
fractal_dimension3    float64
dtype: object

In [7]:
# Crear una copia del DataFrame original
data_original = data.copy()

In [8]:
# Transformar los valores B y M de tipo objeto de la columna "Diagnosis" a tipo integer 0 y 1 respectivamente.
valores_nuevos = {'B': 0, 'M': 1}
data['Diagnosis'] = data['Diagnosis'].replace(valores_nuevos)
data.head(10)

Unnamed: 0_level_0,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
843786,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
844359,1,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
84458202,1,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
844981,1,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
84501001,1,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


## División del conjunto de datos

In [9]:
# Creación del conjunto de variables predictoras (X)
X = data.drop("Diagnosis", axis = 1) # El conjunto de datos sin la columna de "Diagnosis"
X.head(10)

Unnamed: 0_level_0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
843786,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
844359,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
84458202,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
844981,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
84501001,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [10]:
# Creación del conjunto de la variable objetivo (y)
y = data["Diagnosis"].copy() # Copiar la columna "Diagnosis"
y.head(10)

ID
842302      1
842517      1
84300903    1
84348301    1
84358402    1
843786      1
844359      1
84458202    1
844981      1
84501001    1
Name: Diagnosis, dtype: int64

In [11]:
# Ver la cantidad de valores 0 (Benigno) y 1 (Maligno) que hay en el DataFrame
y.value_counts()

Diagnosis
0    357
1    212
Name: count, dtype: int64

In [12]:
# Estandarización de los datos del conjunto de variables predictoras (X)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
print(X_standardized[5]) # Mostrar los 5 primeros resultados

[-0.47637467 -0.8353353  -0.38714807 -0.50565045  2.23742148  1.24433549
  0.8663016   0.82465565  1.0054018   1.89000504 -0.25507029 -0.59266165
 -0.32130419 -0.28925822  0.1563467   0.44554365  0.1600252  -0.06912355
  0.13411881  0.48684584 -0.16549825 -0.31383633 -0.11500946 -0.24432021
  2.04851283  1.72161644  1.2632432   0.90588779  1.75406939  2.24180161]


In [13]:
# Crear conjunto de prueba y conjunto de entrenamiento
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, random_state=0)

# Mostrar los 5 primeros resultados de los conjuntos de entrenamiento
print(X_train[:5])
print(y_train[:5])

[[-0.64678318 -0.42577149 -0.67671518 -0.63192861 -0.8995512  -0.90817355
  -0.77739522 -0.67371679  0.2314018  -0.80060746 -0.71524457  0.03836716
  -0.80791928 -0.58210097  0.18468118 -0.58561835 -0.58932352 -0.52201294
  -0.31750401 -0.76062709 -0.66456714  0.01185125 -0.68242968 -0.63774107
   0.19863822 -0.49914731 -0.67447656 -0.35335182  0.32395133 -0.76893975]
 [-0.82571213  0.13272462 -0.8249999  -0.76105087  0.64331558 -0.69269522
  -1.05202266 -1.06622382  0.46871312 -0.35689739 -0.38825014  1.35920951
  -0.44902208 -0.45581116  1.94975275 -0.80694093 -0.94818198 -1.10775204
   2.65013011 -0.6929201  -0.88821628  0.01673656 -0.90403632 -0.78136254
   0.43973562 -1.00239744 -1.24178371 -1.43718102  0.63294742 -1.03770647]
 [ 1.70485436  2.08513394  1.61593137  1.72384158  0.10245823 -0.01783304
   0.69304299  1.26366923 -0.21766424 -1.0586114   1.30049923  2.26093843
   1.15685722  1.29156462 -0.42401016 -0.0697579   0.25220172  0.80843074
  -0.1891608  -0.49055563  1.5367201

## Entrenamiento de los algoritmos

### Análisis Discriminante Lineal (LDA)

In [14]:
# Crear instancia del modelo
lda = LinearDiscriminantAnalysis()

# Entrenamiento del modelo
lda.fit(X_train, y_train)

# Predicción con los datos de prueba
y_pred_lda = lda.predict(X_test)

### Análisis Discriminante Cuadrático (QDA)

In [15]:
# Crear instancia del modelo
qda = QuadraticDiscriminantAnalysis()

# Entrenamiento del modelo
qda.fit(X_train, y_train)

# Predicción con los datos de prueba
y_pred_qda = qda.predict(X_test)

### LDA Regularizado como aproximación a RDA

In [16]:
# Crear instancia del modelo
lda_rda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')

# Entrenamiento del modelo
lda_rda.fit(X_train, y_train)

# Predicción con los datos de prueba
y_pred_lda_rda = lda_rda.predict(X_test)