In [None]:
!pip install dataprep


In [2]:
import pandas as pd

# Read the dataset from the CSV file located in the working directory.
data = pd.read_csv('cancer.csv')

# Quick sanity check of the load: show the first ten rows.
print(data.head(n=10))

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   
5        12.45         15.70           82.57      477.1          0.12780   
6        18.25         19.98          119.60     1040.0          0.09463   
7        13.71         20.83           90.20      577.9          0.11890   
8        13.00         21.82           87.50      519.8          0.12730   
9        12.46         24.04           83.97      475.9          0.11860   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760         0.30010              0.14710         0.2419   
1           0

In [5]:
from dataprep.eda import create_report

# Build the automated EDA report for the whole dataset and keep the report
# object itself, so it can be rendered again or reused later.
report = create_report(data)

# Render in a separate statement: chaining `.show()` onto the assignment would
# bind `report` to the return value of `show()` instead of the report object.
report.show()

Output hidden; open in https://colab.research.google.com to view.

In [6]:
# Summary statistics (one column per variable, rows such as 'mean' and 'std').
stats = data.describe()

# Every column except the label is inspected.
feature_names = [name for name in data.columns if name != "target"]
feature_means = stats.loc['mean', feature_names]
feature_stds = stats.loc['std', feature_names]

# Heuristic: a variable counts as normalized when its mean is close to 0
# (|mean| < 0.1) and its standard deviation is close to 1 (0.9 < std < 1.1).
is_normalized = (feature_means.abs() < 0.1) & (feature_stds > 0.9) & (feature_stds < 1.1)

# Heuristic: a variable is flagged when its mean lies outside [-0.1, 0.1].
# NOTE(review): this tests the mean, not the skewness of the distribution;
# confirm that this is the intended meaning of "sesgo".
is_biased = (feature_means > 0.1) | (feature_means < -0.1)

# Gather the flags in a DataFrame so they are shown as a table in the notebook.
column_info_df = pd.DataFrame({
    'Nombre de variable': feature_names,
    'Está normalizada': is_normalized.to_numpy(),
    'Tiene sesgo': is_biased.to_numpy(),
})
print(column_info_df)


         Nombre de variable  Está normalizada  Tiene sesgo
0               mean radius             False         True
1              mean texture             False         True
2            mean perimeter             False         True
3                 mean area             False         True
4           mean smoothness             False        False
5          mean compactness             False         True
6            mean concavity             False        False
7       mean concave points             False        False
8             mean symmetry             False         True
9    mean fractal dimension             False        False
10             radius error             False         True
11            texture error             False         True
12          perimeter error             False         True
13               area error             False         True
14         smoothness error             False        False
15        compactness error             False        Fal

In [7]:
# Number of records per class of the target variable.
target_counts = data['target'].value_counts()
print(target_counts)

# The target is considered balanced when the gap between the most frequent and
# the least frequent class is below 10% of the total number of records.
# max()/min() replace the lookups by the hard-coded labels 0 and 1, so the
# check does not depend on how the classes are encoded or on how many classes
# there are; for a binary 0/1 target the result is the same.
balanced = (target_counts.max() - target_counts.min()) < 0.1 * data.shape[0]
print("¿Es una variable balanceada?", balanced)


1.0    357
0.0    212
Name: target, dtype: int64
¿Es una variable balanceada? False


Vista minable para Redes Neuronales

In [17]:
# Transformations for Neural Networks
# Imported in this cell so that it also runs on a freshly restarted kernel.
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler

# Work on a copy so the loaded dataset is left untouched.
data_nn = data.copy()
nn_feature_columns = [column for column in data_nn.columns if column != "target"]

# Standardize every feature. StandardScaler centers and scales each column
# independently, so one fit over all features matches a per-column fit.
scaler = StandardScaler()
data_nn[nn_feature_columns] = scaler.fit_transform(data_nn[nn_feature_columns])

if not balanced:
    # Oversample the minority class; the seed is fixed so that the resampled
    # view is the same on every run.
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(data_nn.drop('target', axis=1), data_nn['target'])
else:
    # Already balanced: expose the scaled data under the same names so the
    # prints below (and any later cell) work in both cases.
    X_resampled, y_resampled = data_nn.drop('target', axis=1), data_nn['target']

print("\nVista minable para Redes Neuronales:")
print(X_resampled.head(10))
print(y_resampled.head(10))


Vista minable para Redes Neuronales:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0     1.412972     -1.412358        1.412972   1.412972         1.409878   
1     1.412972     -0.706799        1.412972   1.412972        -0.706799   
2     1.412972      0.704319        1.412972   1.412972         1.409878   
3    -0.706486      0.704319       -0.706486  -1.412972         1.409878   
4     1.412972     -1.412358        1.412972   1.412972         0.704319   
5    -0.706486     -0.706799        0.000000  -0.706486         1.409878   
6     1.412972      0.704319        1.412972   1.412972        -0.001240   
7     0.000000      0.704319        0.000000   0.000000         1.409878   
8     0.000000      0.704319        0.000000   0.000000         1.409878   
9    -0.706486      1.409878        0.000000  -0.706486         1.409878   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0          1.412972        1.412972             1.

Vista minable para Árboles de Decisión

In [18]:
# Imported in this cell so that it also runs on a freshly restarted kernel.
from sklearn.preprocessing import KBinsDiscretizer

# Work on a copy so the loaded dataset is left untouched.
data_tree = data.copy()
tree_feature_columns = [column for column in data_tree.columns if column != "target"]

# Discretize every feature into (up to) 5 quantile-based bins encoded as
# ordinal codes. Bin edges are computed per feature, so one fit over all
# features matches a per-column fit.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
data_tree[tree_feature_columns] = discretizer.fit_transform(data_tree[tree_feature_columns])

print("\nVista minable para Árboles de Decisión:")
print(data_tree.head(10))


Vista minable para Árboles de Decisión:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          4.0           0.0             4.0        4.0              3.0   
1          4.0           1.0             4.0        4.0              1.0   
2          4.0           3.0             4.0        4.0              3.0   
3          1.0           3.0             1.0        0.0              3.0   
4          4.0           0.0             4.0        4.0              3.0   
5          1.0           1.0             2.0        1.0              3.0   
6          4.0           3.0             4.0        4.0              2.0   
7          2.0           3.0             2.0        2.0              3.0   
8          2.0           3.0             2.0        2.0              3.0   
9          1.0           3.0             2.0        1.0              3.0   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0               4.0             4.0            

