In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
# Load the raw dataset (path is relative to the notebook's working directory).
data = pd.read_csv('../data/base.csv', delimiter=",")
# Only the last expression of a cell is rendered, so a bare `data.head()` in the
# middle of the cell was silently discarded. `display` (injected into the
# namespace by the Jupyter/IPython kernel) renders the preview explicitly.
display(data.head())
# Column names, non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 48 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   fecha                          640 non-null    str    
 1   referencia                     640 non-null    str    
 2   alcalinidad_total              93 non-null     str    
 3   aluminio                       233 non-null    str    
 4   bario                          12 non-null     str    
 5   bicarbonatos                   71 non-null     str    
 6   cadmio                         404 non-null    str    
 7   calcio                         13 non-null     str    
 8   carbonatos                     71 non-null     str    
 9   cianuro_libre                  121 non-null    float64
 10  cloruros                       27 non-null     str    
 11  cobre                          374 non-null    str    
 12  coliformes_termotolerantes     550 non-null    str    
 13  c

In [23]:
# Database clean-up
# 1. Coerce every measurement column to numeric; unparseable cells become NaN.
cols_a_limpiar = data.columns.drop(['fecha', 'referencia', 'obs'])

# First number found in a cell: optional sign, mantissa, optional exponent.
# The exponent part matters: without it a value written as "2.4E+03" would be
# truncated to 2.4, and the sign would be dropped from negative readings.
patron_numerico = r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'

for col in cols_a_limpiar:
    # Columns that are already numeric need no text parsing. Skipping them avoids
    # a float -> str -> float round trip (str() renders small values as '1e-05')
    # and makes this cell safe to re-run, since it overwrites `data` in place.
    if pd.api.types.is_numeric_dtype(data[col]):
        continue
    # Decimal commas are normalised to dots before extracting the number.
    texto = data[col].astype(str).str.replace(',', '.', regex=False)
    data[col] = pd.to_numeric(
        texto.str.extract(patron_numerico, expand=False),
        errors='coerce',
    )

# 2. Drop columns with more than 50% nulls, i.e. keep columns that have at
#    least half of the rows populated. `thresh` is documented as an int.
umbral = int(np.ceil(len(data) * 0.5))
data_fixed = data.dropna(thresh=umbral, axis=1)

# 3. Impute the remaining nulls with each surviving column's own median.
#    fillna aligns the Series of medians on column names.
data_fixed = data_fixed.fillna(data_fixed.median(numeric_only=True))

data_fixed.head()


Unnamed: 0,fecha,referencia,cadmio,cobre,coliformes_termotolerantes,coliformes_totales,cromo,dbo5,dqo,hierro,manganeso,niquel,ph,plomo,solidos_suspendidos,solidos_totales,zinc
0,06/04/01,P1,0.5,1.0,3.0,3.0,2.0,0.68,11.81,1.883,0.05285,4.0,8.2,10.0,41.0,313.0,0.1353
1,06/04/01,P2,0.5,1.0,2.2,7.0,2.0,0.18,23.42,5.179,0.1641,4.0,8.36,10.0,185.0,453.0,0.09056
2,06/04/01,P3,0.5,1.0,7.0,1.7,2.0,0.48,19.68,6.043,0.2047,4.0,8.47,10.0,209.0,470.0,0.09999
3,06/04/01,P4,0.5,1.0,1.1,1.1,2.0,0.58,7.87,3.159,0.09761,4.0,8.34,10.0,74.0,350.0,0.09698
4,07/04/01,P1,0.5,2.89,1.3,1.3,2.0,0.35,7.75,1.093,0.03675,5.2,8.44,10.0,0.0,293.0,0.1172


In [24]:
# Print the cleaned frame's summary (columns, non-null counts, dtypes).
data_fixed.info()

<class 'pandas.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   fecha                       640 non-null    str    
 1   referencia                  640 non-null    str    
 2   cadmio                      640 non-null    float64
 3   cobre                       640 non-null    float64
 4   coliformes_termotolerantes  640 non-null    float64
 5   coliformes_totales          640 non-null    float64
 6   cromo                       640 non-null    float64
 7   dbo5                        640 non-null    float64
 8   dqo                         640 non-null    float64
 9   hierro                      640 non-null    float64
 10  manganeso                   640 non-null    float64
 11  niquel                      640 non-null    float64
 12  ph                          640 non-null    float64
 13  plomo                       640 non-null    fl