# Limpieza de datos

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw table from a CSV file given by a relative path.
# The cells below all read from `df` and never reassign it.
df = pd.read_csv("personas.csv")

In [64]:
# Plain-text dump of the frame so the missing entries (NaN) can be seen row by row.
print(df)

    age  height  sex  score
0  24.0   175.0    F   88.0
1  31.0     NaN    M   92.0
2   NaN   180.0    M   79.0
3  41.0   170.0  NaN   85.0
4  34.0     NaN    F    NaN
5  22.0   162.0    F   94.0
6  65.0   158.0    F   87.0
7  38.0   170.0    M   94.0
8  42.0     NaN    M   99.0
9   NaN   168.0    F   98.0


### Contar cuántos valores inválidos hay en cada columna

In [65]:
# Flag every cell as missing / not missing, then add the flags up
# down each column to get one missing-value count per column.
df.isna().sum(axis=0)

age       2
height    3
sex       1
score     1
dtype: int64

### Descartar individuos que contengan algún valor inválido (NaN)

In [66]:
# Keep only fully observed rows: a row is removed as soon as any one
# of its columns is missing. `df` itself is left unchanged.
df_drop_any = df.dropna(axis="index", how="any")
print(df_drop_any)

    age  height sex  score
0  24.0   175.0   F   88.0
5  22.0   162.0   F   94.0
6  65.0   158.0   F   87.0
7  38.0   170.0   M   94.0


### Descartar individuos que tengan valores inválidos en una columna específica

In [67]:
# Only the "score" column decides whether a row is removed;
# rows with gaps in other columns are kept as they are.
df_drop_subset = df.dropna(axis="index", subset=["score"])
print(df_drop_subset)

    age  height  sex  score
0  24.0   175.0    F   88.0
1  31.0     NaN    M   92.0
2   NaN   180.0    M   79.0
3  41.0   170.0  NaN   85.0
5  22.0   162.0    F   94.0
6  65.0   158.0    F   87.0
7  38.0   170.0    M   94.0
8  42.0     NaN    M   99.0
9   NaN   168.0    F   98.0


### Usando imputación en scikit-learn para rellenar valores inválidos

In [68]:
from sklearn.impute import SimpleImputer

### Usando media

In [69]:
# Fill each numeric gap with the mean of the observed values in its own column.
# "sex" is excluded: it holds text labels, for which a mean is not defined.
num_cols = ["age", "height", "score"]
imputer_mean = SimpleImputer(strategy="mean")
imputer_mean.fit(df[num_cols])  # learns one mean per column from the non-missing entries

# Write the filled values into a copy so that `df` keeps its original gaps.
df1 = df.copy()
df1[num_cols] = imputer_mean.transform(df[num_cols])

In [70]:
# Show the mean-imputed copy: only the numeric columns were filled,
# so a missing value in "sex" is still present.
print(df1)

      age  height  sex      score
0  24.000   175.0    F  88.000000
1  31.000   169.0    M  92.000000
2  37.125   180.0    M  79.000000
3  41.000   170.0  NaN  85.000000
4  34.000   169.0    F  90.666667
5  22.000   162.0    F  94.000000
6  65.000   158.0    F  87.000000
7  38.000   170.0    M  94.000000
8  42.000   169.0    M  99.000000
9  37.125   168.0    F  98.000000


### Usando más frecuente (para datos categóricos)

In [71]:
# Inspect the categorical column on its own before imputing it.
print(df["sex"])

0      F
1      M
2      M
3    NaN
4      F
5      F
6      F
7      M
8      M
9      F
Name: sex, dtype: object


In [72]:
# Tally how often each label occurs; missing entries are left out of the
# tally, so the top row is the value a most-frequent imputation would use.
df["sex"].value_counts(dropna=True)

sex
F    5
M    4
Name: count, dtype: int64

In [73]:
# Replace each missing label with the most common value of its column.
# Unlike the mean, this strategy is usable on text columns.
cat_cols = ["sex"]
imputer_mode = SimpleImputer(strategy="most_frequent")

# Work on a copy so that `df` keeps its original gaps; the numeric
# columns are not touched by this imputer.
df2 = df.copy()
df2[cat_cols] = imputer_mode.fit(df[cat_cols]).transform(df[cat_cols])

In [74]:
# Show the mode-imputed copy: "sex" has no gaps left, while the numeric
# columns were not imputed in this copy and keep their NaN entries.
print(df2)

    age  height sex  score
0  24.0   175.0   F   88.0
1  31.0     NaN   M   92.0
2   NaN   180.0   M   79.0
3  41.0   170.0   F   85.0
4  34.0     NaN   F    NaN
5  22.0   162.0   F   94.0
6  65.0   158.0   F   87.0
7  38.0   170.0   M   94.0
8  42.0     NaN   M   99.0
9   NaN   168.0   F   98.0
