In [1]:
import pandas as pd
import numpy as np

# Toy employee dataset with np.nan scattered across the columns; it is the
# input for every null-handling example that follows.
data = dict(
    WorkId=[1, 2, 3, 4, 5, 6, 7, 8],
    Age=[25, np.nan, 28, 40, np.nan, 32, 45, 27],
    Department=['HR', np.nan, 'HR', 'IT', 'Finance', np.nan, 'Finance', 'HR'],
    Salary=[50000, 60000, np.nan, 80000, np.nan, 100000, 85000, 47000],
    Perf_Score=[85, 90, np.nan, np.nan, 78, 95, 88, 79],
    WorkHours=[8, 9, 7, 8, np.nan, np.nan, 9, 8],
    Bonus=[100, 150, 200, 250, np.nan, 350, 400, 450],
    N_Children=[np.nan, 1, 2, np.nan, 4, np.nan, 3, 1],
    TimeInComp=[36, 24, np.nan, 18, 17, np.nan, 13, np.nan],  # months
)

# One column per dict key, in insertion order.
df = pd.DataFrame(data)

df

Unnamed: 0,WorkId,Age,Department,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
0,1,25.0,HR,50000.0,85.0,8.0,100.0,,36.0
1,2,,,60000.0,90.0,9.0,150.0,1.0,24.0
2,3,28.0,HR,,,7.0,200.0,2.0,
3,4,40.0,IT,80000.0,,8.0,250.0,,18.0
4,5,,Finance,,78.0,,,4.0,17.0
5,6,32.0,,100000.0,95.0,,350.0,,
6,7,45.0,Finance,85000.0,88.0,9.0,400.0,3.0,13.0
7,8,27.0,HR,47000.0,79.0,8.0,450.0,1.0,


## **Tratamiento de nulos - Parte 1**

In [None]:
# Drop every row that contains at least one null value.
df_dropna_rows = df.dropna(axis='index', how='any')
df_dropna_rows

In [None]:
# Keep only the rows that have at least 6 non-null values; rows with fewer
# non-null values are dropped.
df_dropna_rows = df.dropna(axis='index', thresh=6)
df_dropna_rows

In [None]:
# Drop one specific column by name. This removes only 'N_Children'; it does
# NOT drop every column that contains nulls.
df_dropna_cols = df.drop('N_Children', axis=1)
df_dropna_cols

In [None]:
# Fill the nulls of a single column with a constant value.
# The result is stored in a new frame instead of being written back into
# `df`: the later cells (group-wise fill, KNN imputation) are shown operating
# on a `df` that still has its original nulls, so mutating `df` here would
# make a top-to-bottom re-run produce different results from those outputs.
df_fillna_children = df.assign(N_Children=df['N_Children'].fillna(0))
df_fillna_children

In [None]:
# Fill nulls with a per-column constant: each key names a column and its
# value is what replaces that column's nulls. Columns not listed are left
# untouched.
constant_fill_values = {
    'Age': 0,
    'Salary': 50000,
    'Department': 'Unknown',
}
df_fillna_constant = df.fillna(value=constant_fill_values)
df_fillna_constant

In [None]:
# Fill missing ages with the median age (the median ignores nulls).
# The result is stored in a new frame instead of overwriting df['Age']:
# the later cells are shown operating on a `df` whose Age column still
# contains nulls, so mutating `df` here would make a top-to-bottom re-run
# diverge from those outputs.
df_fillna_median = df.assign(Age=df['Age'].fillna(df['Age'].median()))
df_fillna_median

In [None]:
# Fill missing departments with the most frequent department.
# mode() returns a Series because several values can tie; [0] takes the first.
# The result is stored in a new frame instead of overwriting df['Department']:
# the group-wise fill cells further down rely on `df` still having rows with
# a missing Department, so mutating `df` here would change what they show.
most_common_department = df['Department'].mode()[0]
df_fillna_mode = df.assign(
    Department=df['Department'].fillna(most_common_department))
df_fillna_mode

## **Tratamiento de nulos - Parte 2**

In [2]:
# Group-wise fill: replace each missing Salary with the mean Salary of the
# row's Department.
# Caveat: rows whose Department is itself missing do not belong to any group,
# and (as this cell's output shows) their Salary comes back as NaN even when
# it was originally populated.
df_fill_by_group = df.copy()
salary_by_department = df_fill_by_group.groupby('Department')['Salary']
df_fill_by_group['Salary'] = salary_by_department.transform(
    lambda dept_salaries: dept_salaries.fillna(dept_salaries.mean()))
df_fill_by_group

Unnamed: 0,WorkId,Age,Department,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
0,1,25.0,HR,50000.0,85.0,8.0,100.0,,36.0
1,2,,,,90.0,9.0,150.0,1.0,24.0
2,3,28.0,HR,48500.0,,7.0,200.0,2.0,
3,4,40.0,IT,80000.0,,8.0,250.0,,18.0
4,5,,Finance,85000.0,78.0,,,4.0,17.0
5,6,32.0,,,95.0,,350.0,,
6,7,45.0,Finance,85000.0,88.0,9.0,400.0,3.0,13.0
7,8,27.0,HR,47000.0,79.0,8.0,450.0,1.0,


In [3]:
# Mean salary per department; rows with a missing Department form no group.
df[['Department', 'Salary']].groupby('Department').mean()

Unnamed: 0_level_0,Salary
Department,Unnamed: 1_level_1
Finance,85000.0
HR,48500.0
IT,80000.0


In [4]:
# Same group-wise fill, but restricted to the rows whose Department is known,
# so rows without a Department keep their original Salary untouched.
df_fill_by_group = df.copy()
mask = df_fill_by_group['Department'].notna()

# Compute the filled salaries on the masked subset only ...
known_department_rows = df_fill_by_group[mask]
group_filled_salary = known_department_rows.groupby('Department')['Salary'].transform(
    lambda dept_salaries: dept_salaries.fillna(dept_salaries.mean()))

# ... and write them back to exactly those rows.
df_fill_by_group.loc[mask, 'Salary'] = group_filled_salary
df_fill_by_group

Unnamed: 0,WorkId,Age,Department,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
0,1,25.0,HR,50000.0,85.0,8.0,100.0,,36.0
1,2,,,60000.0,90.0,9.0,150.0,1.0,24.0
2,3,28.0,HR,48500.0,,7.0,200.0,2.0,
3,4,40.0,IT,80000.0,,8.0,250.0,,18.0
4,5,,Finance,85000.0,78.0,,,4.0,17.0
5,6,32.0,,100000.0,95.0,,350.0,,
6,7,45.0,Finance,85000.0,88.0,9.0,400.0,3.0,13.0
7,8,27.0,HR,47000.0,79.0,8.0,450.0,1.0,


In [5]:
# Linear interpolation of TimeInComp along the row order: each gap between
# two known values is filled on the straight line joining them. The trailing
# null in the last row has no later value, and the output shows it filled
# with the last known value.
df_interpolated = df.assign(
    TimeInComp=df['TimeInComp'].interpolate(method='linear'))
df_interpolated

Unnamed: 0,WorkId,Age,Department,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
0,1,25.0,HR,50000.0,85.0,8.0,100.0,,36.0
1,2,,,60000.0,90.0,9.0,150.0,1.0,24.0
2,3,28.0,HR,,,7.0,200.0,2.0,21.0
3,4,40.0,IT,80000.0,,8.0,250.0,,18.0
4,5,,Finance,,78.0,,,4.0,17.0
5,6,32.0,,100000.0,95.0,,350.0,,15.0
6,7,45.0,Finance,85000.0,88.0,9.0,400.0,3.0,13.0
7,8,27.0,HR,47000.0,79.0,8.0,450.0,1.0,13.0


In [6]:
from sklearn.impute import KNNImputer

# Prepare the data for KNN imputation: the identifier and the categorical
# Department column are excluded, leaving only numeric feature columns.
knn_df = df.drop(columns=['WorkId', 'Department'])
imputer = KNNImputer(n_neighbors=2)
# By default fit_transform returns a plain array, so the frame built from it
# would get a fresh 0..n-1 index. Reuse knn_df's index so the column
# assignment below aligns row-for-row with df even when df's index is not the
# default one (e.g. after filtering rows).
df_imputed = pd.DataFrame(imputer.fit_transform(knn_df),
                          columns=knn_df.columns,
                          index=knn_df.index)

# Rebuild the DataFrame, taking only the imputed Salary and Bonus columns;
# every other column keeps its original values (including nulls).
df_knn = df.copy()
df_knn[['Salary', 'Bonus']] = df_imputed[['Salary', 'Bonus']]
df_knn

Unnamed: 0,WorkId,Age,Department,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
0,1,25.0,HR,50000.0,85.0,8.0,100.0,,36.0
1,2,,,60000.0,90.0,9.0,150.0,1.0,24.0
2,3,28.0,HR,70000.0,,7.0,200.0,2.0,
3,4,40.0,IT,80000.0,,8.0,250.0,,18.0
4,5,,Finance,63500.0,78.0,,225.0,4.0,17.0
5,6,32.0,,100000.0,95.0,,350.0,,
6,7,45.0,Finance,85000.0,88.0,9.0,400.0,3.0,13.0
7,8,27.0,HR,47000.0,79.0,8.0,450.0,1.0,


In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

# Apply MinMaxScaler to the numeric feature columns before imputing, so that
# columns with large values (e.g. Salary) do not dominate the neighbour
# distances.
scaler = MinMaxScaler()
df_to_normalize = df.drop(columns=['WorkId', 'Department'])
# The scaler and the imputer return plain arrays by default; every frame
# rebuilt from them reuses df_to_normalize's index so that the final column
# assignment aligns row-for-row with df even when df's index is not the
# default 0..n-1.
df_scaled = pd.DataFrame(scaler.fit_transform(df_to_normalize),
                         columns=df_to_normalize.columns,
                         index=df_to_normalize.index)

# Apply KNNImputer on the scaled features.
imputer = KNNImputer(n_neighbors=2)
df_imputed = pd.DataFrame(imputer.fit_transform(df_scaled),
                          columns=df_to_normalize.columns,
                          index=df_to_normalize.index)

# Undo the scaling to bring the imputed data back to the original units.
df_imputed_rev = pd.DataFrame(scaler.inverse_transform(df_imputed),
                              columns=df_to_normalize.columns,
                              index=df_to_normalize.index)

# Take only the imputed Salary and Bonus columns; all other columns keep
# their original values (including nulls).
df_knn = df.copy()
df_knn[['Salary', 'Bonus']] = df_imputed_rev[['Salary', 'Bonus']]
df_knn

Unnamed: 0,WorkId,Age,Department,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
0,1,25.0,HR,50000.0,85.0,8.0,100.0,,36.0
1,2,,,60000.0,90.0,9.0,150.0,1.0,24.0
2,3,28.0,HR,75000.0,,7.0,200.0,2.0,
3,4,40.0,IT,80000.0,,8.0,250.0,,18.0
4,5,,Finance,82500.0,78.0,,325.0,4.0,17.0
5,6,32.0,,100000.0,95.0,,350.0,,
6,7,45.0,Finance,85000.0,88.0,9.0,400.0,3.0,13.0
7,8,27.0,HR,47000.0,79.0,8.0,450.0,1.0,


In [8]:
# Define the nearest-neighbours model.
knn = NearestNeighbors(n_neighbors=4, metric='euclidean')

# Fit only on rows without nulls. The fitted frame is kept in a variable
# because kneighbors() returns row positions within the data it was fitted
# on; the final lookup must index this same frame to stay correct if
# dropna() ever removes rows.
knn_fit_data = df_imputed.dropna()
knn.fit(knn_fit_data)

# Look up the nearest neighbours of the row at position 4 (a row that had
# nulls before imputation). Double brackets keep it a one-row DataFrame with
# the same column names the model was fitted with.
fila_index = 4
fila = df_imputed.iloc[[fila_index]]

# Get the distances to, and positions of, the nearest neighbours. When the
# query row is part of the fitted data it is its own closest neighbour, at
# distance 0.
distancias, indices = knn.kneighbors(fila)

# Show the neighbours.
print('Índices de los vecinos más cercanos\n', indices)
print('\nDistancias a los vecinos\n', distancias)
print('\nFilas vecinas')
knn_fit_data.iloc[indices[0]]

Índices de los vecinos más cercanos
 [[4 3 6 5]]

Distancias a los vecinos
 [[0.         0.49238368 0.78335559 1.37622851]]

Filas vecinas


Unnamed: 0,Age,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
4,0.875,0.669811,0.0,0.75,0.642857,1.0,0.173913
3,0.75,0.622642,0.294118,0.5,0.428571,0.833333,0.217391
6,1.0,0.716981,0.588235,1.0,0.857143,0.666667,0.0
5,0.35,1.0,1.0,0.25,0.714286,0.5,0.108696


In [9]:
# Show the same neighbour rows in the original (un-scaled) units.
neighbor_positions = indices[0]
df_imputed_rev.iloc[neighbor_positions]

Unnamed: 0,Age,Salary,Perf_Score,WorkHours,Bonus,N_Children,TimeInComp
4,42.5,82500.0,78.0,8.5,325.0,4.0,17.0
3,40.0,80000.0,83.0,8.0,250.0,3.5,18.0
6,45.0,85000.0,88.0,9.0,400.0,3.0,13.0
5,32.0,100000.0,95.0,7.5,350.0,2.5,15.5


## **Tratamiento de nulos - Parte 3**