In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Dataset con absurdos y NaN
data = {
    "distancia_km": [2, 5, 7, 0, 10, 12, 15],
    "pasajeros":    [1, 2, np.nan, 4, 5, np.nan, 3],
    "tarifa":       [8, 15, 20, 999, 40, 60, np.nan]
}
df = pd.DataFrame(data)

# Paso 1: marcar absurdos como NaN
df["distancia_km"] = df["distancia_km"].replace(0, np.nan)
df["tarifa"] = df["tarifa"].replace(999, np.nan)

# Paso 2: split train-test
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

print("Train con NaN:")
print(train_df)
print("\nTest con NaN:")
print(test_df)

# Paso 3: Escalado
scaler = StandardScaler()

# Fit en train
train_scaled = scaler.fit_transform(train_df)
test_scaled = scaler.transform(test_df)

# Paso 4: Imputación
imputer = KNNImputer(n_neighbors=2, weights="distance")

# Fit-transform en train
train_imputed_scaled = imputer.fit_transform(train_scaled)

# Solo transform en test (usando lo aprendido en train)
test_imputed_scaled = imputer.transform(test_scaled)

# Paso 5: Desescalar
train_imputed = scaler.inverse_transform(train_imputed_scaled)
test_imputed = scaler.inverse_transform(test_imputed_scaled)

# Paso 6: volver a DataFrames
train_imputed = pd.DataFrame(train_imputed, columns=df.columns)
test_imputed = pd.DataFrame(test_imputed, columns=df.columns)

print("\nTrain imputado:")
print(train_imputed.round(2))

print("\nTest imputado:")
print(test_imputed.round(2))


Train con NaN:
   distancia_km  pasajeros  tarifa
2           7.0        NaN    20.0
4          10.0        5.0    40.0
3           NaN        4.0     NaN
6          15.0        3.0     NaN

Test con NaN:
   distancia_km  pasajeros  tarifa
0           2.0        1.0     8.0
1           5.0        2.0    15.0
5          12.0        NaN    60.0

Train imputado:
   distancia_km  pasajeros  tarifa
0           7.0       4.22   20.00
1          10.0       5.00   40.00
2          12.5       4.00   40.00
3          15.0       3.00   30.87

Test imputado:
   distancia_km  pasajeros  tarifa
0           2.0       1.00     8.0
1           5.0       2.00    15.0
2          12.0       3.76    60.0
