# Nulos: resumen orientativo (depende del contexto; no es una norma estricta)

## Primer paso:

**Exploración y entendimiento de los valores nulos**

Tenemos que buscar patrones y analizar tanto el número como la naturaleza de los valores nulos.

## Segundo paso:

### Tipos de valores nulos:

- Columnas categóricas:
    - Si el % de nulos es pequeño y tenemos una moda representativa, imputamos con la moda.
    - En caso contrario, creamos una categoría nueva, p. ej. "Unk" (desconocido).

- Columnas numéricas:
    - La mediana es más robusta que la media frente a valores atípicos a la hora de imputar los valores nulos.

In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Display configuration
# -----------------------------------------------------------------------
# Remove the cap on displayed columns so wide DataFrames are shown in full.
pd.options.display.max_columns = None

In [2]:
# Load the merchant dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    "files/merchant_ejercicios_apply.csv",
    index_col=0,
)

# Preview the first rows.
df.head()

Unnamed: 0,title,price,retail_price,units_sold,uses_ad_boosts,rating_sold,rating_count,rating_five_count,rating_four_count,rating_three_count,rating_two_count,rating_one_count,badge_local_product,badge_product_quality,badge_fast_shipping,tags,product_color,product_variation_size_id,shipping_option_name,shipping_option_price,shipping_is_express,countries_shipped_to,inventory_total,origin_country,merchant_name,merchant_info_subtitle,merchant_rating_count,merchant_id,product_url,product_id,listed_products,total_units_sold,mean_units_sold_per_product,rating_employee,merchant_ratings_count,mean_product_prices,mean_retail_prices,mean_product_ratings_count,discount_percentage,seller_reputation
0,2020 Summer Vintage Flamingo Print Pajamas Se...,16.0,14,100,No,3.76,54,26.0,8.0,10.0,1.0,9.0,No,Buena,No,"Summer,Fashion,womenunderwearsuit,printedpajam...",white,M,Livraison standard,4,No,34,50,CN,zgrdejia,,568,595097d6a26f6e070cb878d1,https://www.wish.com/c/5e9ae51d43d6a96e303acdb0,5e9ae51d43d6a96e303acdb0,1,100,100.0,4.129,568.0,16.0,14.0,54.0,14.29,Buena
1,SSHOUSE Summer Casual Sleeveless Soirée Party ...,8.0,22,20000,Si,3.45,6135,2269.0,1027.0,1118.0,644.0,1077.0,No,Buena,No,"Mini,womens dresses,Summer,Patchwork,fashion d...",green,XS,Livraison standard,2,No,41,50,CN,sarahouse,83 % avis positifs,17752,56458aa03a698c35c9050988,https://www.wish.com/c/58940d436a0d3d5da4e95a38,58940d436a0d3d5da4e95a38,6,21400,3567.0,3.9,17752.0,5.68,10.33,1057.0,-63.64,Regular
2,Nouvelle mode d'été femmes robe décontractée c...,4.9,8,1000,Si,3.83,99,43.0,18.0,23.0,8.0,7.0,No,Buena,No,"Summer,Fashion,Necks,Beach,Dress,Loose,beach d...",white,XXS,Livraison standard,1,No,41,50,CN,sarahouse,83 % avis positifs,17752,56458aa03a698c35c9050988,https://www.wish.com/c/5df2576d68963c1660471f9a,5df2576d68963c1660471f9a,6,21400,3567.0,3.9,17752.0,5.68,10.33,1057.0,-38.75,Regular
3,Summer Women s Fashion Lace Up Tie Pants Plus...,4.93,6,100,No,3.77,47,22.0,7.0,8.0,5.0,5.0,No,Buena,No,"Summer,Shorts,Lace,pants,Waist,Short pants,Yel...",red,S,Standard Shipping,1,No,41,50,CN,sarahouse,83% Positive Feedback,17752,56458aa03a698c35c9050988,https://www.wish.com/c/5d58daef3159a812b05933d2,5d58daef3159a812b05933d2,6,21400,3567.0,3.9,17752.0,5.68,10.33,1057.0,-17.83,Regular
4,Nouvelle mode d'été femmes robe décontractée c...,5.65,10,100,Si,2.86,7,1.0,1.0,2.0,2.0,1.0,No,Buena,No,"Summer,Fashion,Necks,Skirts,Dress,Loose,Women'...",black,XXS,Livraison standard,1,No,41,50,CN,sarahouse,83 % avis positifs,17752,56458aa03a698c35c9050988,https://www.wish.com/c/5e9fa7de39682a0043ab7898,5e9fa7de39682a0043ab7898,6,21400,3567.0,3.9,17752.0,5.68,10.33,1057.0,-43.5,Regular


In [3]:
# Small toy dataset for the imputation demo: an `Experiencia` column with no
# gaps and a `Salario` column in which three entries are NaN.
data = dict(
    Experiencia=[1, 3, 5, 2, 8, 10, 4, 6, 7, 9],
    Salario=[30000, 45000, np.nan, 35000, 80000, 95000, np.nan, 65000, 75000, np.nan],
)

df_ejemplo = pd.DataFrame(data=data)

In [4]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,1573.0,8.325372,3.93203,1.0,5.81,8.0,11.0,49.0
retail_price,1573.0,23.28862,30.357863,1.0,7.0,10.0,26.0,252.0
units_sold,1573.0,4339.005086,9356.539302,1.0,100.0,1000.0,5000.0,100000.0
rating_sold,1573.0,3.820896,0.515374,1.0,3.55,3.85,4.11,5.0
rating_count,1573.0,889.65925,1983.928834,0.0,24.0,150.0,855.0,20744.0
rating_five_count,1528.0,442.263743,980.20327,0.0,12.0,79.0,413.5,11548.0
rating_four_count,1528.0,179.599476,400.516231,0.0,5.0,31.5,168.25,4152.0
rating_three_count,1528.0,134.549738,311.690656,0.0,4.0,24.0,129.25,3658.0
rating_two_count,1528.0,63.711387,151.343933,0.0,2.0,11.0,62.0,2003.0
rating_one_count,1528.0,95.735602,214.075544,0.0,4.0,20.0,94.0,2789.0


In [5]:
# Display the toy frame; the empty Salario cells are the values to impute.
df_ejemplo

Unnamed: 0,Experiencia,Salario
0,1,30000.0
1,3,45000.0
2,5,
3,2,35000.0
4,8,80000.0
5,10,95000.0
6,4,
7,6,65000.0
8,7,75000.0
9,9,


In [6]:
# Same frame rendered through the Styler, with missing cells shaded yellow.
df_ejemplo.style.highlight_null(color='yellow')

Unnamed: 0,Experiencia,Salario
0,1,30000.0
1,3,45000.0
2,5,
3,2,35000.0
4,8,80000.0
5,10,95000.0
6,4,
7,6,65000.0
8,7,75000.0
9,9,


In [7]:
# KNN imputer configured to use the 3 nearest rows when filling a missing value.
imputer_knn = KNNImputer(n_neighbors=3)

In [None]:
# Fit the imputer and fill the gaps in a single step. The result is a plain
# NumPy array (no column labels), in the same column order as the input.
imputado = imputer_knn.fit_transform(
    df_ejemplo[["Experiencia", "Salario"]]
)

In [9]:
# Inspect the raw imputed array: column 0 is Experiencia, column 1 is Salario.
imputado

array([[1.00000000e+00, 3.00000000e+04],
       [3.00000000e+00, 4.50000000e+04],
       [5.00000000e+00, 6.16666667e+04],
       [2.00000000e+00, 3.50000000e+04],
       [8.00000000e+00, 8.00000000e+04],
       [1.00000000e+01, 9.50000000e+04],
       [4.00000000e+00, 4.83333333e+04],
       [6.00000000e+00, 6.50000000e+04],
       [7.00000000e+00, 7.50000000e+04],
       [9.00000000e+00, 8.33333333e+04]])

In [10]:
# Wrap the imputed array in a DataFrame for easier reading; since no column
# names are passed, the columns get the default integer labels.
pd.DataFrame(data=imputado)

Unnamed: 0,0,1
0,1.0,30000.0
1,3.0,45000.0
2,5.0,61666.666667
3,2.0,35000.0
4,8.0,80000.0
5,10.0,95000.0
6,4.0,48333.333333
7,6.0,65000.0
8,7.0,75000.0
9,9.0,83333.333333


In [11]:
# The original frame is untouched by the imputer: Salario still has its gaps.
df_ejemplo.head(n=5)

Unnamed: 0,Experiencia,Salario
0,1,30000.0
1,3,45000.0
2,5,
3,2,35000.0
4,8,80000.0


In [12]:
# Attach the imputed values next to the originals, one new column per array
# column (column 0 -> exp_knn, column 1 -> sal_knn).
df_ejemplo["exp_knn"] = imputado[:, 0]
df_ejemplo["sal_knn"] = imputado[:, 1]

In [13]:
# Original columns side by side with their KNN-imputed counterparts.
df_ejemplo

Unnamed: 0,Experiencia,Salario,exp_knn,sal_knn
0,1,30000.0,1.0,30000.0
1,3,45000.0,3.0,45000.0
2,5,,5.0,61666.666667
3,2,35000.0,2.0,35000.0
4,8,80000.0,8.0,80000.0
5,10,95000.0,10.0,95000.0
6,4,,4.0,48333.333333
7,6,65000.0,6.0,65000.0
8,7,75000.0,7.0,75000.0
9,9,,9.0,83333.333333


In [14]:
# Compare summary statistics before and after imputation (Salario vs. sal_knn),
# transposed so that each column is one row.
df_ejemplo.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Experiencia,10.0,5.5,3.02765,1.0,3.25,5.5,7.75,10.0
Salario,7.0,60714.285714,24567.690746,30000.0,40000.0,65000.0,77500.0,95000.0
exp_knn,10.0,5.5,3.02765,1.0,3.25,5.5,7.75,10.0
sal_knn,10.0,61833.333333,21793.786633,30000.0,45833.333333,63333.333333,78750.0,95000.0


In [15]:
# Keep only Experiencia and the imputed salary: the original Salario (with
# gaps) and exp_knn (a copy of Experiencia) are no longer needed.
# errors="ignore" makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df_ejemplo = df_ejemplo.drop(columns=["Salario", "exp_knn"], errors="ignore")

In [16]:
# Final frame: Experiencia plus the KNN-imputed salary (sal_knn).
df_ejemplo

Unnamed: 0,Experiencia,sal_knn
0,1,30000.0
1,3,45000.0
2,5,61666.666667
3,2,35000.0
4,8,80000.0
5,10,95000.0
6,4,48333.333333
7,6,65000.0
8,7,75000.0
9,9,83333.333333
