# Braki danych

## 0) Importy


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.impute import SimpleImputer, KNNImputer

# Część A

## A1) Dane wejściowe

In [None]:
# Toy customer table with deliberately messy values: placeholder strings
# ("NULL", "", "?"), a 9999 sentinel, real None/NaN entries and dates written
# in more than one layout.
raw_columns = {
    "customer_id": ["C001", "C002", "C003", "C004", "C005", "C006"],
    "age": ["34", "NULL", "27", "", None, "9999"],
    "income": ["4000", "5200", "?", "6100", None, "7000"],
    "city": ["Warszawa", "Kraków", "NULL", "Gdańsk", "", None],
    "signup_date": ["2024-01-01", "2024/01/05", "bad_date", "", None, "2024-02-10"],
    "spent_total": [120.5, np.nan, 0.0, 9999.0, 55.0, np.nan],
}
df = pd.DataFrame(raw_columns)
display(df)

## A2) Standaryzujemy „braki udawane” (placeholdery) → `NaN`

In [None]:
# Text tokens that stand in for "no value" in the raw data.
PLACEHOLDERS = ["", " ", "NULL", "null", "?", "N/A", "NA"]

# Cells whose whole value equals one of the tokens become NaN.
# `replace` returns a new frame, so `df` itself is not modified.
df2 = df.replace(to_replace=PLACEHOLDERS, value=np.nan)
display(df2)

## A3) Sentinele (np. 9999) → `NaN`

In [None]:
df3 = df2.copy()

# 'age' is still text at this stage, so the sentinel is matched as a string.
age_is_sentinel = df3["age"].astype("string") == "9999"
df3.loc[age_is_sentinel, "age"] = np.nan

# 'spent_total' is numeric (float), so the sentinel is matched as a float.
spent_is_sentinel = df3["spent_total"] == 9999.0
df3.loc[spent_is_sentinel, "spent_total"] = np.nan

display(df3)

## A4) Konwersje typów 

In [None]:
df4 = df3.copy()

# Numeric columns arrive as text; anything that cannot be parsed becomes NaN.
df4["age"] = pd.to_numeric(df4["age"], errors="coerce")
df4["income"] = pd.to_numeric(df4["income"], errors="coerce")

# The column mixes layouts ("2024-01-01" and "2024/01/05"). Without an explicit
# format, pandas >= 2.0 infers a single format from the first value and, with
# errors="coerce", silently turns the other valid layout into NaT.
# format="mixed" parses every value on its own, so only truly invalid entries
# (e.g. "bad_date") end up as NaT. Requires pandas >= 2.0.
df4["signup_date"] = pd.to_datetime(
    df4["signup_date"], errors="coerce", format="mixed"
)

display(df4)
print(df4.dtypes)


## A5) Raport braków (po standaryzacji)


In [None]:
# Share of missing values per column (in percent), largest first.
missing_pct = df4.isna().mean() * 100
display(missing_pct.sort_values(ascending=False).to_frame("missing_%"))

## A6) Uzupełnianie proste: `fillna(...)`

In [None]:
df_simple = df4.copy()

# 1) Numeric columns: fill each gap with that column's own median.
numeric_cols = ["age", "income", "spent_total"]
medians = df_simple[numeric_cols].median()
for name in numeric_cols:
    df_simple[name] = df_simple[name].fillna(medians[name])

# 2) Categorical column: label missing cities explicitly.
df_simple.loc[df_simple["city"].isna(), "city"] = "unknown"

display(df_simple)

## A7) Ile braków zostało po prostej wersji?

In [None]:
# Percentage of values still missing per column after the plain fillna pass.
remaining_simple = (df_simple.isna().mean() * 100).sort_values(ascending=False)
display(remaining_simple.to_frame("missing_%_after_simple"))

## A8) Uzupełnianie z biblioteką: `SimpleImputer`

In [None]:
df_imp = df4.copy()

num_cols = ["age", "income", "spent_total"]
cat_cols = ["city"]

# Median for numeric columns, a constant label for the categorical one.
num_imp = SimpleImputer(strategy="median")
cat_imp = SimpleImputer(strategy="constant", fill_value="unknown")

df_imp[num_cols] = num_imp.fit_transform(df_imp[num_cols])

# SimpleImputer only fills entries matching `missing_values` (np.nan by
# default). A Python None stored in an object column is not matched and would
# be left unfilled, unlike pandas' fillna. Normalize every missing entry to
# np.nan first so both approaches treat the same cells as missing.
cat_input = df_imp[cat_cols].where(df_imp[cat_cols].notna(), np.nan)
df_imp[cat_cols] = cat_imp.fit_transform(cat_input)

display(df_imp)


## A9) Ile braków zostało po `SimpleImputer`?


In [None]:
# Percentage of values still missing per column after SimpleImputer.
remaining_imputer = (df_imp.isna().mean() * 100).sort_values(ascending=False)
display(remaining_imputer.to_frame("missing_%_after_simpleimputer"))


## A10) Szybkie porównanie: czy wynik jest podobny?

In [None]:
compare_cols = ["age", "income", "spent_total", "city"]


def _rows_as_text(frame):
    """Return the first six rows of `compare_cols` as 'a | b | c | d' strings."""
    as_text = frame[compare_cols].fillna("NA").astype(str).head(6)
    return as_text.apply(" | ".join, axis=1)


# Side-by-side view: one text line per row for each imputation approach.
display(pd.DataFrame({
    "simple_fillna": _rows_as_text(df_simple),
    "simpleimputer": _rows_as_text(df_imp),
}))

# Część B

## B1) Wczytanie danych (products)


In [None]:
import pandas as pd

# Path is relative to the notebook's working directory.
PRODUCTS_CSV = "olist_products_dataset.csv"

products = pd.read_csv(PRODUCTS_CSV)
display(products.head(n=3))
print("shape:", products.shape)

## B2) Raport braków (TOP 15)


In [None]:
# The 15 columns with the highest share of missing values (in percent).
products_missing_pct = (products.isna().mean() * 100).sort_values(ascending=False)
display(products_missing_pct.head(15).to_frame("missing_%"))


## B3) Proste uzupełnianie fillna


In [None]:
p_simple = products.copy()

# Keep only those weight/dimension columns that are present in the loaded file.
dimension_candidates = (
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
)
cols = [name for name in dimension_candidates if name in p_simple.columns]
print("Kolumny:", cols)

# Fill the gaps in each selected column with that column's median.
column_medians = p_simple[cols].median()
for name in cols:
    p_simple[name] = p_simple[name].fillna(column_medians[name])

remaining = (p_simple[cols].isna().mean() * 100).sort_values(ascending=False)
display(remaining.to_frame("missing_%_after_simple"))


## B4) `SimpleImputer` (mediana) dla tych samych kolumn


In [None]:
p_imp = products.copy()

# Same weight/dimension columns as in the fillna variant, if present.
dimension_candidates = (
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
)
cols = [name for name in dimension_candidates if name in p_imp.columns]

# Learn per-column medians, then write the imputed values back.
imp = SimpleImputer(strategy="median")
imp.fit(p_imp[cols])
p_imp[cols] = imp.transform(p_imp[cols])

remaining = (p_imp[cols].isna().mean() * 100).sort_values(ascending=False)
display(remaining.to_frame("missing_%_after_simpleimputer"))


## B5) KNNImputer — uzupełnianie “po podobieństwie”

In [None]:
dimension_candidates = (
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
)
cols = [name for name in dimension_candidates if name in products.columns]
print("Kolumny:", cols)

p_knn = products.copy()

# KNNImputer: each gap is filled from the 5 nearest rows (n_neighbors=5).
# NOTE(review): the columns are used unscaled, so the one with the largest
# numeric range will presumably dominate the distances - consider scaling.
knn = KNNImputer(n_neighbors=5)
p_knn[cols] = knn.fit_transform(p_knn[cols])

# Check how many values are still missing.
remaining = (p_knn[cols].isna().mean() * 100).sort_values(ascending=False)
display(remaining.to_frame("missing_%_after_knn"))

display(p_knn[cols].head(10))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
import pandas as pd

# NOTE(review): the spelling "lenght" is presumably the dataset's own column
# name - check the CSV header before "correcting" it.
col_target = "product_description_lenght"

# Baseline: only the values that were actually observed.
original_data = products[col_target].dropna()

# Variant 1: every gap replaced by the column median.
imp_median = SimpleImputer(strategy="median")
data_median = imp_median.fit_transform(products[[col_target]])
median_values = data_median.flatten()

# Variant 2: KNN imputation using two extra columns as neighbours' features;
# the target is the first column of the returned array.
cols_for_knn = [col_target, "product_photos_qty", "product_weight_g"]
imp_knn = KNNImputer(n_neighbors=5)
data_knn_all = imp_knn.fit_transform(products[cols_for_knn])
data_knn = data_knn_all[:, 0]

# Overlay the three density estimates on a single axes.
fig, ax = plt.subplots(figsize=(14, 7))
sns.kdeplot(original_data, label='Oryginał', color='black', linestyle='--', linewidth=2, ax=ax)
sns.kdeplot(median_values, label='SimpleImputer (Mediana)', color='red', fill=True, alpha=0.2, ax=ax)
sns.kdeplot(data_knn, label='KNN Imputer', color='green', fill=True, alpha=0.2, ax=ax)

ax.set_title(f"Rozkład: {col_target}")
ax.set_xlabel("Długość opisu")
ax.set_ylabel("Gęstość")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Summary statistics of the three variants side by side.
stats_compare = pd.DataFrame({
    "Oryginał": original_data.describe(),
    "Simple (Mediana)": pd.Series(median_values).describe(),
    "KNN": pd.Series(data_knn).describe(),
})
display(stats_compare.loc[['mean', '50%', 'std']].round(2))