In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.impute import KNNImputer
from copy import deepcopy
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import statistics 
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the transactions CSV (path is relative to the notebook's working
# directory) and print the per-column dtype / non-null summary.
df = pd.read_csv("allegro-api-transactions.csv")
df.info()

# Part 1

## Target Encoder

In [None]:
# Encode `it_location` against the `price` target with a TargetEncoder.
# NOTE(review): category_encoders documents `smoothing` as strictly positive —
# confirm that smoothing=0 behaves as intended on the installed version.
encoder1 = ce.TargetEncoder(
    cols=['it_location'],
    smoothing=0,
    return_df=True,
)
df_transformed = encoder1.fit_transform(df, y=df['price'])

# Preview the encoded column (first five rows).
df_transformed.loc[:, "it_location"].head()


One-hot encoding zwraca wielowymiarową zmienną, która pozwala nam w pewien sposób zakodować zmienne kategoryczne
tak, by były łatwiejsze do przetwarzania. Użycie target encoding pozwala nie tylko zakodować zmienne, ale także
zrobić to w taki sposób, by powiązać je z naszym targetem, np. zamienić lokalizację na średnią cenę towarów z tej lokalizacji.

## One hot encoder

In [None]:
# One-hot encode `main_category` with category_encoders. The target is passed
# only to keep the call shape consistent with the supervised encoders used in
# the other cells.
encoder2 = ce.OneHotEncoder(cols=["main_category"])
df_transformed = encoder2.fit_transform(df, df["price"])
# Display the encoded result rather than the raw input frame, so the new
# indicator columns are actually visible.
df_transformed.head(2)

In [None]:
# Map every `main_category` label to an integer code and print the codes.
le = LabelEncoder()
integer_encoded = le.fit(df["main_category"]).transform(df["main_category"])
print(integer_encoded)

In [None]:
# One-hot encode the integer category codes with scikit-learn.
# The encoder is left at its default (sparse) output and densified with
# .toarray() instead of passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed, so this form runs on
# both old and new releases.
onehot_encoder = OneHotEncoder()
# The encoder expects a 2-D (n_samples, n_features) array; reshape(-1, 1) is a
# no-op when the cell is re-run on the already reshaped array.
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)

In [None]:
# The same encoding via pandas; drop_first=True omits the first category's
# indicator column. Only the first two rows are shown.
pd.get_dummies(df.main_category, drop_first=True).iloc[:2]

## HashingEncoder


In [None]:
# Apply category_encoders' HashingEncoder (default settings) to `main_category`.
encoder3 = ce.HashingEncoder(
    cols=["main_category"],
)
df_transformed = encoder3.fit_transform(df, y=df['price'])

In [None]:
# First ten rows of the first nine columns of the encoded frame.
df_transformed.iloc[:10, :9]

- Na pierwszy rzut oka wynik wygląda podobnie do one-hot encoding, ale HashingEncoder zwrócił mniej kolumn. Ciężko więc domyślić się, czym są te kolumny.

## CatBoostEncoder

In [None]:
# Apply category_encoders' CatBoostEncoder to `main_category`, with `price`
# as the target.
encoder4 = ce.CatBoostEncoder(
    cols=["main_category"],
)
df_transformed = encoder4.fit_transform(df, y=df['price'])

In [None]:
# Encoded `main_category` values for the first 20 rows.
df_transformed.loc[:, "main_category"].head(20)

In [None]:
# Original (unencoded) `main_category` labels for the same 20 rows.
df["main_category"].head(20)

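Wygląda podobnie do target encoding. Dla pierwszych 20 wierszy często pojawia się wartość 76.811350, ale kiedy przyjrzymy się, jak wcześniej wyglądała kolumna `main_category`, to widzimy, że ta wartość nie jest przyporządkowana do jednej kategorii. Najpewniej wynika to z tego, że CatBoostEncoder koduje każdy wiersz na podstawie wcześniejszych wierszy, więc pierwsze wystąpienie każdej kategorii dostaje tę samą wartość bazową.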

# Part 2

In [None]:
# Reference data for the imputation experiments: three numeric columns from
# the first 10 000 rows. Cast to float up front so NaN can be written without
# relying on an implicit int -> float upcast.
df2 = df[["price", 'it_seller_rating', 'it_quantity']].head(10000).astype(float)

# Pick 10% of the rows whose `it_seller_rating` will be blanked out
# (fixed random_state so the preview is reproducible).
samp = df2['it_seller_rating'].sample(frac=0.1, random_state=0)

# Work on an independent copy: a plain `df_nan = df2` only aliases the frame,
# so the "clean" reference would receive the NaNs as well and leak them into
# the cells that follow.
df_nan = df2.copy()
df_nan.loc[samp.index, "it_seller_rating"] = np.nan

# Inspect the frame that actually contains the gaps.
df_nan.info()

## Nearest neighbors imputation

In [None]:
# KNN imputation, gaps in `it_seller_rating` only: RMSE for 10 seeded runs.
#
# The complete reference frame is rebuilt here from `df` and never modified,
# so every repetition starts from the same clean data regardless of what
# earlier cells did to `df2`. Cast to float so NaN can be written without an
# implicit int -> float upcast.
df2 = df[["price", 'it_seller_rating', 'it_quantity']].head(10000).astype(float)

error1 = []
for i in range(10):
    # pandas' .sample() draws from NumPy's global RNG when no random_state is
    # given, so seeding here makes each repetition reproducible.
    np.random.seed(i)
    samp = df2['it_seller_rating'].sample(frac=0.1)

    # Blank out the sampled rows on a copy; the reference stays intact.
    df_nan = df2.copy()
    df_nan.loc[samp.index, "it_seller_rating"] = np.nan

    imputer = KNNImputer(n_neighbors=5)
    df_nni = pd.DataFrame(imputer.fit_transform(df_nan), columns=df_nan.columns)

    # RMSE is taken over all three columns; untouched columns contribute zero.
    e = np.sqrt(mean_squared_error(df2, df_nni))
    error1.append(e)

# Spread of the RMSE across the repetitions.
statistics.stdev(error1)

## Multivariate feature imputation

In [None]:
# Iterative (multivariate) imputation, gaps in `it_seller_rating` only:
# RMSE for 10 seeded runs.
#
# The complete reference frame is rebuilt here from `df` and never modified,
# so the loop no longer depends on mutating `df2` through an alias and then
# rebinding it. Cast to float so NaN can be written without an implicit
# int -> float upcast.
df2 = df[["price", 'it_seller_rating', 'it_quantity']].head(10000).astype(float)

error2 = []
for i in range(10):
    # pandas' .sample() draws from NumPy's global RNG when no random_state is
    # given, so seeding here makes each repetition reproducible.
    np.random.seed(i)
    samp = df2['it_seller_rating'].sample(frac=0.1)

    # Blank out the sampled rows on a copy; the reference stays intact.
    df_nan = df2.copy()
    df_nan.loc[samp.index, "it_seller_rating"] = np.nan

    imputer = IterativeImputer()
    # fit_transform performs the fit and the imputation in one pass; calling
    # fit() and then transform() on the same data would impute it twice.
    df_ii = pd.DataFrame(imputer.fit_transform(df_nan), columns=df_nan.columns)

    # RMSE is taken over all three columns; untouched columns contribute zero.
    e = np.sqrt(mean_squared_error(df2, df_ii))
    error2.append(e)

# Spread of the RMSE across the repetitions.
statistics.stdev(error2)

## Nearest neighbors imputation - braki w dwóch kolumnach

In [None]:
# KNN imputation, gaps in both `it_seller_rating` and `it_quantity`:
# RMSE for 10 seeded runs.
#
# The complete reference frame is rebuilt here from `df` and never modified,
# so the loop no longer depends on mutating `df2` through an alias and then
# rebinding it. Cast to float so NaN can be written without an implicit
# int -> float upcast.
df2 = df[["price", 'it_seller_rating', 'it_quantity']].head(10000).astype(float)

error3 = []
for i in range(10):
    # pandas' .sample() draws from NumPy's global RNG when no random_state is
    # given, so seeding here makes each repetition reproducible.
    np.random.seed(i)
    # Two independent 10% draws, one per column. Only the drawn row labels are
    # used, so each sample is taken from the column it will blank out.
    samp1 = df2['it_seller_rating'].sample(frac=0.1)
    samp2 = df2['it_quantity'].sample(frac=0.1)

    # Blank out the sampled rows on a copy; the reference stays intact.
    df_nan = df2.copy()
    df_nan.loc[samp1.index, "it_seller_rating"] = np.nan
    df_nan.loc[samp2.index, "it_quantity"] = np.nan

    imputer = KNNImputer(n_neighbors=5)
    df_nni = pd.DataFrame(imputer.fit_transform(df_nan), columns=df_nan.columns)

    # RMSE is taken over all three columns; `price` contributes zero error.
    e = np.sqrt(mean_squared_error(df2, df_nni))
    error3.append(e)

# Spread of the RMSE across the repetitions.
statistics.stdev(error3)

## Multivariate feature imputation - braki w dwóch kolumnach

In [None]:
# Iterative (multivariate) imputation, gaps in both `it_seller_rating` and
# `it_quantity`: RMSE for 10 seeded runs.
#
# The complete reference frame is rebuilt here from `df` and never modified,
# so the loop no longer depends on mutating `df2` through an alias and then
# rebinding it. Cast to float so NaN can be written without an implicit
# int -> float upcast.
df2 = df[["price", 'it_seller_rating', 'it_quantity']].head(10000).astype(float)

error4 = []
for i in range(10):
    # pandas' .sample() draws from NumPy's global RNG when no random_state is
    # given, so seeding here makes each repetition reproducible.
    np.random.seed(i)
    # Two independent 10% draws, one per column. Only the drawn row labels are
    # used, so each sample is taken from the column it will blank out.
    samp1 = df2['it_seller_rating'].sample(frac=0.1)
    samp2 = df2['it_quantity'].sample(frac=0.1)

    # Blank out the sampled rows on a copy; the reference stays intact.
    df_nan = df2.copy()
    df_nan.loc[samp1.index, "it_seller_rating"] = np.nan
    df_nan.loc[samp2.index, "it_quantity"] = np.nan

    imputer = IterativeImputer()
    # fit_transform performs the fit and the imputation in one pass; calling
    # fit() and then transform() on the same data would impute it twice.
    df_ii = pd.DataFrame(imputer.fit_transform(df_nan), columns=df_nan.columns)

    # RMSE is taken over all three columns; `price` contributes zero error.
    e = np.sqrt(mean_squared_error(df2, df_ii))
    error4.append(e)

# Spread of the RMSE across the repetitions.
statistics.stdev(error4)

In [None]:
# Gather the per-seed RMSE lists into one table (one column per experiment,
# one row per seed) and draw each column as a line.
wyniki = pd.DataFrame({
    'NNI-1': error1,
    'NNI-2': error3,
    'II-1': error2,
    'II-2': error4,
})
sns.lineplot(data=wyniki)

- Mniejsze odchylenie standardowe błędu, kiedy usunęliśmy dane z 2 kolumn.
- Multivariate feature imputation działa szybciej niż nearest neighbors imputation.
- Obie metody imputacji dają podobne wyniki.