In [1]:
import pandas as pd
import joblib
from tools.transformer import TabularTransformer

In [2]:
import plotly.express as px
# import plotly.graph_objs as go
# from plotly.offline import init_notebook_mode

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Read the raw dataset from disk into a DataFrame.
RAW_DATA_PATH = "data/data.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [5]:
# filtering prices
# Keep only rows whose PRICE lies strictly between the two bounds.
# The explicit .copy() makes `df` an independent frame, so adding columns to
# it afterwards (e.g. PRICE_NORM) does not raise SettingWithCopyWarning.
# Note: the filtered frame keeps its original row labels (no index reset).
MIN_PRICE = 15
MAX_PRICE = 2500
df = df.loc[(df["PRICE"] > MIN_PRICE) & (df["PRICE"] < MAX_PRICE), :].copy()

In [None]:
# Display the price-filtered frame.
df

In [None]:
# Plot the distribution of the filtered PRICE column.
px.histogram(
    data_frame=df,
    x="PRICE",
    title="Histogram",
)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Power transformer configured for the Box-Cox method
# (Box-Cox only accepts strictly positive input).
pow_trans = PowerTransformer(
    method="box-cox",
)

In [None]:
# The transformer works on 2-D input: pass PRICE as a single-column array,
# then flatten the transformed result back to 1-D before storing it.
price_column = df["PRICE"].to_numpy().reshape(-1, 1)
df["PRICE_NORM"] = pow_trans.fit_transform(price_column).ravel()

In [None]:
# Plot the distribution of the power-transformed prices.
px.histogram(
    data_frame=df,
    x="PRICE_NORM",
    title="Histogram",
)

In [None]:
# List the column names of the frame.
df.columns

In [6]:
# Column groups handed to the project's TabularTransformer.
# The order of the ordinal list is kept exactly as originally written.
NUMERIC_COLUMNS = ["HOURS_ONLINE"]
DATE_COLUMNS = ["DATE_SOLD"]
ORDINAL_COLUMNS = [
    "ID_UNIVERSE",
    "ID_CATEGORY",
    "ID_SUBCAT",
    "ID_SUB_SUBCAT",
    "ID_BRAND",
    "ID_MODEL",
    "ID_COLOR",
    "ID_MATERIAL",
    "ID_MATERIAL_TYPE",
    "ID_CONDITION",
    "VINTAGE",
    "ID_BRACELET",
    "ID_BOX",
    "ID_MECHANISM",
    "ID_SIZE_TYPE",
    "GEO2_SELLER",
    "ORDER_CURRENCY",
]

tt = TabularTransformer(
    numeric=NUMERIC_COLUMNS,
    dates=DATE_COLUMNS,
    ordinal=ORDINAL_COLUMNS,  # ordinal
    cat=[],  # one-hot
    highcat=[],  # presumably high-cardinality categoricals -- confirm in tools.transformer
)

In [7]:
# Fit `tt` on the filtered frame and keep the transformed result for the cells below.
input_list = tt.fit_transform(df)

In [10]:
# Persist the transformer object to disk with light compression.
PREPROCESSOR_PATH = 'models/preprocess.pkl'
joblib.dump(tt, PREPROCESSOR_PATH, compress=1)

['models/preprocess.pkl']

In [None]:
# Length of the transformed output (what this counts depends on the
# container type returned by fit_transform -- TODO confirm).
len(input_list)

In [None]:
# print([(item.shape, item.dtype) for item in input_list])

In [8]:
# Wrap the transformed output in a DataFrame labelled with the
# transformer's column names.
df_transformed = pd.DataFrame(
    data=input_list,
    columns=tt.columns,
)

In [9]:
# Display the encoded feature table.
df_transformed

Unnamed: 0,HOURS_ONLINE,DATE_SOLD_day_sin,DATE_SOLD_day_cos,DATE_SOLD_dayofweek_sin,DATE_SOLD_dayofweek_cos,DATE_SOLD_month_sin,DATE_SOLD_month_cos,DATE_SOLD_year,ID_UNIVERSE,ID_CATEGORY,...,ID_MATERIAL,ID_MATERIAL_TYPE,ID_CONDITION,VINTAGE,ID_BRACELET,ID_BOX,ID_MECHANISM,ID_SIZE_TYPE,GEO2_SELLER,ORDER_CURRENCY
0,1.374887,-0.485302,-0.874347,8.660254e-01,0.5,-8.660254e-01,-5.000000e-01,0.5,1.0,1.0,...,10.0,0.0,2.0,0.0,0.0,0.0,0.0,201.0,11.0,10.0
1,2.153557,-0.897805,-0.440394,-8.660254e-01,-0.5,-8.660254e-01,-5.000000e-01,0.5,1.0,2.0,...,23.0,0.0,2.0,0.0,0.0,0.0,0.0,164.0,9.0,5.0
2,-0.581196,-0.848644,0.528964,8.660254e-01,-0.5,-8.660254e-01,-5.000000e-01,0.5,1.0,4.0,...,32.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,9.0,11.0
3,0.070862,-0.485302,-0.874347,8.660254e-01,0.5,-8.660254e-01,-5.000000e-01,0.5,2.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,34.0,7.0,11.0
4,0.700494,0.571268,0.820763,0.000000e+00,1.0,-8.660254e-01,-5.000000e-01,0.5,2.0,2.0,...,23.0,0.0,2.0,0.0,0.0,0.0,0.0,185.0,9.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173735,-0.448972,-0.651372,-0.758758,-8.660254e-01,-0.5,1.000000e+00,6.123234e-17,1.0,1.0,1.0,...,12.0,0.0,2.0,0.0,0.0,0.0,0.0,201.0,11.0,11.0
173736,0.463826,0.651372,-0.758758,0.000000e+00,1.0,8.660254e-01,-5.000000e-01,1.0,1.0,2.0,...,32.0,0.0,6.0,0.0,0.0,0.0,0.0,160.0,3.0,11.0
173737,-0.488263,0.201299,0.979530,0.000000e+00,1.0,1.224647e-16,-1.000000e+00,0.5,2.0,3.0,...,39.0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,9.0,4.0
173738,-0.658630,0.937752,0.347305,-2.449294e-16,1.0,-2.449294e-16,1.000000e+00,0.5,1.0,4.0,...,8.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,7.0,4.0


In [None]:
# adding PRICE data
# `df_transformed` is indexed 0..n-1, whereas `df` still carries the row
# labels that survived the price filter. Assigning a Series aligns on index
# labels, which would put prices on the wrong rows (or leave NaN). Copy the
# values positionally instead.
# Assumes fit_transform preserved the row order of `df` -- TODO confirm.
df_transformed["PRICE"] = df["PRICE"].to_numpy()
df_transformed["ID_PRODUCT"] = df["ID_PRODUCT"].to_numpy()

df_transformed

In [None]:
# Write the encoded table to CSV without the index column.
ENCODED_DATA_PATH = "data/data_encoded.csv"
df_transformed.to_csv(ENCODED_DATA_PATH, index=False)