In [38]:
from functools import reduce
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

from utils import get_dataset, DataSet, cost

In [2]:
train = get_dataset(DataSet.Train)

In [3]:
train.head(2)

Unnamed: 0.1,Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,587969,587969,Men's H&M tshirt,1,Men/Tops/T-shirts,H&M,8.0,0,"Light yellow color, NWT"
1,94528,94528,Victoria Secret Vneck lot 3,2,Women/Tops & Blouses/T-Shirts,Victoria's Secret,13.0,1,victoria-s-secret-pink-essential-v-neck-tee vi...


In [4]:
train["shipping"].value_counts()

0    598209
1    484326
Name: shipping, dtype: int64

In [52]:
class MercariTranformer(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        vectorizer_name,
        vectorizer_desc,
    ):
        self._vectorizer_name = vectorizer_name
        self._vectorizer_desc = vectorizer_desc
        self._vectorizer_brand = LabelBinarizer(sparse_output=True)
        self._vectorizer_cond = LabelBinarizer()
        self._unique_categories = []
        self._category_map = {}
        self._m_category = None
        
    def fit(self, df: pd.DataFrame):
        self._vectorizer_name.fit(df["name"].fillna(""))        
        self._vectorizer_desc.fit(df["item_description"].fillna(""))
        self._vectorizer_brand.fit(df["brand_name"].fillna("Missing"))
        self._vectorizer_cond.fit(df["item_condition_id"])
        
        self._unique_categories = reduce(
            lambda x, y: set(x).union(y),
            df.loc[~df["category_name"].isnull(), "category_name"].str.split("/").values
        )
        self._category_map = dict([
            (c, i) for i, c in enumerate(self._unique_categories, 1)
        ])
        self._m_category = df["category_name"].fillna("").str.split("/").apply(len).max()
        
    def transform(self, df):
        return {
            "name": self._vectorizer_name.transform(df["name"].fillna("")),
            "item_description": self._vectorizer_desc.transform(df["item_description"].fillna('')),
            "brand_name": self._vectorizer_brand.transform(df["brand_name"].fillna("Missing")),
            "item_condition_id": self._vectorizer_cond.transform(df["item_condition_id"]),
            "shippting": df["shipping"],
            "category_name": np.vstack(self._transform_category(df))
        }
    
    def _transform_category(self, df):
        def f(lst):
            v = [0] * self._m_category
            for i, x in enumerate(lst):
                if x in self._category_map:
                    v[i] = self._category_map[x]
            return v
        return df["category_name"].fillna("").str.split("/").apply(f)

In [53]:
mercari_transformer = MercariTranformer(
    CountVectorizer(min_df=10),
    CountVectorizer(min_df=10),
)

In [54]:
%%time
mercari_transformer.fit(train)

CPU times: user 41.3 s, sys: 660 ms, total: 41.9 s
Wall time: 41.9 s


In [55]:
%%time
d = mercari_transformer.transform(train)

CPU times: user 1min 50s, sys: 492 ms, total: 1min 50s
Wall time: 1min 50s


In [57]:
d['name']

<1082535x15246 sparse matrix of type '<class 'numpy.int64'>'
	with 4412505 stored elements in Compressed Sparse Row format>

In [58]:
d['item_description']

<1082535x28037 sparse matrix of type '<class 'numpy.int64'>'
	with 22121241 stored elements in Compressed Sparse Row format>

In [59]:
d['brand_name']

<1082535x4427 sparse matrix of type '<class 'numpy.int64'>'
	with 1082535 stored elements in Compressed Sparse Row format>

In [60]:
d['item_condition_id']

array([[1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0]])

In [61]:
d['shippting']

0          0
1          1
2          1
3          0
4          1
5          1
6          1
7          1
8          1
9          0
10         0
11         0
12         1
13         0
14         0
15         1
16         1
17         0
18         0
19         1
20         0
21         1
22         1
23         0
24         0
25         1
26         0
27         0
28         1
29         1
          ..
1082505    0
1082506    1
1082507    0
1082508    1
1082509    0
1082510    0
1082511    1
1082512    1
1082513    1
1082514    0
1082515    0
1082516    1
1082517    1
1082518    1
1082519    0
1082520    0
1082521    0
1082522    0
1082523    0
1082524    1
1082525    1
1082526    1
1082527    0
1082528    0
1082529    0
1082530    0
1082531    1
1082532    0
1082533    1
1082534    0
Name: shipping, Length: 1082535, dtype: int64

In [62]:
d['category_name']

array([[810, 826, 638,   0,   0],
       [ 93,   8,  86,   0,   0],
       [ 93, 224, 358,   0,   0],
       ...,
       [824, 714,  28,   0,   0],
       [293, 127, 886,   0,   0],
       [293, 115, 480,   0,   0]])