In [5]:
from utils import css_from_file
# NOTE(review): `css_from_file` is a project-local helper whose body is not
# visible here; presumably it loads the stylesheet at this path and returns
# something the notebook renders to style the page -- confirm in `utils`.
# It is the last expression of the cell, so its return value is what gets shown.
css_from_file('style/style.css')

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

import warnings
# NOTE(review): with no category/module filter this silences *every* warning
# for the rest of the kernel session, so deprecation notices and other
# library warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

def rmsle(y, y0):
    """Root mean squared logarithmic error between two equally long inputs.

    Both inputs are passed through ``log1p`` before being compared, so the
    score measures the error on the log scale.

    Parameters
    ----------
    y, y0 : array-like
        The two sets of values to compare. They must have the same length.

    Returns
    -------
    The square root of the mean squared difference of ``log1p(y)`` and
    ``log1p(y0)``.

    Raises
    ------
    AssertionError
        If ``y`` and ``y0`` differ in length.
    """
    assert len(y) == len(y0)
    log_gap = np.log1p(y) - np.log1p(y0)
    squared_gap = np.power(log_gap, 2)
    return np.sqrt(np.mean(squared_gap))

Normally the data comes in various shapes and formats

We need a way to combine scikit-learn transformers with pandas DataFrames so that we can write something like this:

```python
pipeline = make_pipeline(
     CleanData(),
     make_union(
         make_pipeline(
             Selector('text_column'), 
             CountVectorizer()
         ),
         make_pipeline(
             Selector(['numerical_column_1', 'numerical_column_2']),
             StandardScaler()
         ),
         make_pipeline(
             Selector('categorical_column'), 
             OneHotEncoder()
         ),
      ),
      model
)
```

In [6]:
# Only the first 10,000 rows are used to keep the exercise fast. `nrows` makes
# pandas stop parsing after that many rows instead of reading (and
# decompressing) the whole file and slicing it afterwards.
data = pd.read_csv("data/train.tsv.gz", sep="\t", nrows=10000)
# A fixed random_state keeps the train/validation split the same on every run.
data_tr, data_va = train_test_split(data, random_state=0)
data_tr.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
2967,2967,Levi's Skinny Jeans,1,"Women/Jeans/Slim, Skinny",Levi's®,28.0,0,Brand new super skinny Levi's jeans! Size 29. ...
700,700,Converse,2,Kids/Boys 0-24 Mos/Shoes,Nike,14.0,0,Converse kids
3481,3481,PINK varsity crew,3,Women/Sweaters/Crewneck,PINK,31.0,0,I've gotten so many compliments on this shirt....
1621,1621,Victoria's Secret PINK legging,3,"Women/Athletic Apparel/Pants, Tights, Leggings",PINK,27.0,1,I believe the are the ultimate yoga legging. W...
800,800,Chicken painting framed,3,Home/Artwork/Paintings,,11.0,0,Great condition frames chicken painting. I'd s...


We must write a `Selector` class to glue together pandas DataFrames and scikit-learn transformations.

Exercise
-----------

1. Write a `Selector` class
2. Write a pipeline to create a Mercari vectorizer. Use `CountVectorizer` for text, one-hot encoding for categorical columns.
3. Add a class to clean the data (fill missing values etc)
4. Add a model (Ridge) to the pipeline and train it

DO NOT USE pandas.get_dummies to create binary features or I will haunt you
-----------

We'll use a Ridge model as the workhorse for this data - it is quick and accurate

In [None]:
# Shared Ridge regressor used as the final step of the pipelines below.
# `normalize` is deliberately not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2, where passing it raises a TypeError. The previous
# `normalize=False` was the default, so omitting it does not change the model
# on older versions.
ridge_model = Ridge(
    solver='auto', fit_intercept=True, alpha=1.0,
    max_iter=100, tol=0.05, random_state=1,
)

In [None]:
def to_records(df):
    """Convert a DataFrame into a list with one ``{column: value}`` dict per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows should be turned into dictionaries.

    Returns
    -------
    list of dict
        One dictionary per row, keyed by column name.
    """
    row_dicts = df.to_dict(orient='records')
    return row_dicts

# Turns the selected columns into row dictionaries and vectorizes them.
# NOTE(review): per the scikit-learn docs, DictVectorizer one-hot encodes only
# string values; numeric values are kept as a single numeric feature. Confirm
# that this is the intended treatment for integer-coded categorical columns.
one_hot_encoder = make_pipeline(
    FunctionTransformer(to_records, validate=False),
    DictVectorizer(),
)

# write your answers here

**Double-click to see the solution**

<div class="spoiler">
class Selector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks column(s) out of its input by label.

    It lets column-specific scikit-learn transformers be chained after a
    step that produces a whole table.

    Parameters
    ----------
    column : label or list of labels
        Passed unchanged to ``X[...]``. For a pandas DataFrame (the assumed
        input type) a single label yields a Series and a list of labels
        yields a DataFrame.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``.

        ``y`` is optional so the transformer can also be fitted without a
        target, as scikit-learn transformers conventionally allow.
        """
        return self

    def transform(self, X):
        """Return ``X[self.column]``."""
        return X[self.column]
        
class CleanData(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces missing text values with ``""``.

    The columns ``name``, ``brand_name``, ``item_description`` and
    ``category_name`` are filled; all other columns are passed through
    untouched.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``.

        ``y`` is optional so the transformer can also be fitted without a
        target, as scikit-learn transformers conventionally allow.
        """
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        The caller's frame is left unmodified: working on a copy avoids
        writing into what may be a slice of another DataFrame (for example
        the output of ``train_test_split``) and keeps the step free of side
        effects, so re-running it always gives the same result.
        """
        cleaned = X.copy()
        for text_col in ("name", "brand_name", "item_description", "category_name"):
            cleaned[text_col] = cleaned[text_col].fillna("")
        return cleaned

# Clean the frame, vectorize each text field separately with a bag-of-words
# model, add the dict-vectorized categorical columns, and feed the stacked
# features into the Ridge model.
pipeline = make_pipeline(
    CleanData(),
    make_union(
        # One identical bag-of-words branch per text field, in this order.
        *[
            make_pipeline(Selector(text_field), CountVectorizer(min_df=10, dtype=np.float64))
            for text_field in ("name", "brand_name", "item_description")
        ],
        make_pipeline(
            Selector(["category_name", "shipping", "item_condition_id"]),
            one_hot_encoder,
        ),
    ),
    ridge_model,
)

</div>

In [None]:
# The model is trained on log1p(price), so predictions are mapped back to the
# price scale with expm1 before being scored with RMSLE.
pipeline.fit(data_tr, np.log1p(data_tr["price"]))
preds = np.expm1(pipeline.predict(data_va))
print("Validation error=", rmsle(preds, data_va["price"]))

Once you have your pipeline ready, implement a very important trick for this competition.

We found out that it is not worth creating a separate vectorizer for each text field. Instead, concatenate the following fields together:

1. name and brand_name 
2. item_description, name and category_name

As in the simplified solution https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s

Exercise 
------------
1. In order to do this create a transformation class that adds 2 new columns to the dataframe and uses them for vectorization
2. Add the new class and modify your vectorization pipeline. Did the results improve?

In [None]:
# write your solution here

**Double-click to see the solution**


<div class="spoiler">

class ConcatTexts(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds two space-joined text columns.

    * ``concat_1`` = ``name`` + ``brand_name``
    * ``concat_2`` = ``item_description`` + ``name`` + ``category_name``

    The source columns are combined with ``+``, so they are expected to be
    strings with missing values already filled (e.g. by running
    ``CleanData`` first).
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``.

        ``y`` is optional so the transformer can also be fitted without a
        target, as scikit-learn transformers conventionally allow.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``concat_1`` and ``concat_2`` added.

        ``assign`` builds a new frame, so the caller's DataFrame is not
        modified and the step can be re-run without side effects.
        """
        return X.assign(
            concat_1=X['name'] + ' ' + X['brand_name'],
            concat_2=X['item_description'] + ' ' + X['name'] + ' ' + X['category_name'],
        )
    

class ConcatTextsGeneric(BaseEstimator, TransformerMixin):
    """Stateless transformer that concatenates several text columns into one.

    Parameters
    ----------
    columns : iterable of column labels
        Columns to concatenate, in order.
    use_separators : bool, default True
        When true, the text of the i-th column is preceded by the marker
        token ``' cs00<i> '`` so the pieces stay distinguishable. When false
        the column texts are joined directly with nothing in between.
    output_col : str, default 'text_concat'
        Name of the column that receives the concatenated text.
    """

    def __init__(self, columns, use_separators=True, output_col='text_concat'):
        self.use_separators = use_separators
        self.columns = columns
        self.output_col = output_col

    def fit(self, X, y=None, *args):
        """Nothing to learn; returns ``self``.

        ``y`` may be passed positionally or by keyword; any further
        positional arguments are accepted and ignored.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``output_col`` holding the joined text.

        The work is done on a copy so the caller's DataFrame is not modified.
        """
        out = X.copy()
        out[self.output_col] = ''
        for i, col in enumerate(self.columns):
            if self.use_separators:
                # Marker token identifying which source column follows.
                out[self.output_col] += ' cs00{} '.format(i)
            out[self.output_col] += out[col]
        return out
    
# Same overall shape as the first pipeline, but the bag-of-words branches now
# run on the two concatenated text columns added by ConcatTexts.
pipeline = make_pipeline(
    CleanData(),
    ConcatTexts(),
    make_union(
        # One identical bag-of-words branch per concatenated text column.
        *[
            make_pipeline(Selector(concat_field), CountVectorizer(min_df=10, dtype=np.float64))
            for concat_field in ("concat_1", "concat_2")
        ],
        make_pipeline(
            Selector(["category_name", "shipping", "item_condition_id"]),
            one_hot_encoder,
        ),
        n_jobs=-1,
    ),
    ridge_model,
)
</div>

In [None]:
# The model is trained on log1p(price), so predictions are mapped back to the
# price scale with expm1 before being scored with RMSLE.
pipeline.fit(data_tr, np.log1p(data_tr["price"]))
preds = np.expm1(pipeline.predict(data_va))
print("Validation error=", rmsle(preds, data_va["price"]))