In [68]:
import eli5
import numpy as np
import pandas as pd
import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import FeatureUnion
import codecs
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge


# from keras.api._v2.keras import Model, Sequential
# from keras.api._v2.keras.layers import Dense

In [2]:
dataset = pd.read_csv('data/train.tsv', sep='\t', header=0)
dataset['category_name'] = dataset['category_name'].fillna('Other').astype(str)
dataset['brand_name'] = dataset['brand_name'].fillna('missing').astype(str)
dataset['shipping'] = dataset['shipping'].astype(str)  # makes this categorical
dataset['item_condition_id'] = dataset['item_condition_id'].astype(str)
dataset['item_description'] = dataset['item_description'].fillna('None')

X = dataset.loc[:, dataset.columns != 'price']
Y = np.log1p(dataset['price'])

# 80% training data, 20% test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description
604635,604635,Enzo,3,Women/Shoes/Mules & Clogs,missing,0,Enzo Angiolini Mules/Clogs. Super Cute with so...
496799,496799,Black dress,3,Women/Dresses/Full-Length,missing,1,Xl long black dress Solid under with lace over...
1035231,1035231,2 items for Brittany,1,Electronics/Cell Phones & Accessories/Cables &...,missing,0,- Urban Decay Eyeshadow This has never been us...
628659,628659,Texas budle,2,Women/Tops & Blouses/T-Shirts,missing,0,Very cute!! No flaws
261459,261459,North face rain coat,3,Women/Athletic Apparel/Jackets,The North Face,0,Sea foam green rain coat Size 18 XL in youth I...
959361,959361,Michael kors crossbody bag,3,Women/Women's Handbags/Messenger & Crossbody,Michael Kors,0,"Used , the color is rose gold"
199415,199415,Cream studded wedges,3,Women/Shoes/Sandals,missing,0,The brand is Hot Rated purchased at Buckle. I ...
1460850,1460850,Pink and gold trinket box,3,Home/Home Décor/Home Décor Accents,missing,0,"5.75"" tall and 4.5"" wide. Heavy resin. The fel..."
568899,568899,True religion button shirt slimsize XXXL,3,Men/Tops/T-shirts,True Religion Brand Jeans,1,Please remember the brand name usually run sma...
1365752,1365752,WD Scorpio Blue 1TB 2.5 hard drive,3,"Electronics/Computers & Tablets/Drives, Storag...",Western Digital,1,Used like new condition Can be used on PS4 (yo...


In [3]:
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('data/glove.6B.300d.w2vformat.txt') 

In [57]:
class w2vTransformer(TransformerMixin):
    """
    Wrapper class for running word2vec into pipelines and FeatureUnions
    """
    def __init__(self,word2vec,**kwargs):
        self.word2vec=word2vec
        self.kwargs=kwargs
        self.dim = len(word2vec.index_to_key)
    def fit(self,x, y=None):
        return self

    def transform(self, X):
        return np.array([
        np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
            or [np.zeros(self.dim)], axis=0)
       for words in X
])

In [58]:
%%time

preprocessor = CountVectorizer().build_preprocessor()


def build_field_preprocessor(field):
    field_idx = list(X_train.columns).index(field)
    return lambda x: preprocessor(x[field_idx])  # this preprocesses like stripping accents, etc.


vectorizer = FeatureUnion([
    ('name', CountVectorizer(
        ngram_range=(1, 2),  # extract words and pairs of words
        max_features=50_000,
        preprocessor=build_field_preprocessor('name')
    )),
    ('category_name', CountVectorizer(
        token_pattern='.+',  # separate by space
        preprocessor=build_field_preprocessor('category_name')
    )),
    ('brand_name', CountVectorizer(
        token_pattern='.+',  # separate by space
        preprocessor=build_field_preprocessor('brand_name')
    )),
    ('shipping', CountVectorizer(
        token_pattern='\d+',  # decimal numbers
        preprocessor=build_field_preprocessor('shipping')
    )),
    ('item_condition_id', CountVectorizer(
        token_pattern='\d+',  # decimal numbers
        preprocessor=build_field_preprocessor('item_condition_id')
    )),
    ("w2v_class",w2vTransformer(word2vec_model))
], n_jobs=8)

X_train_transformed = vectorizer.fit_transform(X_train.values)
X_test_transformed = vectorizer.transform(X_test.values)  # don't fit to the test

CPU times: total: 33 s
Wall time: 57.1 s


In [66]:
%%time

def rmsle(y_pred, y_true):
    return np.sqrt(mean_squared_log_error(np.expm1(y_pred.clip(0)), np.expm1(y_true)))


ridge_model = make_pipeline(StandardScaler(with_mean=False),
                            Ridge(
                                solver='auto',
                                fit_intercept=True,
                                alpha=0.5,
                                max_iter=100,
                                tol=0.05,
                            ))
ridge_model.fit(X_train_transformed, Y_train.values)

y_pred_validation_ridge = ridge_model.predict(X_test_transformed)
ridge_error = rmsle(y_pred_validation_ridge, Y_test.values)
print(f'Validation RMSLE for ridge regression = {ridge_error:.5f}')

Validation RMSLE for ridge regression = 0.49133
CPU times: total: 13 s
Wall time: 13.2 s


In [69]:
eli5.show_weights(ridge_model, vec=vectorizer, top=100, feature_filter=lambda x: x != '<BIAS>')



AttributeError: Transformer w2v_class (type w2vTransformer) does not provide get_feature_names.