In [10]:
from math import sqrt

import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [11]:
dataset = pd.read_csv('data/train.tsv', sep='\t', header=0)

# Placeholder used for each text column whose value is missing.
# 'shipping' and 'item_condition_id' are used exactly as read from the file.
missing_text_defaults = {
    'category_name': 'Other',
    'brand_name': 'missing',
    'item_description': 'None',
}
for text_column, placeholder in missing_text_defaults.items():
    dataset[text_column] = dataset[text_column].fillna(placeholder)

# Only these two columns are explicitly cast to str.
for text_column in ('category_name', 'brand_name'):
    dataset[text_column] = dataset[text_column].astype(str)

In [12]:
# Spot-check a single record (positional row 1) after the missing-value fill.
dataset.iloc[1]

train_id                                                             1
name                                  Razer BlackWidow Chroma Keyboard
item_condition_id                                                    3
category_name        Electronics/Computers & Tablets/Components & P...
brand_name                                                       Razer
price                                                             52.0
shipping                                                             0
item_description     This keyboard is in great condition and works ...
Name: 1, dtype: object

In [13]:
# Preview the raw category column; its values are '/'-separated category paths.
dataset['category_name']

0                                          Men/Tops/T-shirts
1          Electronics/Computers & Tablets/Components & P...
2                                Women/Tops & Blouses/Blouse
3                         Home/Home Décor/Home Décor Accents
4                                    Women/Jewelry/Necklaces
                                 ...                        
1482530                               Women/Dresses/Mid-Calf
1482531                             Kids/Girls 2T-5T/Dresses
1482532       Sports & Outdoors/Exercise/Fitness accessories
1482533                   Home/Home Décor/Home Décor Accents
1482534                    Women/Women's Accessories/Wallets
Name: category_name, Length: 1482535, dtype: object

In [14]:
# split category into general, sub1, and sub2
def _clean_category_level(level):
    """Normalise one category level: remove spaces and turn '&' into '_'."""
    return level.replace(' ', '').replace('&', '_').strip()


def split_category(category, missing="Other"):
    """Split a '/'-separated category path into exactly three levels.

    Parameters
    ----------
    category : str
        Category path such as ``"Electronics/Computers & Tablets/Components & Parts"``.
    missing : str, default "Other"
        Value used for the first or second sub-level when the path is too short.

    Returns
    -------
    tuple of (str, str, str)
        ``(general_category, subcategory_1, subcategory_2)``. Levels beyond the
        third are ignored.
    """
    levels = category.split("/")
    general = _clean_category_level(levels[0])
    sub1 = _clean_category_level(levels[1]) if len(levels) > 1 else missing
    sub2 = _clean_category_level(levels[2]) if len(levels) > 2 else missing
    return general, sub1, sub2


# Guarded so the cell can be re-run safely: once 'category_name' has been
# replaced by the three derived columns there is nothing left to do (without
# the guard a second run would fail with a KeyError on the dropped column).
if 'category_name' in dataset.columns:
    categories = list(dataset['category_name'])
    split_levels = [split_category(cat) for cat in categories]

    gencat_list = [general for general, _, _ in split_levels]
    subcat1_list = [sub1 for _, sub1, _ in split_levels]
    subcat2_list = [sub2 for _, _, sub2 in split_levels]

    dataset["general_category"] = gencat_list
    dataset["subcategory_1"] = subcat1_list
    dataset["subcategory_2"] = subcat2_list
    # Re-bind instead of inplace=True so the drop is explicit in the cell's lineage.
    dataset = dataset.drop(columns='category_name')

In [15]:
# Re-inspect positional row 1 to see the three derived category columns.
dataset.iloc[1]

train_id                                                             1
name                                  Razer BlackWidow Chroma Keyboard
item_condition_id                                                    3
brand_name                                                       Razer
price                                                             52.0
shipping                                                             0
item_description     This keyboard is in great condition and works ...
general_category                                           Electronics
subcategory_1                                        Computers_Tablets
subcategory_2                                         Components_Parts
Name: 1, dtype: object

In [16]:
# Features are every column except the target; the target is log1p(price).
X = dataset.drop(columns=['price'])
Y = np.log1p(dataset['price'])

# 80% training data, 20% test data
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [17]:
# % % time

# Default CountVectorizer preprocessor, shared by every text field below.
# With the default settings used here it lower-cases the input text.
preprocessor = CountVectorizer().build_preprocessor()


def build_field_preprocessor(field):
    """Build a preprocessor that extracts and preprocesses one field of a row.

    The column position of ``field`` is looked up once, from ``X_train.columns``,
    when this function is called. The returned callable takes a single row
    (indexable by position), picks out that column's value and passes it
    through the shared ``preprocessor``.
    """
    position = X_train.columns.tolist().index(field)

    def preprocess_row(row):
        return preprocessor(row[position])

    return preprocess_row


class OneHotVectorizer(BaseEstimator, TransformerMixin):
    """One-hot encode a single column of a 2-D array.

    The transformer selects ``X[:, column]``, reshapes it to a single-column
    2-D array and delegates to a ``OneHotEncoder``. It lets a categorical
    column take part in a ``FeatureUnion`` that is fed the whole row array.

    Parameters
    ----------
    column : int
        Positional index of the column to encode.
    column_name : str
        Name used as the prefix of the generated feature names.
    """

    def __init__(self, column, column_name):
        self.column = column
        self.column_name = column_name

    def _select(self, X):
        """Return the configured column of ``X`` as an (n_samples, 1) array."""
        return X[:, self.column].reshape(-1, 1)

    def fit(self, X, y=None):
        """Fit the underlying encoder on the selected column and return self."""
        # The encoder's output is sparse by default. The explicit `sparse=True`
        # keyword is intentionally not passed: it was renamed to
        # `sparse_output` in scikit-learn 1.2 and removed in 1.4, where passing
        # it raises a TypeError.
        self.enc = OneHotEncoder(handle_unknown='infrequent_if_exist')
        self.enc.fit(self._select(X))
        return self

    def transform(self, X):
        """One-hot encode the selected column of ``X`` with the fitted encoder."""
        return self.enc.transform(self._select(X))

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names, prefixed with ``column_name``.

        ``input_features`` is accepted for compatibility with the scikit-learn
        transformer API and is ignored; the prefix always comes from
        ``column_name``.
        """
        return self.enc.get_feature_names_out([self.column_name])

    def get_feature_names(self):
        """Legacy alias of :meth:`get_feature_names_out`, kept for older callers."""
        return self.get_feature_names_out()


# Column positions are resolved by name here because the union is fitted on
# the raw value array (X_train.values), which carries no column labels.
column_positions = list(X_train.columns)

# Binary bag-of-words features for the short categorical text fields.
bag_of_words_steps = [
    (
        field,
        CountVectorizer(
            lowercase=False,
            binary=True,
            preprocessor=build_field_preprocessor(field),
        ),
    )
    for field in ['brand_name', 'general_category',
                  'subcategory_1', 'subcategory_2']
]

# TF-IDF over 1- to 3-grams for the free-text fields.
tfidf_steps = [
    ('name', TfidfVectorizer(
        ngram_range=(1, 3),
        min_df=3,
        max_features=250_000,
        preprocessor=build_field_preprocessor('name'),
    )),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),
        min_df=5,
        max_features=500_000,
        preprocessor=build_field_preprocessor('item_description'),
    )),
]

# One-hot encoding for the two low-cardinality columns.
one_hot_steps = [
    (
        field,
        OneHotVectorizer(
            column=column_positions.index(field),
            column_name=field,
        ),
    )
    for field in ['item_condition_id', 'shipping']
]

vectorizer = FeatureUnion(bag_of_words_steps + tfidf_steps + one_hot_steps)

# Fit on the training rows only, then apply the fitted union to the test rows.
X_train_transformed = vectorizer.fit_transform(X_train.values)
X_test_transformed = vectorizer.transform(X_test.values)

In [18]:
import pickle

# Write the vectorised feature matrices and the targets to disk, one pickle
# file per object.
pickled_objects = {
    'x_train_transformed.pkl': X_train_transformed,
    'x_test_transformed.pkl': X_test_transformed,
    'y_train.pkl': Y_train,
    'y_test.pkl': Y_test,
}
for filename, payload in pickled_objects.items():
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)

In [19]:
# Short aliases for the vectorised train / test feature matrices.
train_x = X_train_transformed
test_x = X_test_transformed

In [20]:
# Show the shapes of the train and test feature matrices.
print(train_x.shape, test_x.shape)

(1186028, 755851) (296507, 755851)


In [21]:
# Candidate regularisation strengths for the ridge model.
alpha = [1, 2, 3, 3.5, 4, 4.5, 5, 6, 7]

# One error value per candidate, in the same order as `alpha`.
# NOTE(review): the error is measured on the held-out split (test_x / Y_test),
# not by k-fold cross-validation, so the chosen alpha is tuned on the same data
# that is later reported as the evaluation score.
test_rmsle_array = []
for candidate in tqdm(alpha):
    model = Ridge(solver="sparse_cg", random_state=42, alpha=candidate)
    model.fit(train_x, Y_train)
    preds_test = model.predict(test_x)
    # The target is log1p(price), so RMSE on this scale is the RMSLE of price.
    test_rmsle_array.append(sqrt(mse(Y_test, preds_test)))

for candidate, rmsle in zip(alpha, test_rmsle_array):
    print('RMSLE for alpha = ', candidate, 'is', rmsle)

# `best_alpha` is the *position* of the best candidate inside `alpha`
# (use `alpha[best_alpha]` to get the value), not the alpha value itself.
best_alpha = np.argmin(test_rmsle_array)

fig, ax = plt.subplots()
ax.plot(alpha, test_rmsle_array)
ax.scatter(alpha, test_rmsle_array)
for candidate, rmsle in zip(alpha, test_rmsle_array):
    # Format the label explicitly: passing a tuple relies on its repr, which
    # renders NumPy scalars as e.g. "np.float64(0.441)" on NumPy >= 2.
    ax.annotate(f"({candidate}, {rmsle:.3f})", (candidate, rmsle))

ax.set_title("Cross Validation Error for each alpha")
ax.set_xlabel("Alpha")
ax.set_ylabel("Error")
plt.show()

  0%|          | 0/9 [02:38<?, ?it/s]

KeyboardInterrupt



In [22]:
%%time

best_alpha = 2
print("Best alpha: ", alpha[best_alpha])
model = Ridge(solver="sparse_cg", random_state=42, alpha=alpha[best_alpha])
model.fit(train_x, Y_train)
ridge_preds_train = model.predict(train_x)
ridge_preds_test = model.predict(test_x)

print('Train RMSLE:', sqrt(mse(Y_train, ridge_preds_train)))

ridge_rmsle = sqrt(mse(Y_test, ridge_preds_test))
print("Cross validation RMSLE: ", ridge_rmsle)

Best alpha:  3
Train RMSLE: 0.3799064785161814
Cross validation RMSLE:  0.4408779117241604
CPU times: user 12min, sys: 12.1 s, total: 12min 12s
Wall time: 4min 38s


In [23]:
# import pickle
#
# # load a previously saved model from disk
# with open('ridge_vectorized.pkl', 'rb') as f:
#     model = pickle.load(f)

In [34]:
# Display the largest model weights, labelled with the vectorizer's feature
# names; top=(30, 30) asks eli5 for up to 30 positive and 30 negative weights.
eli5.show_weights(model, vec=vectorizer, top=(30, 30))



Weight?,Feature
+2.472,<BIAS>
+2.111,name__tieks
+2.098,name__apple watch
+2.031,name__14k
+1.896,name__tyme
+1.886,name__inr
+1.871,item_description__128gb
+1.836,item_description__14k
+1.683,name__mcm
+1.645,name__hoverboard


In [35]:
import pickle

# save model to disk
model_path = 'ridge_vectorized.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)
# with open('ridge_vectorizer.pkl', 'wb') as f:
#     pickle.dump(vectorizer, f)


KeyboardInterrupt: 