In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [60]:
# Load the training table and normalise the categorical / text columns in one pass.
dataset = pd.read_csv('data/train.tsv', sep='\t', header=0).assign(
    # Missing labels get an explicit placeholder so they become a category of their own.
    category_name=lambda frame: frame['category_name'].fillna('Other').astype(str),
    brand_name=lambda frame: frame['brand_name'].fillna('missing').astype(str),
    # Numeric codes are cast to strings so they are handled as categorical tokens.
    shipping=lambda frame: frame['shipping'].astype(str),
    item_condition_id=lambda frame: frame['item_condition_id'].astype(str),
    item_description=lambda frame: frame['item_description'].fillna('None'),
)

# Target is log1p(price); the features are every remaining column.
Y = np.log1p(dataset['price'])
X = dataset.drop(columns='price')

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0,
)

In [61]:
# Preview the first ten rows of the training features.
X_train.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description
604635,604635,Enzo,3,Women/Shoes/Mules & Clogs,missing,0,Enzo Angiolini Mules/Clogs. Super Cute with so...
496799,496799,Black dress,3,Women/Dresses/Full-Length,missing,1,Xl long black dress Solid under with lace over...
1035231,1035231,2 items for Brittany,1,Electronics/Cell Phones & Accessories/Cables &...,missing,0,- Urban Decay Eyeshadow This has never been us...
628659,628659,Texas budle,2,Women/Tops & Blouses/T-Shirts,missing,0,Very cute!! No flaws
261459,261459,North face rain coat,3,Women/Athletic Apparel/Jackets,The North Face,0,Sea foam green rain coat Size 18 XL in youth I...
959361,959361,Michael kors crossbody bag,3,Women/Women's Handbags/Messenger & Crossbody,Michael Kors,0,"Used , the color is rose gold"
199415,199415,Cream studded wedges,3,Women/Shoes/Sandals,missing,0,The brand is Hot Rated purchased at Buckle. I ...
1460850,1460850,Pink and gold trinket box,3,Home/Home Décor/Home Décor Accents,missing,0,"5.75"" tall and 4.5"" wide. Heavy resin. The fel..."
568899,568899,True religion button shirt slimsize XXXL,3,Men/Tops/T-shirts,True Religion Brand Jeans,1,Please remember the brand name usually run sma...
1365752,1365752,WD Scorpio Blue 1TB 2.5 hard drive,3,"Electronics/Computers & Tablets/Drives, Storag...",Western Digital,1,Used like new condition Can be used on PS4 (yo...


In [83]:
%%time

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Text normaliser shared by every field, taken from a default CountVectorizer.
# With default settings this lowercases the text; accent stripping only happens
# when strip_accents is set, which it is not here.
preprocessor = CountVectorizer().build_preprocessor()


def build_field_preprocessor(field):
    """Build a preprocessor that pulls one column out of a raw row and normalises it.

    The vectorizers are fed whole rows (``X_train.values``), so each of them needs
    a callable that first selects its own column by position.

    Parameters
    ----------
    field : str
        Column name; must be present in ``X_train.columns``.

    Returns
    -------
    callable
        Maps a row (indexable by position) to the normalised text of ``field``.

    Raises
    ------
    ValueError
        If ``field`` is not a column of ``X_train``.
    """
    column_position = X_train.columns.tolist().index(field)

    def preprocess_field(row):
        return preprocessor(row[column_position])

    return preprocess_field


# One sub-vectorizer per column. Each preprocessor selects its own field from the raw
# row, and FeatureUnion concatenates the resulting sparse blocks column-wise.
vectorizer = FeatureUnion([
    ('name', CountVectorizer(
        ngram_range=(1, 2),  # single words and pairs of adjacent words
        max_features=50_000,
        preprocessor=build_field_preprocessor('name'),
    )),
    ('category_name', CountVectorizer(
        token_pattern=r'.+',  # whole field is one token, i.e. one feature per category path
        preprocessor=build_field_preprocessor('category_name'),
    )),
    ('brand_name', CountVectorizer(
        token_pattern=r'.+',  # whole field is one token, i.e. one feature per brand
        preprocessor=build_field_preprocessor('brand_name'),
    )),
    ('shipping', CountVectorizer(
        token_pattern=r'\d+',  # runs of digits; raw string avoids an invalid '\d' escape
        preprocessor=build_field_preprocessor('shipping'),
    )),
    ('item_condition_id', CountVectorizer(
        token_pattern=r'\d+',  # runs of digits; raw string avoids an invalid '\d' escape
        preprocessor=build_field_preprocessor('item_condition_id'),
    )),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),  # up to three words in a sequence
        max_features=100_000,  # same value as before, with conventional digit grouping
        preprocessor=build_field_preprocessor('item_description'),
    )),
])

# Fit the vectorizers on the training rows only and encode them as a sparse matrix.
X_train_transformed = vectorizer.fit_transform(X_train.values)
X_test_transformed = vectorizer.transform(X_test.values)  # reuse the fitted vectorizers; never fit on the test rows
# Display the matrix summary (shape, dtype, number of stored elements).
X_train_transformed

CPU times: user 4min 59s, sys: 17.9 s, total: 5min 17s
Wall time: 5min 26s


<1186028x155839 sparse matrix of type '<class 'numpy.float64'>'
	with 59679331 stored elements in Compressed Sparse Row format>

In [84]:
%%time
from sklearn.linear_model import SGDRegressor

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error


def rmsle(y_pred, y_true):
    """Root mean squared logarithmic error for values already in ``log1p`` space.

    Both arguments hold ``log1p(price)`` values, so the RMSLE on prices is simply
    the RMSE between them. Computing it directly avoids mapping back through
    ``expm1`` and ``log1p`` again, which overflows to ``inf`` for large
    (e.g. diverged) predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted ``log1p(price)`` values. Negative predictions are clipped to 0,
        i.e. treated as a predicted price of 0.
    y_true : array-like
        True ``log1p(price)`` values; must be non-negative.

    Returns
    -------
    numpy.float64
        The root mean squared logarithmic error.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or ``y_true`` contains
        negative values.
    """
    predicted = np.asarray(y_pred, dtype=float).clip(0)
    actual = np.asarray(y_true, dtype=float)
    if predicted.shape != actual.shape:
        raise ValueError(
            f'y_pred and y_true must have the same shape, got {predicted.shape} and {actual.shape}'
        )
    if predicted.size == 0:
        raise ValueError('rmsle requires at least one sample')
    if (actual < 0).any():
        raise ValueError('y_true contains negative values, which are not valid log1p(price) targets')
    return np.sqrt(np.mean((predicted - actual) ** 2))


# Linear regression trained with SGD on the standardised sparse features.
# StandardScaler cannot centre a scipy sparse matrix (with_mean=True raises
# ValueError), so the features are only scaled by their standard deviation.
# random_state makes SGD's shuffling, and therefore the fitted weights, reproducible.
# NOTE(review): the training log recorded for this cell shows an average loss around
# 1e27 and a weight norm around 1e13, i.e. the optimisation diverged. Consider a
# smaller eta0 or a different scaler before relying on this model.
model = make_pipeline(
    StandardScaler(with_mean=False),
    SGDRegressor(verbose=True, random_state=0),
)

model.fit(X_train_transformed, Y_train.values)

-- Epoch 1
Norm: 34957628231419.32, NNZs: 155839, Bias: -378398429.463090, T: 1186028, Avg. loss: 1279671156143664027978956800.000000
Total training time: 0.95 seconds.
-- Epoch 2
Norm: 25963532078586.90, NNZs: 155839, Bias: -93381717.351799, T: 2372056, Avg. loss: 677575702517766765085196288.000000
Total training time: 1.74 seconds.
-- Epoch 3
Norm: 23921512310695.80, NNZs: 155839, Bias: 226543099.779182, T: 3558084, Avg. loss: 427746372513961463991238656.000000
Total training time: 2.56 seconds.
-- Epoch 4
Norm: 21445184982683.36, NNZs: 155839, Bias: -70529423.093777, T: 4744112, Avg. loss: 368685835142318074483441664.000000
Total training time: 3.36 seconds.
-- Epoch 5
Norm: 20792540457743.72, NNZs: 155839, Bias: -111933035.863538, T: 5930140, Avg. loss: 308481888017009987633020928.000000
Total training time: 4.15 seconds.
-- Epoch 6
Norm: 19224708703775.84, NNZs: 155839, Bias: -230453081.116020, T: 7116168, Avg. loss: 289943821827839374294777856.000000
Total training time: 4.92 sec

In [89]:
# Predict log1p(price) for the held-out rows with the SGD pipeline.
y_pred_validation = model.predict(X_test_transformed)
# RMSLE scoring is left disabled for this model.
# NOTE(review): presumably because the predictions shown below are on the order of
# 1e12, far outside any plausible log-price range - confirm before re-enabling.
# error = rmsle(y_pred_validation, Y_test.values)
# print(f'Validation RMSLE = {error:.5f}')
# Display the raw predictions for inspection.
y_pred_validation

array([-1.18421290e+11, -9.49173835e+12, -4.29868434e+12, ...,
       -2.51483821e+13,  4.83050251e+12,  2.19834670e+12])

In [90]:
import eli5

# Show the SGD model's largest weights, with feature names taken from the vectorizer.
eli5.show_weights(model, vec=vectorizer)



Weight?,Feature
+115496825500.815,item_description__correctly for teeth
+105358676254.622,item_description__lighters pokémon
+102861973791.616,item_description__rm try our
+97713075500.035,name__free iphone
+88270767775.729,name__brandnew temper
+85004382142.168,name__egyptian comfort
+81940086352.979,item_description__sexy and get
… 77863 more positive …,… 77863 more positive …
… 77957 more negative …,… 77957 more negative …
-82040250747.467,item_description__listing for 30oz


In [None]:
# Same view, widened to the top 100 features and with the '<BIAS>' entry filtered out.
eli5.show_weights(model, vec=vectorizer, top=100, feature_filter=lambda x: x != '<BIAS>')

In [93]:
# Ridge regression on the same features. Scaling skips centring (with_mean=False)
# because the feature matrix is sparse.
ridge_model = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.5, fit_intercept=True, solver='auto', max_iter=100, tol=0.05),
).fit(X_train_transformed, Y_train.values)

# Score the held-out rows; predictions and targets are both log1p(price).
y_pred_validation_ridge = ridge_model.predict(X_test_transformed)
ridge_error = rmsle(y_pred_validation_ridge, Y_test.values)
print(f'Validation RMSLE for ridge regression = {ridge_error:.5f}')


Validation RMSLE for ridge regression = 0.46513


In [94]:
# Sanity check: smallest ridge prediction, in log1p(price) units.
y_pred_validation_ridge.min()

0.08284415584806659

In [96]:
# Top 50 ridge weights by feature name, with the '<BIAS>' entry filtered out.
eli5.show_weights(ridge_model, vec=vectorizer, top=50, feature_filter=lambda x: x != '<BIAS>')

Weight?,Feature
+0.062,name__bundle
+0.047,category_name__men/shoes/athletic
+0.047,item_condition_id__1
+0.045,category_name__electronics/cell phones & accessories/cell phones & smartphones
+0.044,shipping__0
+0.042,name__lularoe
+0.039,brand_name__lululemon
+0.038,brand_name__kendra scott
+0.037,category_name__women/shoes/athletic
+0.034,category_name__women/shoes/boots
