In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import FeatureUnion
# from keras.api._v2.keras import Model, Sequential
# from keras.api._v2.keras.layers import Dense

In [2]:
# Read the tab-separated training file (first row = column names) and normalise
# the columns used as features in a single, re-runnable expression.
dataset = pd.read_csv('data/train.tsv', sep='\t', header=0).assign(
    # Missing categories / brands get an explicit placeholder label.
    category_name=lambda frame: frame['category_name'].fillna('Other').astype(str),
    brand_name=lambda frame: frame['brand_name'].fillna('missing').astype(str),
    # Cast the numeric flags to strings so they are treated as categorical tokens.
    shipping=lambda frame: frame['shipping'].astype(str),
    item_condition_id=lambda frame: frame['item_condition_id'].astype(str),
    item_description=lambda frame: frame['item_description'].fillna('None'),
)

# Features are every column except the target; the target is log1p(price).
X = dataset.drop(columns='price')
Y = np.log1p(dataset['price'])

# Hold out 20% of the rows for validation; the split is seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
X_train.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description
604635,604635,Enzo,3,Women/Shoes/Mules & Clogs,missing,0,Enzo Angiolini Mules/Clogs. Super Cute with so...
496799,496799,Black dress,3,Women/Dresses/Full-Length,missing,1,Xl long black dress Solid under with lace over...
1035231,1035231,2 items for Brittany,1,Electronics/Cell Phones & Accessories/Cables &...,missing,0,- Urban Decay Eyeshadow This has never been us...
628659,628659,Texas budle,2,Women/Tops & Blouses/T-Shirts,missing,0,Very cute!! No flaws
261459,261459,North face rain coat,3,Women/Athletic Apparel/Jackets,The North Face,0,Sea foam green rain coat Size 18 XL in youth I...
959361,959361,Michael kors crossbody bag,3,Women/Women's Handbags/Messenger & Crossbody,Michael Kors,0,"Used , the color is rose gold"
199415,199415,Cream studded wedges,3,Women/Shoes/Sandals,missing,0,The brand is Hot Rated purchased at Buckle. I ...
1460850,1460850,Pink and gold trinket box,3,Home/Home Décor/Home Décor Accents,missing,0,"5.75"" tall and 4.5"" wide. Heavy resin. The fel..."
568899,568899,True religion button shirt slimsize XXXL,3,Men/Tops/T-shirts,True Religion Brand Jeans,1,Please remember the brand name usually run sma...
1365752,1365752,WD Scorpio Blue 1TB 2.5 hard drive,3,"Electronics/Computers & Tablets/Drives, Storag...",Western Digital,1,Used like new condition Can be used on PS4 (yo...


In [3]:
% % time

preprocessor = CountVectorizer().build_preprocessor()


def build_field_preprocessor(field):
    """Build a row-level preprocessor for a single column.

    The position of ``field`` is looked up once in ``X_train.columns``. The
    returned callable takes one row (a sequence ordered like those columns),
    picks the value at that position and passes it through the module-level
    ``preprocessor``.

    Raises ``ValueError`` if ``field`` is not one of ``X_train``'s columns.
    """
    position = X_train.columns.tolist().index(field)

    def preprocess_row(row):
        # Only this column's value is handed to the shared text preprocessor.
        return preprocessor(row[position])

    return preprocess_row


# One sparse feature block per column, concatenated side by side by FeatureUnion.
# Every sub-vectorizer is given the whole row; its `preprocessor` selects the
# single column that sub-vectorizer is responsible for.
vectorizer = FeatureUnion([
    ('name', CountVectorizer(
        ngram_range=(1, 2),  # extract words and pairs of words
        max_features=50_000,
        preprocessor=build_field_preprocessor('name')
    )),
    ('category_name', CountVectorizer(
        # `.+` keeps the whole value (up to a line break) as ONE token, so each
        # distinct category string becomes its own feature; it does not split on spaces.
        token_pattern='.+',
        preprocessor=build_field_preprocessor('category_name')
    )),
    ('brand_name', CountVectorizer(
        token_pattern='.+',  # whole brand string as one token (see category_name)
        preprocessor=build_field_preprocessor('brand_name')
    )),
    ('shipping', CountVectorizer(
        token_pattern=r'\d+',  # runs of digits; raw string avoids the invalid `\d` escape
        preprocessor=build_field_preprocessor('shipping')
    )),
    ('item_condition_id', CountVectorizer(
        token_pattern=r'\d+',  # runs of digits; raw string avoids the invalid `\d` escape
        preprocessor=build_field_preprocessor('item_condition_id')
    )),
    ('item_description', TfidfVectorizer(
        ngram_range=(1, 3),  # up to three words in a sequence
        max_features=100_000,  # same value as before, digits grouped conventionally
        preprocessor=build_field_preprocessor('item_description')
    ))
], n_jobs=8)

# Fit the vectorizer on the training rows only, then reuse the fitted
# vectorizer on the held-out rows so nothing is learned from the test split.
X_train_transformed = vectorizer.fit_transform(X_train.to_numpy())
X_test_transformed = vectorizer.transform(X_test.to_numpy())

CPU times: user 43.1 s, sys: 11.2 s, total: 54.3 s
Wall time: 5min 31s


In [7]:
%%time

def rmsle(y_pred, y_true):
    """Root mean squared logarithmic error for values already in log1p space.

    Both arguments hold ``log1p(price)`` values, so the RMSLE on prices equals
    the plain RMSE of the two arrays (``log1p(expm1(v)) == v``). Computing it
    directly avoids the ``expm1`` round trip, which overflows to infinity for
    large predictions and would make the metric fail.

    Parameters
    ----------
    y_pred : array-like
        Predicted log1p targets. Negative predictions are clipped to 0, which
        corresponds to a price of 0.
    y_true : array-like
        True log1p targets; must be non-negative.

    Returns
    -------
    numpy.float64
        The root mean squared error between the clipped predictions and targets.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in size, contain NaN/inf, or if
        ``y_true`` contains negative values.
    """
    pred = np.clip(np.ravel(np.asarray(y_pred, dtype=float)), 0, None)
    true = np.ravel(np.asarray(y_true, dtype=float))

    if pred.size == 0 or pred.size != true.size:
        raise ValueError(
            f'y_pred and y_true must be non-empty and equally sized, '
            f'got {pred.size} and {true.size} values'
        )
    if not (np.isfinite(pred).all() and np.isfinite(true).all()):
        raise ValueError('y_pred and y_true must not contain NaN or infinity')
    if (true < 0).any():
        raise ValueError('y_true contains negative values; RMSLE is undefined')

    return np.sqrt(np.mean((pred - true) ** 2))


# model = make_pipeline(StandardScaler(with_mean=False), MLPRegressor(
#     hidden_layer_sizes=[100, 100],
#     activation='relu',
#     verbose=True,
# ), verbose=True)
# Two hidden layers of 100 tanh units, trained with the full-batch L-BFGS solver.
# `random_state` pins the random weight initialisation so that re-running the
# cell yields the same model (the train/test split is seeded with 0 as well).
# NOTE(review): the recorded run of this cell was stopped by hand
# (KeyboardInterrupt); consider an explicit `max_iter` so it finishes unattended.
model = MLPRegressor(
    hidden_layer_sizes=[100, 100],
    activation='tanh',
    verbose=True,
    solver='lbfgs',
    random_state=0,
)
model.fit(X_train_transformed, Y_train.values)

# NOTE(review): the `ridge` in these names is misleading, the model is an
# MLPRegressor. The names are kept because the following cell assigns the same ones.
y_pred_validation_ridge = model.predict(X_test_transformed)
ridge_error = rmsle(y_pred_validation_ridge, Y_test.values)
print(f'Validation RMSLE = {ridge_error:.5f}')


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =     15594201     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.15688D+00    |proj g|=  2.78441D+00

At iterate    1    f=  3.05169D-01    |proj g|=  1.00781D-01

At iterate    2    f=  2.93430D-01    |proj g|=  6.95738D-02

At iterate    3    f=  2.87249D-01    |proj g|=  4.63349D-02

At iterate    4    f=  2.55249D-01    |proj g|=  4.43961D-02

At iterate    5    f=  2.48086D-01    |proj g|=  2.48983D-02

At iterate    6    f=  2.38240D-01    |proj g|=  9.93183D-03

At iterate    7    f=  2.31788D-01    |proj g|=  1.73269D-02

At iterate    8    f=  2.23827D-01    |proj g|=  1.37754D-02

At iterate    9    f=  2.08602D-01    |proj g|=  2.01270D-02

At iterate   10    f=  1.94359D-01    |proj g|=  3.72274D-02

At iterate   11    f=  1.88489D-01    |proj g|=  3.27773D-02

At iterate   12    f=  1.81555D-01    |proj g|=  1.36396D-02

At iterate   13    f=  1.7

KeyboardInterrupt: 

In [8]:
# Score whatever state `model` currently holds on the held-out split.
y_pred_validation_ridge = model.predict(X_test_transformed)
ridge_error = rmsle(y_pred=y_pred_validation_ridge, y_true=Y_test.to_numpy())
print(f'Validation RMSLE = {ridge_error:.5f}')


Validation RMSLE = 0.55752
