## Mercari Price Suggestion Challenge

The objective of this challenge is to build an algorithm that automatically suggests the right product prices on Mercari. The training data consists of user-inputted text descriptions of their products, including details like product category name, brand name, and item condition

In [35]:
# Import required libraries
import numpy as np
import pandas as pd
import string
import re
import pickle
import gc
import math

# Text mining 
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import stop_words
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import r2_score
# Time 
from time import time

#Plots
import matplotlib.pyplot as plt

In [41]:
def startTime():
    return time()
def endTime(s):
    print ("Time elapsed {}".format(time()-s))
def printSep():
    print "*"*40
    print ""

In [56]:
#df_train = pd.read_csv('../input/train.tsv', sep='\t')
#df_test = pd.read_csv('../input/test.tsv', sep='\t')

df_train = pd.read_csv('../../../../data/train.tsv', sep='\t')
#df_test = pd.read_csv('../../../../data/test.tsv', sep='\t')
n_samples = 100000
df_test = df_train[n_samples:n_samples+40000]
df_train = df_train[:n_samples]

print df_train.shape
print df_test.shape

(1482535, 8)
(693359, 7)


In [6]:
# Nulls in item description in train or test as tf-idf is not defined on nan
# lets drop these 4 items
df_train = df_train.loc[df_train.item_description == df_train.item_description]
df_train = df_train.loc[df_train.name == df_train.name]
print("Dropped records where item description was nan")

Dropped records where item description was nan


### Creating Categorical Features

In [7]:
NUM_BRANDS = 4000
NUM_CATEGORIES = 1000
NAME_MIN_DF = 10

def handle_missing_inplace(dataset):
    dataset['category_name'].fillna(value='missing', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    # dataset['item_description'].fillna(value='missing', inplace=True)


def cutting(dataset):
    pop_brand = dataset['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
    pop_category = dataset['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_CATEGORIES]
    dataset.loc[~dataset['category_name'].isin(pop_category), 'category_name'] = 'missing'


def to_categorical(dataset):
    dataset['category_name'] = dataset['category_name'].astype('category')
    dataset['brand_name'] = dataset['brand_name'].astype('category')
    dataset['item_condition_id'] = dataset['item_condition_id'].astype('category')
    dataset['shipping'] = dataset['shipping'].astype('category')

In [8]:
nrow_train = df_train.shape[0]
y = np.log1p(df_train["price"])
y_test = np.log1p((df_test["price"]).reset_index()["price"])
merge = pd.concat([df_train, df_test])
gc.collect()

s = startTime()
handle_missing_inplace(merge)
cutting(merge)
to_categorical(merge)
endTime(s)

Time elapsed 0.168015956879


In [9]:
s = startTime()
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(merge['name'])
endTime(s)

Time elapsed 1.43288588524


In [10]:
s = startTime()
cv = CountVectorizer()
X_category = cv.fit_transform(merge['category_name'])
endTime(s)

Time elapsed 1.18913698196


In [11]:
s = startTime()
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
endTime(s)

Time elapsed 0.425969839096


In [12]:
s = startTime()
X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
endTime(s)

Time elapsed 0.615557193756


In [13]:
sparse_merge = hstack((X_dummies, X_brand, X_category, X_name)).tocsr().astype('float32')

### Using Base features for regression

In [14]:
X_train_cat = sparse_merge[:nrow_train]
X_test_cat = sparse_merge[nrow_train:]

In [15]:
def calcMse(yTrue, yPred):
    s = 0
    for i in range(len(yTrue)):
        s += (yTrue[i]-yPred[i])**2
    return math.sqrt(s*1.0/len(yTrue))

In [30]:
def lgbm(X_train_cat,y,X_test_cat,y_test):
        train_X, valid_X, train_y, valid_y = train_test_split(X_train_cat, y, test_size = 0.1, random_state = 144) 
        d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
        d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
        watchlist = [d_train, d_valid]
        params = {
                'learning_rate': 0.75,
                'application': 'regression',
                'max_depth': 3,
                'num_leaves': 50,
                'verbosity': -1,
                'metric': 'RMSE',
        }

        model = lgb.train(params, train_set=d_train, num_boost_round=3600, valid_sets=watchlist, \
        early_stopping_rounds=50, verbose_eval=100) 
        y_pred = model.predict(X_test_cat)

        print  'rmse value is ' + str(calcMse(y_test,y_pred))
        print  ' r^2 value is ' + str(r2_score(y_test, y_pred))
        print  ' correlation coef is ' + str(np.corrcoef(y_test, y_pred)[0,1])
        return y_pred

### Base features + ridge model

In [31]:
def ridgeReg(X_train_cat,y,X_test_cat,y_test):
        s = startTime()
        model = Ridge(solver="sag", fit_intercept=True, random_state=205)
        model.fit(X_train_cat, y)
        endTime(s)
        s = startTime()
        y_pred = model.predict(X=X_test_cat)
        endTime(s)

        print  'rmse value is ' + str(calcMse(y_test,y_pred))
        print  ' r^2 value is ' + str(r2_score(y_test, y_pred))
        print  ' correlation coef is ' + str(np.corrcoef(y_test, y_pred)[0,1])
        return y_pred

### Base features + xgboost model

In [32]:
def XgBoost(X_train_cat,y,X_test_cat,y_test):
        s = startTime()
        train_X, valid_X, train_y, valid_y = train_test_split(X_train_cat, y, test_size = 0.1, random_state = 144) 
        dtrain = xgb.DMatrix(train_X, train_y)
        dvalid  = xgb.DMatrix(valid_X,  valid_y)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        params = {'min_child_weight': 20, 'eta': 0.015, 'colsample_bytree': 0.48, 'max_depth': 14,
                    'subsample': 0.91, 'lambda': 2.01, 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
                    'eval_metric': 'rmse', 'objective': 'reg:linear','tree_method': 'auto'}
        model = xgb.train(params, dtrain, 1000, watchlist, verbose_eval=10, early_stopping_rounds=20)
        y_pred = model.predict(xgb.DMatrix(X_test_cat), ntree_limit=model.best_ntree_limit)
        endTime(s)

        print  'rmse value is ' + str(calcMse(y_test,y_pred))
        print  ' r^2 value is ' + str(r2_score(y_test, y_pred))
        print  ' correlation coef is ' + str(np.corrcoef(y_test, y_pred)[0,1])
        return y_pred

### Lgbm + ridge, basic features

In [33]:
def ensemble(y1,y2,w1,w2,y_test):
    y_pred = w1*y1 + w2*y2
    print  'rmse value is ' + str(calcMse(y_test,y_pred))
    print  ' r^2 value is ' + str(r2_score(y_test, y_pred))
    print  ' correlation coef is ' + str(np.corrcoef(y_test, y_pred)[0,1])
    return y_pred

In [36]:
y_lgb = lgbm(X_train_cat,y,X_test_cat,y_test)
y_rid = ridgeReg(X_train_cat,y,X_test_cat,y_test)
y_xg = XgBoost(X_train_cat,y,X_test_cat,y_test)
y_ens = ensemble(y_lgb,y_rid,0.6,0.4,y_test)

Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.553597	valid_1's rmse: 0.555354
[200]	training's rmse: 0.531934	valid_1's rmse: 0.538467
[300]	training's rmse: 0.521822	valid_1's rmse: 0.532646
[400]	training's rmse: 0.514428	valid_1's rmse: 0.529972
[500]	training's rmse: 0.508105	valid_1's rmse: 0.527372
[600]	training's rmse: 0.504054	valid_1's rmse: 0.527019
Early stopping, best iteration is:
[566]	training's rmse: 0.505406	valid_1's rmse: 0.526751
rmse value is 0.526555531359
 r^2 value is 0.500779241275
 correlation coef is 0.707898793313
Time elapsed 3.99578905106
Time elapsed 0.00165700912476
rmse value is 0.525341760868
 r^2 value is 0.503078110092
 correlation coef is 0.710372567038
[0]	train-rmse:2.55259	valid-rmse:2.54817
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 20 rounds.
[10]	train-rmse:2.22032	valid-rmse:2.21596
[20]	train-rmse:1.93773	valid

## Text Mining : Tf-Idf, NMF, LDA

### Define Tokenizer Function

In [37]:
stop = set(stopwords.words('english'))
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try: 
        regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        text = regex.sub(" ", text) # remove punctuation
        
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search('[a-zA-Z]', w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w)>=3]
        
        return filtered_tokens
            
    except TypeError as e: print(text,e)

### LDA feature extraction

In [43]:
# use raw counts of words
print("Extracting tf features for LDA...")
tfidf_vectorizer_lda = CountVectorizer(min_df=20,max_features=50000,
                    tokenizer=tokenize,ngram_range=(1,2))


#tfidf_vectorizer_lda = CountVectorizer(max_features = 180000, 
#                             ngram_range = (1,3),
#                            stop_words = "english")

#tfidf_vectorizer_lda = CountVectorizer(
#    max_features = 180000,min_df=5, strip_accents='unicode', lowercase =True,
#    analyzer='word', token_pattern=r'\w+', ngram_range=(1, 3),
#    stop_words ='english')

t0 = startTime()
full_tfidf_lda = tfidf_vectorizer_lda.fit_transform(merge['item_description'].apply(str))
train_tfidf_lda = tfidf_vectorizer_lda.transform(df_train['item_description'].apply(str))
test_tfidf_lda = tfidf_vectorizer_lda.transform(df_test['item_description'].apply(str))
endTime(t0)

Extracting tf features for LDA...
Time elapsed 92.7914581299


### TF-IDF feature extraction

In [44]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF and Normal TFID...")

t0 = startTime()
tfidf = TfidfTransformer(norm='l2')
tfidf.fit(full_tfidf_lda)
full_tfidf = tfidf.transform(full_tfidf_lda)
train_tfidf = tfidf.transform(train_tfidf_lda)
test_tfidf = tfidf.transform(test_tfidf_lda)
endTime(t0)

Extracting tf-idf features for NMF and Normal TFID...
Time elapsed 0.135686159134


### SVD on Tf-Idf features

In [49]:

n_comp = 100
print("SVD on TFID to get Latent Representation : k = {} ...".format(n_comp))
t0 = startTime()
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = csr_matrix(svd_obj.transform(train_tfidf))
test_svd =  csr_matrix(svd_obj.transform(test_tfidf))
endTime(t0)

#train_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
#test_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
#df_train = pd.concat([df_train, train_svd], axis=1)
#df_test = pd.concat([df_test, test_svd], axis=1)


SVD on TFID to get Latent Representation : k = 100 ...
Time elapsed 7.71344614029


## Base + Tf-Idf SVD Features

In [51]:
X_train_svd = hstack((X_train_cat, train_svd)).tocsr()
X_test_svd = hstack((X_test_cat, test_svd)).tocsr()

y_lgb_svd = lgbm(X_train_svd,y,X_test_svd,y_test)
printSep()
y_rid_svd = ridgeReg(X_train_svd,y,X_test_svd,y_test)
printSep()
y_xg_svd = XgBoost(X_train_svd,y,X_test_svd,y_test)
printSep()
y_ens_svd = ensemble(y_lgb_svd,y_rid_svd,0.6,0.4,y_test)
printSep()

Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.542515	valid_1's rmse: 0.556927
[200]	training's rmse: 0.515411	valid_1's rmse: 0.543204
[300]	training's rmse: 0.497762	valid_1's rmse: 0.537966
[400]	training's rmse: 0.484462	valid_1's rmse: 0.537559
Early stopping, best iteration is:
[362]	training's rmse: 0.488615	valid_1's rmse: 0.537036
rmse value is 0.538659111823
 r^2 value is 0.477564958676
 correlation coef is 0.692459936577
****************************************

Time elapsed 11.314218998
Time elapsed 0.0102150440216
rmse value is 0.516915054927
 r^2 value is 0.518891932608
 correlation coef is 0.721304441339
****************************************

[0]	train-rmse:2.55258	valid-rmse:2.54819
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 20 rounds.
[10]	train-rmse:2.22085	valid-rmse:2.21674
[20]	train-rmse:1.93782	valid-rmse:1.93425
[30]	train-rmse:1.

* ### NMF - Frobenious Norm,Kullback-Leibler, Divergence, LDA

In [52]:
n_components = 10
n_top_words = 10

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d .."
      % (n_samples))
t0 = startTime()
nmf_frob = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(full_tfidf)
endTime(t0)

train_nmf_frob = csr_matrix(nmf_frob.transform(train_tfidf))
test_nmf_frob = csr_matrix(nmf_frob.transform(test_tfidf))

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=100000 ..
Time elapsed 21.6252441406


## Base + Tfidf-SVD + NMF-Frobenius

In [53]:
X_train_nmf = hstack((X_train_cat, train_svd, train_nmf_frob)).tocsr()
X_test_nmf = hstack((X_test_cat, test_svd, test_nmf_frob)).tocsr()

y_lgb_nmf = lgbm(X_train_nmf,y,X_test_nmf,y_test)
printSep()
y_rid_nmf = ridgeReg(X_train_nmf,y,X_test_nmf,y_test)
printSep()
y_xg_nmf = XgBoost(X_train_nmf,y,X_test_nmf,y_test)
printSep()
y_ens_nmf = ensemble(y_lgb_nmf,y_rid_nmf,0.6,0.4,y_test)
printSep()

Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.541422	valid_1's rmse: 0.554417
[200]	training's rmse: 0.514476	valid_1's rmse: 0.540421
[300]	training's rmse: 0.495864	valid_1's rmse: 0.535435
[400]	training's rmse: 0.480871	valid_1's rmse: 0.533294
Early stopping, best iteration is:
[423]	training's rmse: 0.478757	valid_1's rmse: 0.532874
rmse value is 0.535991405575
 r^2 value is 0.482726858066
 correlation coef is 0.696553548529
****************************************

Time elapsed 10.7536699772
Time elapsed 0.00996208190918
rmse value is 0.516908269442
 r^2 value is 0.518904563427
 correlation coef is 0.721313237715
****************************************

[0]	train-rmse:2.55259	valid-rmse:2.54821
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 20 rounds.
[10]	train-rmse:2.22051	valid-rmse:2.21656
[20]	train-rmse:1.93742	valid-rmse:1.9338
[30]	train-rmse:1

# Fit the NMF model -  KLD
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d .."
      % (n_samples))
t0 = time()
nmf_kld = NMF(n_components=n_components, random_state=1,
          beta_loss= 'kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(full_tfidf)
endTime(t0)




## LDA Fit

In [54]:
#sample_sz = 100000
#combined_sample = merge.sample(n=sample_sz)

print("Fitting LDA models with tf features, "
      "n_samples=%d and .."
      % (n_samples))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = startTime()
lda.fit(full_tfidf_lda)
endTime(t0)



train_lda = csr_matrix(lda.transform(train_tfidf_lda))
test_lda = csr_matrix(lda.transform(test_tfidf_lda))


Fitting LDA models with tf features, n_samples=100000 and ..
Time elapsed 560.897192955


## Base + Tf- Idf : SVD + Nmf : Frobenius + LDA

In [55]:
X_train_lda = hstack((X_train_cat, train_svd, train_nmf_frob, train_lda)).tocsr()
X_test_lda = hstack((X_test_cat, test_svd, test_nmf_frob,test_lda)).tocsr()

y_lgb_lda = lgbm(X_train_lda,y,X_test_lda,y_test)
printSep()
y_rid_lda = ridgeReg(X_train_lda,y,X_test_lda,y_test)
printSep()
y_xg_lda = XgBoost(X_train_lda,y,X_test_lda,y_test)
printSep()
y_ens_lda = ensemble(y_lgb_lda,y_rid_lda,0.6,0.4,y_test)
printSep()

Training until validation scores don't improve for 50 rounds.
[100]	training's rmse: 0.543283	valid_1's rmse: 0.556673
[200]	training's rmse: 0.511156	valid_1's rmse: 0.540191
[300]	training's rmse: 0.493785	valid_1's rmse: 0.535101
[400]	training's rmse: 0.478592	valid_1's rmse: 0.534222
Early stopping, best iteration is:
[384]	training's rmse: 0.480788	valid_1's rmse: 0.533045
rmse value is 0.536690988055
 r^2 value is 0.4813756745
 correlation coef is 0.695674065741
****************************************

Time elapsed 11.963465929
Time elapsed 0.0106019973755
rmse value is 0.516806161905
 r^2 value is 0.519094611144
 correlation coef is 0.721453556
****************************************

[0]	train-rmse:2.55259	valid-rmse:2.54823
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 20 rounds.
[10]	train-rmse:2.22038	valid-rmse:2.21628
[20]	train-rmse:1.93738	valid-rmse:1.9337
[30]	train-rmse:1.69658	

In [None]:
train_nmf_frob = csr_matrix(nmf_frob.transform(train_tfidf))
test_nmf_frob = csr_matrix(nmf_frob.transform(test_tfidf))

#train_nmf_kld = csr_matrix(nmf_kld.transform(train_tfidf))
#test_nmf_kld = csr_matrix(nmf_kld.transform(test_tfidf))

train_lda = csr_matrix(lda.transform(train_tfidf_lda))
test_lda = csr_matrix(lda.transform(test_tfidf_lda))


#train_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
#test_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
#df_train = pd.concat([df_train, train_nmf_frob_df], axis=1)
#df_test = pd.concat([df_test, test_nmf_frob_df], axis=1)

#train_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
#test_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
#df_train = pd.concat([df_train, train_nmf_kld_df], axis=1)
#df_test = pd.concat([df_test, test_nmf_kld_df], axis=1)

#train_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
#test_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
#df_train = pd.concat([df_train, train_lda_df], axis=1)
#df_test = pd.concat([df_test, test_lda_df], axis=1)



In [56]:
print(df_train.shape)
print(df_test.shape)

(1000, 8)
(1000, 7)


In [None]:
X_train_cat = sparse_merge[:nrow_train]
X_test_cat = sparse_merge[nrow_train:]
#X_train = hstack((X_train_cat, train_svd, train_lda, train_nmf_frob, train_nmf_kld)).tocsr()
#X_test = hstack((X_test_cat, test_svd, test_lda, test_nmf_frob, test_nmf_kld)).tocsr()
X_train = hstack((X_train_cat, train_svd, train_lda, train_nmf_frob)).tocsr()
X_test = hstack((X_test_cat, test_svd, test_lda, test_nmf_frob)).tocsr()

In [None]:
def calcMse(yTrue, yPred):
    s = 0
    for i in range(len(yTrue)):
        s += (yTrue[i]-yPred[i])**2
    return math.sqrt(s*1.0/len(yTrue))

In [None]:
train_X, valid_X, train_y, valid_y = train_test_split(X_train, y, test_size = 0.1, random_state = 144) 
d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
watchlist = [d_train, d_valid]
params = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 50,
        'verbosity': -1,
        'metric': 'RMSE',
}

model = lgb.train(params, train_set=d_train, num_boost_round=3600, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=100) 
preds = 0.55*model.predict(X_test)

s = startTime()
model = Ridge(solver="sag", fit_intercept=True, random_state=205)
model.fit(X_train, y)
endTime(s)
s = startTime()
preds += 0.45*model.predict(X=X_test)
endTime(s)
y_pred = np.expm1(preds)
print  'rmse value is ' + str(calcMse(y_test,y_pred))