## Mercari Price Suggestion Challenge

The objective of this challenge is to build an algorithm that automatically suggests the right product prices on Mercari. The training data consists of user-inputted text descriptions of their products, including details like product category name, brand name, and item condition.

In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import string
import re
import pickle
import gc

# Text mining 
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import stop_words
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

# Time 
from time import time

#Plots
import matplotlib.pyplot as plt

In [None]:
def startTime():
    """Return the current timestamp; pass it to endTime() to report a duration."""
    started_at = time()
    return started_at
def endTime(s):
    """Print how many seconds have passed since the timestamp `s`."""
    elapsed = time() - s
    print("Time elapsed {}".format(elapsed))

In [28]:
# Kaggle kernel paths, kept for reference:
#df_train = pd.read_csv('../input/train.tsv', sep='\t')
#df_test = pd.read_csv('../input/test.tsv', sep='\t')

# Directory that holds the competition TSV files in the local layout.
DATA_DIR = '../../../data'
# Number of leading training rows to skip. The slice below previously used an
# undefined, misspelled name, which raised NameError on a fresh kernel.
# 0 keeps every training row.
TRAIN_ROWS_TO_SKIP = 0

df_train = pd.read_csv(DATA_DIR + '/train.tsv', sep='\t')
df_test = pd.read_csv(DATA_DIR + '/test.tsv', sep='\t')

df_train = df_train[TRAIN_ROWS_TO_SKIP:]

In [30]:
# tf-idf is not defined on NaN, so drop the rows whose item_description is
# missing (train and test) and the training rows whose name is missing.
# NOTE(review): rows removed from df_test are also absent from the submission
# frame that is built from df_test later - confirm that is acceptable.
print(df_train.shape)
print(df_test.shape)
df_train = df_train.dropna(subset=['item_description', 'name'])
df_test = df_test.dropna(subset=['item_description'])
print("Dropped records where item description was nan")
print(df_train.shape)
print(df_test.shape)

(1482531, 8)
(693359, 7)
Dropped records where item description was nan
(1482531, 8)
(693359, 7)


### Creating Categorical Features

In [5]:
# Maximum number of distinct brands kept by cutting(); all others become 'missing'.
NUM_BRANDS = 4000
# Maximum number of distinct categories kept by cutting(); all others become 'missing'.
NUM_CATEGORIES = 1000
# min_df passed to the CountVectorizer that is fitted on the item name.
NAME_MIN_DF = 10

def handle_missing_inplace(dataset):
    """Replace missing category and brand values with the string 'missing'.

    The columns are assigned back onto ``dataset`` rather than filled through
    ``dataset[col].fillna(..., inplace=True)``: that form operates on a
    temporary column object and leaves the frame unchanged when pandas
    Copy-on-Write is active.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``category_name`` and ``brand_name`` columns; modified in
        place.

    Returns
    -------
    None
    """
    for column in ('category_name', 'brand_name'):
        dataset[column] = dataset[column].fillna('missing')
    # item_description is deliberately left untouched here.


def cutting(dataset, num_brands=None, num_categories=None):
    """Collapse rare brands and categories into the label 'missing'.

    For ``brand_name`` and ``category_name`` the most frequent values
    (ignoring the placeholder 'missing') are kept; every other value is
    overwritten with 'missing'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``brand_name`` and ``category_name`` columns; modified in
        place.
    num_brands : int, optional
        How many of the most frequent brands to keep. Defaults to the
        module-level ``NUM_BRANDS``.
    num_categories : int, optional
        How many of the most frequent categories to keep. Defaults to the
        module-level ``NUM_CATEGORIES``.

    Returns
    -------
    None
    """
    # Resolve the defaults at call time so later edits to the constants apply.
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    limits = (('brand_name', num_brands), ('category_name', num_categories))
    for column, keep in limits:
        # value_counts() is sorted by descending frequency, so the head of the
        # index holds the most popular values.
        counts = dataset[column].value_counts()
        popular = counts.loc[counts.index != 'missing'].index[:keep]
        dataset.loc[~dataset[column].isin(popular), column] = 'missing'


def to_categorical(dataset):
    """Cast the four categorical feature columns of ``dataset`` to 'category' dtype in place."""
    categorical_columns = ('category_name', 'brand_name', 'item_condition_id', 'shipping')
    for column in categorical_columns:
        dataset[column] = dataset[column].astype('category')

In [6]:
# The first nrow_train rows of `merge` are the training rows, the rest are test rows.
nrow_train = df_train.shape[0]
# The regression target is log1p(price).
y = np.log1p(df_train["price"])
# ignore_index gives the stacked frame a unique 0..n-1 index; otherwise the
# train and test labels overlap and label-based indexing becomes ambiguous.
merge = pd.concat([df_train, df_test], ignore_index=True)
# .copy() makes `submission` an independent frame, so the later assignment of
# the price column does not write into a slice of df_test.
submission = df_test[['test_id']].copy()
gc.collect()

s = startTime()
handle_missing_inplace(merge)
cutting(merge)
to_categorical(merge)
endTime(s)

Time elapsed 7.4856157302856445


In [7]:
# Token counts of the item name over the combined train+test frame.
# min_df=NAME_MIN_DF drops tokens whose document frequency is below that threshold.
s = startTime()
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(merge['name'])
endTime(s)

Time elapsed 15.007386684417725


In [8]:
# Token counts of category_name using the default CountVectorizer settings.
# NOTE: this rebinds `cv`, so the vectorizer fitted on the item name is no
# longer reachable under that name after this cell runs.
s = startTime()
cv = CountVectorizer()
X_category = cv.fit_transform(merge['category_name'])
endTime(s)

Time elapsed 12.96649432182312


In [9]:
# One-hot encode brand_name; sparse_output=True makes the result a sparse matrix.
s = startTime()
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
endTime(s)

Time elapsed 11.55437684059143


In [10]:
# Dummy-encode item_condition_id and shipping, then wrap the result as CSR.
# NOTE(review): `.values` presumably materialises a plain array before the CSR
# conversion - confirm memory use for the pandas version in use.
s = startTime()
X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
endTime(s)

Time elapsed 10.293352365493774


In [11]:
# Column-wise concatenation of the four feature blocks into one CSR matrix.
# Rows follow `merge`, which was built by concatenating df_train then df_test.
sparse_merge = hstack((X_dummies, X_brand, X_category, X_name)).tocsr()

## Text Mining : Tf-Idf, NMF, LDA

### Define Tokenizer Function

In [12]:
stop = set(stopwords.words('english'))
# Matches punctuation characters, digits, carriage returns, tabs and newlines.
# Compiled once here instead of on every tokenize() call.
_PUNCT_DIGIT_RE = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into lower-cased word tokens for the vectorizers.

    Punctuation, digits and line-control characters are replaced by spaces,
    then the text is segmented with ``sent_tokenize`` and ``word_tokenize``.
    A token is kept when it is not an English stop word, contains at least
    one ASCII letter and is at least three characters long.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The filtered, lower-cased tokens. If ``text`` is not a string the
        offending value and the error are printed and an empty list is
        returned, so the caller always receives an iterable. Previously this
        path returned None implicitly.
    """
    try:
        cleaned = _PUNCT_DIGIT_RE.sub(" ", text)  # remove punctuation
    except TypeError as e:
        print(text,e)
        return []

    return [
        word.lower()
        for sentence in sent_tokenize(cleaned)
        for word in word_tokenize(sentence)
        if word.lower() not in stop
        and re.search('[a-zA-Z]', word)
        and len(word) >= 3
    ]

### TF-IDF feature extraction

In [50]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF and Normal TFID...")
tfidf_vectorizer = TfidfVectorizer(min_df=10,
                                   max_features=180000,
                                   tokenizer=tokenize,
                                   ngram_range=(1, 2))

t0 = startTime()
full_tfidf = tfidf_vectorizer.fit_transform(merge['item_description'].apply(str))
# `merge` is df_train stacked on top of df_test, so the train and test
# matrices are row slices of full_tfidf. Slicing avoids tokenizing every
# description a second time through transform().
train_tfidf = full_tfidf[:df_train.shape[0]]
test_tfidf = full_tfidf[df_train.shape[0]:]
endTime(t0)

Extracting tf-idf features for NMF and Normal TFID...
Time elapsed 1445.708969116211


### SVD on Tf-Idf features

In [None]:
# Project the tf-idf matrix onto n_comp latent components with truncated SVD.
n_comp = 25
print("SVD on TFID to get Latent Representation : k = {} ...".format(n_comp))
t0 = startTime()
# fit() returns the estimator itself, so construction and fitting are chained.
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack').fit(full_tfidf)
train_svd, test_svd = (
    csr_matrix(svd_obj.transform(block)) for block in (train_tfidf, test_tfidf)
)
endTime(t0)

#train_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
#test_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
#df_train = pd.concat([df_train, train_svd], axis=1)
#df_test = pd.concat([df_test, test_svd], axis=1)


SVD on TFID to get Latent Representation : k = 25 ...
Time elapsed 84.3632824420929


### LDA feature extraction

In [None]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tfidf_vectorizer_lda = CountVectorizer(min_df=4,max_features=180000,
                     tokenizer=tokenize,ngram_range=(1,2))
t0 = startTime()
full_tfidf_lda = tfidf_vectorizer_lda.fit_transform(merge['item_description'].apply(str))
train_tfidf_lda = tfidf_vectorizer_lda.transform(df_train['item_description'].apply(str))
test_tfidf_lda = tfidf_vectorizer_lda.transform(df_test['item_description'].apply(str))
endTime(t0)

### NMF (Frobenius Norm, Kullback-Leibler Divergence) and LDA

In [None]:
n_components = 10
n_top_words = 10
# Number of documents the models are fitted on. This name was used in the
# progress messages below without ever being defined (NameError).
n_samples = full_tfidf.shape[0]

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d .."
      % (n_samples))
t0 = startTime()
nmf_frob = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(full_tfidf)
endTime(t0)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d .."
      % (n_samples))
# startTime() is used here like in every other timed section.
t0 = startTime()
nmf_kld = NMF(n_components=n_components, random_state=1,
          beta_loss= 'kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(full_tfidf)
endTime(t0)

# LDA is fitted on the raw term counts rather than on the tf-idf weights.
print("Fitting LDA models with tf features, "
      "n_samples=%d and .."
      % (n_samples))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = startTime()
lda.fit(full_tfidf_lda)
endTime(t0)


In [None]:
# Topic weights from each fitted model, wrapped as CSR so they can be stacked
# with the other sparse feature blocks. Train is transformed before test.
train_nmf_frob, test_nmf_frob = (
    csr_matrix(nmf_frob.transform(block)) for block in (train_tfidf, test_tfidf)
)

train_nmf_kld, test_nmf_kld = (
    csr_matrix(nmf_kld.transform(block)) for block in (train_tfidf, test_tfidf)
)

# The LDA model was fitted on raw counts, so it is fed the count matrices.
train_lda, test_lda = (
    csr_matrix(lda.transform(block)) for block in (train_tfidf_lda, test_tfidf_lda)
)


#train_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
#test_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
#df_train = pd.concat([df_train, train_nmf_frob_df], axis=1)
#df_test = pd.concat([df_test, test_nmf_frob_df], axis=1)

#train_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
#test_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
#df_train = pd.concat([df_train, train_nmf_kld_df], axis=1)
#df_test = pd.concat([df_test, test_nmf_kld_df], axis=1)

#train_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
#test_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
#df_train = pd.concat([df_train, train_lda_df], axis=1)
#df_test = pd.concat([df_test, test_lda_df], axis=1)



In [None]:
# Shapes of the train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Split the categorical/name block back into train and test rows, then append
# the latent text features column-wise.
X_train_cat, X_test_cat = sparse_merge[:nrow_train], sparse_merge[nrow_train:]
X_train = hstack(
    (X_train_cat, train_svd, train_lda, train_nmf_frob, train_nmf_kld)
).tocsr()
X_test = hstack(
    (X_test_cat, test_svd, test_lda, test_nmf_frob, test_nmf_kld)
).tocsr()

In [None]:
# Hold out 10% of the training rows to monitor LightGBM and drive early stopping.
train_X, valid_X, train_y, valid_y = train_test_split(X_train, y, test_size = 0.1, random_state = 144)
d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
watchlist = [d_train, d_valid]
params = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 50,
        'verbosity': -1,
        'metric': 'RMSE',
}

model = lgb.train(params, train_set=d_train, num_boost_round=3600, valid_sets=watchlist,
                  early_stopping_rounds=50, verbose_eval=100)
# The final prediction is a weighted blend in log1p(price) space:
# 0.55 * LightGBM + 0.45 * Ridge.
preds = 0.55*model.predict(X_test)

# The Ridge model is fitted on all training rows (no hold-out).
s = startTime()
model = Ridge(solver="sag", fit_intercept=True, random_state=205)
model.fit(X_train, y)
endTime(s)
s = startTime()
preds += 0.45*model.predict(X=X_test)
endTime(s)
# Map back from log1p space. A negative blended prediction would produce a
# negative price, so the result is clipped at zero.
# assign() builds a new frame instead of writing into `submission`, which was
# sliced out of df_test (avoids SettingWithCopyWarning).
submission = submission.assign(price=np.clip(np.expm1(preds), 0, None))
submission.to_csv("submission_lgbm_ridge_6.csv", index=False)