## Mercari Price Suggestion Challenge

The objective of this challenge is to build an algorithm that automatically suggests the right product prices on Mercari. The training data consists of user-inputted text descriptions of their products, including details like product category name, brand name, and item condition

In [29]:
# Import required libraries
import numpy as np
import pandas as pd
import string
import re
import pickle

# Text mining 
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import stop_words
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelBinarizer

# Time 
from time import time

#Plots
import matplotlib.pyplot as plt

In [18]:
def startTime():
    return time()
def endTime(s):
    print ("Time elapsed {}".format(time()-s))

In [19]:
#df_train = pd.read_csv('../input/train.tsv', sep='\t')
#df_test = pd.read_csv('../input/test.tsv', sep='\t')

df_train = pd.read_csv('../../../data/train.tsv', sep='\t')
df_test = pd.read_csv('../../../data/test.tsv', sep='\t')

# Retain only part of the data 
n_samples = 10000
df_train = df_train.iloc[:n_samples,:]
df_test = df_test.iloc[:n_samples,:]

In [20]:
# Nulls in item description in train or test as tf-idf is not defined on nan
# lets drop these 4 items
df_train = df_train.loc[df_train.item_description == df_train.item_description]
df_train = df_train.loc[df_train.name == df_train.name]
print("Dropped records where item description was nan")

Dropped records where item description was nan


### Creating Categorical Features

In [21]:
NUM_BRANDS = 4000
NUM_CATEGORIES = 1000
NAME_MIN_DF = 10

def handle_missing_inplace(dataset):
    dataset['category_name'].fillna(value='missing', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    # dataset['item_description'].fillna(value='missing', inplace=True)


def cutting(dataset):
    pop_brand = dataset['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
    pop_category = dataset['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_CATEGORIES]
    dataset.loc[~dataset['category_name'].isin(pop_category), 'category_name'] = 'missing'


def to_categorical(dataset):
    dataset['category_name'] = dataset['category_name'].astype('category')
    dataset['brand_name'] = dataset['brand_name'].astype('category')
    dataset['item_condition_id'] = dataset['item_condition_id'].astype('category')
    dataset['shipping'] = dataset['shipping'].astype('category')

In [22]:
s = startTime()
handle_missing_inplace(df_train)
handle_missing_inplace(df_test)
print('[{}] Finished to handle missing'.format(endTime(s)))

s = startTime()
cutting(df_train)
cutting(df_test)
print('[{}] Finished to cut'.format(endTime(s)))

s = startTime()
to_categorical(df_train)
to_categorical(df_test)
print('[{}] Finished to convert categorical'.format(endTime(s)))

Time elapsed 0.173171043396
[None] Finished to handle missing
Time elapsed 0.0727500915527
[None] Finished to cut
Time elapsed 0.0482790470123
[None] Finished to convert categorical


In [25]:
# product name related features 
s = startTime()
tf_vec = CountVectorizer(stop_words='english', ngram_range=(1,1))
full_tf = tf_vec.fit_transform(df_train['name'].values.tolist() + df_test['name'].values.tolist())
train_tf = tf_vec.transform(df_train['name'].values.tolist())
test_tf = tf_vec.transform(df_test['name'].values.tolist())

n_comp = 40
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tf.astype('float32'))
train_svd = pd.DataFrame(svd_obj.transform(train_tf))
test_svd = pd.DataFrame(svd_obj.transform(test_tf))
    
train_svd.columns = ['svd_name_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_name_'+str(i) for i in range(n_comp)]
df_train = pd.concat([df_train, train_svd], axis=1)
df_test = pd.concat([df_test, test_svd], axis=1)
endTime(s)

In [27]:
s = startTime()
tf_vec = CountVectorizer(stop_words='english', ngram_range=(1,1))
full_tf = tf_vec.fit_transform(df_train['category_name'].values.tolist() + df_test['category_name'].values.tolist())
train_tf = tf_vec.transform(df_train['category_name'].values.tolist())
test_tf = tf_vec.transform(df_test['category_name'].values.tolist())

n_comp = 40
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tf.astype('float32'))
train_svd = pd.DataFrame(svd_obj.transform(train_tf))
test_svd = pd.DataFrame(svd_obj.transform(test_tf))
    
train_svd.columns = ['svd_category_name_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_category_name_'+str(i) for i in range(n_comp)]
df_train = pd.concat([df_train, train_svd], axis=1)
df_test = pd.concat([df_test, test_svd], axis=1)
endTime(s)

Time elapsed 0.509783029556


In [31]:
s = startTime()
lb = LabelBinarizer(sparse_output=True)
brand_Vctzr = lb.fit_transform(df_train['brand_name'] + df_test['brand_name'])
brand_trainVctzr = pd.DataFrame(lb.transform(df_train['brand_name'].apply(str)))
brand_testVctzr = pd.DataFrame(lb.transform(df_test['brand_name'].apply(str)))
print('[{}] Finished label binarize `brand_name`'.format(time.time() - s))

TypeError: Categorical cannot perform the operation +

In [None]:


s = startTime()
nrow_train = df_train.shape[0]
print nrow_train
merge_df = pd.concat([df_train, df_test])
item_shipping_dummies = (pd.get_dummies(merge_df[['item_condition_id', 'shipping']],
                                         sparse=True).values)
print (item_shipping_dummies)
print('[{}] Finished to get dummies on `item_condition_id` and `shipping`'.format(endTime(s)))

In [16]:
print a.shape

(10000, 1153)


## Text Mining : Tf-Idf, NMF, LDA

### Define Tokenizer Function

In [None]:
stop = set(stopwords.words('english'))
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try: 
        regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        text = regex.sub(" ", text) # remove punctuation
        
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search('[a-zA-Z]', w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w)>=3]
        
        return filtered_tokens
            
    except TypeError as e: print(text,e)

### TF-IDF feature extraction

In [None]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF and Normal TFID...")
tfidf_vectorizer = TfidfVectorizer(min_df=10,
                             max_features=180000,
                             tokenizer=tokenize,
                             ngram_range=(1, 2))

t0 = startTime()
full_tfidf = tfidf_vectorizer.fit_transform(df_train['item_description'].apply(str) + df_test['item_description'].apply(str))
train_tfidf = tfidf_vectorizer.transform(df_train['item_description'].apply(str))
test_tfidf = tfidf_vectorizer.transform(df_test['item_description'].apply(str))
endTime(t0)

### SVD on Tf-Idf features

In [None]:

n_comp = 25
print("SVD on TFID to get Latent Representation : k = {} ...".format(n_comp))
t0 = startTime()
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
endTime(t0)

train_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
df_train = pd.concat([df_train, train_svd], axis=1)
df_test = pd.concat([df_test, test_svd], axis=1)


### LDA feature extraction

In [None]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tfidf_vectorizer_lda = CountVectorizer(min_df=4,max_features=180000,
                     tokenizer=tokenize,ngram_range=(1,2))
t0 = startTime()
full_tfidf_lda = tfidf_vectorizer_lda.fit_transform(df_train['item_description'].apply(str) + df_test['item_description'].apply(str))
train_tfidf_lda = tfidf_vectorizer_lda.transform(df_train['item_description'].apply(str))
test_tfidf_lda = tfidf_vectorizer_lda.transform(df_test['item_description'].apply(str))
endTime(t0)

* ### NMF - Frobenious Norm,Kullback-Leibler, Divergence, LDA

In [None]:
n_components = 10
n_top_words = 10

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d .."
      % (n_samples))
t0 = startTime()
nmf_frob = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(full_tfidf)
endTime(t0)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d .."
      % (n_samples))
t0 = time()
nmf_kld = NMF(n_components=n_components, random_state=1,
          beta_loss= 'kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(full_tfidf)
endTime(t0)

print("Fitting LDA models with tf features, "
      "n_samples=%d and .."
      % (n_samples))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = startTime()
lda.fit(full_tfidf_lda)
endTime(t0)


In [None]:
train_nmf_frob_df = pd.DataFrame(nmf_frob.transform(train_tfidf))
test_nmf_frob_df = pd.DataFrame(nmf_frob.transform(test_tfidf))

train_nmf_kld_df = pd.DataFrame(nmf_kld.transform(train_tfidf))
test_nmf_kld_df = pd.DataFrame(nmf_kld.transform(test_tfidf))

train_lda_df = pd.DataFrame(lda.transform(train_tfidf_lda))
test_lda_df = pd.DataFrame(lda.transform(test_tfidf_lda))


train_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
test_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
df_train = pd.concat([df_train, train_nmf_frob_df], axis=1)
df_test = pd.concat([df_test, test_nmf_frob_df], axis=1)

train_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
test_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
df_train = pd.concat([df_train, train_nmf_kld_df], axis=1)
df_test = pd.concat([df_test, test_nmf_kld_df], axis=1)

train_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
test_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
df_train = pd.concat([df_train, train_lda_df], axis=1)
df_test = pd.concat([df_test, test_lda_df], axis=1)



In [None]:
print(df_train.shape)
print(df_test.shape)

In [None]:
df_train.head(20)