## Mercari Price Suggestion Challenge

The objective of this challenge is to build an algorithm that automatically suggests the right product prices on Mercari. The training data consists of user-entered text descriptions of products, along with details such as the product category name, brand name, and item condition.

In [1]:
# Import required libraries
# Standard library (string handling, time)
import string
from time import time

import numpy as np
import pandas as pd

# Text mining 
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Exploratory Data Analysis

In [17]:
def startTime():
    """Return the current timestamp (seconds, as given by ``time()``).

    The value is meant to be handed to ``endTime`` later to report how
    long the code in between took.
    """
    started_at = time()
    return started_at
def endTime(s):
    """Print the time elapsed since ``s``.

    Parameters
    ----------
    s : float
        A timestamp previously obtained from ``startTime()`` / ``time()``.

    Prints a line of the form ``Time elapsed <seconds>`` and returns None.
    """
    elapsed = time() - s
    # Parenthesised single-argument print behaves identically under
    # Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
    print("Time elapsed {}".format(elapsed))


In [3]:
# Load the training split (tab-separated values) into a DataFrame.
train_path = '../../data/train.tsv'
train_df = pd.read_csv(train_path, sep='\t')
# The test split is not loaded in this notebook:
#test = pd.read_csv('../../data/test.tsv', sep='\t')

In [4]:
# Size of the training dataset as (rows, columns)
print(train_df.shape)

(593376, 8)


In [5]:
# different data types in the dataset: categorical (strings) and numeric
train_df.dtypes


train_id               int64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object

In [6]:
# Overall summary of train data
train_df.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,593376.0,593376.0,593376.0,593376.0
mean,296687.5,1.907738,26.689003,0.446294
std,171293.041002,0.902921,38.340061,0.497108
min,0.0,1.0,0.0,0.0
25%,148343.75,1.0,10.0,0.0
50%,296687.5,2.0,17.0,0.0
75%,445031.25,3.0,29.0,1.0
max,593375.0,5.0,2000.0,1.0


In [7]:
# First few rows of the dataset 
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [8]:
# Retain only the first n_samples rows so the experiments below run quickly.
n_samples = 1000
# .copy() detaches the sample from the full frame: later column assignments
# on train_df then act on an independent DataFrame (no SettingWithCopyWarning)
# and the full-size frame is no longer referenced through the slice.
train_df = train_df.iloc[:n_samples, :].copy()
print(train_df.shape)

(1000, 8)


In [20]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_``; each row is
        treated as one topic's weights over the features.
    feature_names : sequence of str
        Term for each feature index (column of ``components_``).
    n_top_words : int
        Number of terms to list per topic.

    Prints one ``Topic #<i>: term term ...`` line per topic, terms ordered
    from highest to lowest weight, followed by a single blank line.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    # print("") emits a blank line on both Python 2 and Python 3, whereas a
    # bare print() prints "()" under Python 2 without print_function.
    print("")

In [21]:
# NOTE(review): n_features is not passed to either vectorizer in this cell
# (the tf-idf vectorizer has no max_features, the count vectorizer uses
# 180000), so it does not cap the vocabulary size.
n_features = 1000
n_components = 10
n_top_words = 20

# scikit-learn's text vectorizers raise on NaN documents, so replace missing
# descriptions with empty strings once and reuse the cleaned column for both
# feature extractions.
descriptions = train_df['item_description'].fillna('')

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    min_df=5, strip_accents='unicode', lowercase=True,
    analyzer='word', token_pattern=r'\w+', ngram_range=(1, 3), use_idf=True,
    smooth_idf=True, sublinear_tf=True, stop_words='english')

t0 = startTime()
tfidf = tfidf_vectorizer.fit_transform(descriptions)
endTime(t0)

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(min_df=4,
                                max_features=180000,
                                strip_accents='unicode', lowercase=True,
                                analyzer='word', token_pattern=r'\w+', ngram_range=(1, 3),
                                stop_words='english')
t0 = startTime()
tf = tf_vectorizer.fit_transform(descriptions)
endTime(t0)

Extracting tf-idf features for NMF...
Time elapsed 0.338000059128
Extracting tf features for LDA...
Time elapsed 0.266000032425


In [24]:
# Fit three topic models (NMF/Frobenius, NMF/KL, LDA) and print their topics.
#
# The NMF `beta_loss` / `solver='mu'` options were added in scikit-learn 0.19;
# on older releases the second fit fails with
# "TypeError: __init__() got an unexpected keyword argument 'beta_loss'".

def _fit_and_report_topics(model, features, feature_names, heading):
    """Fit ``model`` on ``features``, then print timing and top words.

    Parameters
    ----------
    model : estimator with ``fit`` and, once fitted, ``components_``
    features : document-term matrix passed to ``model.fit``
    feature_names : sequence of str, term for each column of ``features``
    heading : str, line printed above the topic listing

    Returns the fitted model.
    """
    t0 = time()
    model.fit(features)
    print("done in %0.3fs." % (time() - t0))

    print("\n" + heading)
    print_top_words(model, feature_names, n_top_words)
    return model


# Feature names depend only on the fitted vectorizers, so fetch them once.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tf_feature_names = tf_vectorizer.get_feature_names()

# The messages report the real matrix dimensions (rows, columns) rather than
# the n_samples / n_features constants, which need not match the matrices.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % tfidf.shape)
nmf_frobenius = _fit_and_report_topics(
    NMF(n_components=n_components, random_state=1,
        alpha=.1, l1_ratio=.5),
    tfidf, tfidf_feature_names,
    "Topics in NMF model (Frobenius norm):")

print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % tfidf.shape)
nmf_kl = _fit_and_report_topics(
    NMF(n_components=n_components, random_state=1,
        beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
        l1_ratio=.5),
    tfidf, tfidf_feature_names,
    "Topics in NMF model (generalized Kullback-Leibler divergence):")
# Keep the name the cell originally left bound to the last NMF model.
nmf = nmf_kl

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % tf.shape)
lda = _fit_and_report_topics(
    LatentDirichletAllocation(n_components=n_components, max_iter=5,
                              learning_method='online',
                              learning_offset=50.,
                              random_state=0),
    tf, tf_feature_names,
    "Topics in LDA model:")

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=1000 and n_features=1000...
done in 0.150s.

Topics in NMF model (Frobenius norm):
Topic #0: description zipper flaw free ask free foundation forever flowers flower floral fleece flaws flat free pet fl oz fl fitted fits like fits fit
Topic #1: new brand new brand brand new used new used new tags brand new tags tags opened new tag leggings package new worn tag sealed beautiful background new size packaging new sealed
Topic #2: size s medium like small size medium black 7 large size small like new pink women size 7 8 women s fits white secret victoria
Topic #3: shipping free shipping rm free price firm price firm bundle 2 3 1 save items item ask authentic t 4 fast retail
Topic #4: condition great great condition good good condition excellent perfect condition perfect times excellent condition boys condition worn comes 9 times great book 3 xl item great shape
Topic #5: box new box authentic new 100 100 authentic brand 

TypeError: __init__() got an unexpected keyword argument 'beta_loss'

In [None]:
def compute_tfidf(description, idf_by_term=None):
    """Return the mean per-word weight of a description.

    The text is coerced to ``str``, stripped of ASCII punctuation,
    lower-cased and split on whitespace. Every word found in the lookup
    contributes its weight; words missing from the lookup contribute
    nothing but still count toward the denominator.

    Parameters
    ----------
    description : object
        Item description; any value is accepted and converted with ``str``.
    idf_by_term : mapping of str -> float, optional
        Weight per term. When omitted, the module-level ``tfidf_dict`` is
        used (the notebook builds it from the vectorizer's ``idf_`` values).

    Returns
    -------
    float
        Sum of matched weights divided by the number of words, or 0.0 when
        the description contains no words.
    """
    if idf_by_term is None:
        idf_by_term = tfidf_dict

    # str.translate returns a new string, so its result must be used; the
    # join-based filter also works on both Python 2 and Python 3, unlike
    # string.maketrans with two-argument translate.
    cleaned = "".join(ch for ch in str(description)
                      if ch not in string.punctuation)
    words = cleaned.lower().split()

    if not words:
        return 0.0
    # Start from 0.0 so the division is a float division on Python 2 as well.
    return sum((idf_by_term.get(w, 0.0) for w in words), 0.0) / len(words)



# `tfidf` is the sparse matrix returned by tfidf_vectorizer.fit_transform, not
# a vectorizer, and no `df_train` is defined in the visible notebook (the
# frame is `train_df`). Use the vectorizer already fitted on
# train_df['item_description'] to build the term -> idf lookup.
#
# The vectorizer uses ngram_range=(1, 3), so the lookup also holds 2- and
# 3-word n-grams; compute_tfidf looks up single whitespace-separated words,
# so only the unigram entries can match.
tfidf_dict = dict(zip(tfidf_vectorizer.get_feature_names(),
                      tfidf_vectorizer.idf_))

# assign() returns a new frame, which avoids writing into a slice of the
# originally loaded data.
train_df = train_df.assign(
    tfidf=train_df['item_description'].apply(compute_tfidf))