## Mercari Price Suggestion Challenge

The objective of this challenge is to build an algorithm that automatically suggests the right product prices on Mercari. The training data consists of user-inputted text descriptions of their products, including details like product category name, brand name, and item condition

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import string
import re
import pickle

# Text mining 
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import stop_words
from sklearn.decomposition import TruncatedSVD

# Time 
from time import time


## Exploratory Data Analysis

In [None]:
def startTime():
    return time()
def endTime(s):
    print "Time elapsed {}".format(-s+time())


In [None]:
train_df = pd.read_csv('../../data/train.tsv', sep='\t')
#test = pd.read_csv('../../data/test.tsv', sep='\t')

In [None]:
# size of training and dataset
print(train_df.shape)

In [None]:
# different data types in the dataset: categorical (strings) and numeric
train_df.dtypes


In [None]:
# Overall summary of train data
train_df.describe()

In [None]:
# First few rows of the dataset 
train_df.head()

In [None]:
# Retain only part of the data 
n_samples = 1000
train_df = train_df.iloc[:n_samples,:]
print(train_df.shape)

In [30]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [31]:
stop = set(stopwords.words('english'))
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try: 
        regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        text = regex.sub(" ", text) # remove punctuation
        
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search('[a-zA-Z]', w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w)>=3]
        
        return filtered_tokens
            
    except TypeError as e: print(text,e)

In [73]:
n_features = 1000

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(min_df=10,
                             max_features=180000,
                             tokenizer=tokenize,
                             ngram_range=(1, 2))

t0 = startTime()
tfidf = tfidf_vectorizer.fit_transform(train_df['item_description'].apply(str))
endTime(t0)

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer_lda = CountVectorizer(min_df=4,max_features=180000,
                     tokenizer=tokenize,ngram_range=(1,2))
t0 = startTime()
tfidf_lda = tf_vectorizer_lda.fit_transform(train_df['item_description'].apply(str))
endTime(t0)

Extracting tf-idf features for NMF...
Time elapsed 0.772000074387
Extracting tf features for LDA...
Time elapsed 0.774000167847


In [113]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf_frob = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf_frob, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf_kld = NMF(n_components=n_components, random_state=1,
          beta_loss= 'kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf_kld, tfidf_feature_names, n_top_words)


print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

n_components = 10
n_top_words = 10
lda = LatentDirichletAllocation(n_components=n_components, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tfidf_lda)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=1000 and n_features=1000...
done in 0.175s.

Topics in NMF model (Frobenius norm):
Topic #0: description yet yet description last large know jeans jacket items item
Topic #1: brand brand new new new never sealed firm price firm price leggings new tags
Topic #2: size medium size medium like large size small full men black small
Topic #3: worn never worn never new never washed times tags material flaws excellent condition
Topic #4: free shipping free shipping smoke price home smoke free free home fast firm
Topic #5: condition great great condition good good condition times excellent worn excellent condition comes
Topic #6: used never used never new never gently used condition eye color opened set
Topic #7: cute super pink one black super cute small white top bundle
Topic #8: new like new like large new condition authentic bag white times package
Topic #9: box new box authentic original without comes set still color ca

In [69]:

tfidf_dict = dict(zip(tfidf_vectorizer.get_feature_names(), tfidf_vectorizer.idf_))

In [82]:
n_comp = 20
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(tfidf)
train_svd = pd.DataFrame(svd_obj.transform(tfidf))

In [83]:
train_svd.shape

(1000, 20)

In [114]:
lda_trans = lda.transform(tfidf_lda)
nmf_frob_trans = nmf_frob.transform(tfidf)
nmf_kld_trans = nmf_klb.transform(tfidf)

NameError: name 'nmf_klb' is not defined

In [None]:
plt.figure(figsize=(20, 15))
plt.scatter(df_train['tfidf'], df_train['price'])
plt.title('Train price X item_description TF-IDF', fontsize=15)
plt.xlabel('Price', fontsize=15)
plt.ylabel('TF-IDF', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize=15)
plt.show()

