## Mercari Price Suggestion Challenge

The objective of this challenge is to build an algorithm that automatically suggests the right product prices on Mercari. The training data consists of user-inputted text descriptions of their products, including details like product category name, brand name, and item condition

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import string
import re
import pickle

# Text mining 
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import stop_words
from sklearn.decomposition import TruncatedSVD

# Time 
from time import time

#Plots
import matplotlib.pyplot as plt


In [2]:
def startTime():
    return time()
def endTime(s):
    print ("Time elapsed {}".format(time()-s))


In [3]:
df_train = pd.read_csv('../input/train.tsv', sep='\t')
df_test = pd.read_csv('../input/test.tsv', sep='\t')

# Retain only part of the data 
n_samples = 10000
df_train = df_train.iloc[:n_samples,:]
df_test = df_test.iloc[:n_samples,:]
print(df_train.shape)
print(df_test.shape)


(10000, 8)
(10000, 7)


## Text Mining : Tf-Idf, NMF, LDA

In [4]:
# Nulls in item description in train or test as tf-idf is not defined on nan
# lets drop these 4 items
df_train = df_train.loc[df_train.item_description == df_train.item_description]
df_train = df_train.loc[df_train.name == df_train.name]
print("Dropped records where item description was nan")

Dropped records where item description was nan


### Define Tokenizer Function

In [5]:
stop = set(stopwords.words('english'))
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try: 
        regex = re.compile('[' +re.escape(string.punctuation) + '0-9\\r\\t\\n]')
        text = regex.sub(" ", text) # remove punctuation
        
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search('[a-zA-Z]', w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w)>=3]
        
        return filtered_tokens
            
    except TypeError as e: print(text,e)

### TF-IDF feature extraction

In [6]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF and Normal TFID...")
tfidf_vectorizer = TfidfVectorizer(min_df=10,
                             max_features=180000,
                             tokenizer=tokenize,
                             ngram_range=(1, 2))

t0 = startTime()
full_tfidf = tfidf_vectorizer.fit_transform(df_train['item_description'].apply(str) + df_test['item_description'].apply(str))
train_tfidf = tfidf_vectorizer.transform(df_train['item_description'].apply(str))
test_tfidf = tfidf_vectorizer.transform(df_test['item_description'].apply(str))
endTime(t0)

Extracting tf-idf features for NMF and Normal TFID...
Time elapsed 11.950494050979614


### SVD on Tf-Idf features

In [7]:

n_comp = 25
print("SVD on TFID to get Latent Representation : k = {} ...".format(n_comp))
t0 = startTime()
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
endTime(t0)

train_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
test_svd.columns = ['svd_item_'+str(i) for i in range(n_comp)]
df_train = pd.concat([df_train, train_svd], axis=1)
df_test = pd.concat([df_test, test_svd], axis=1)


SVD on TFID to get Latent Representation : k = 25 ...
Time elapsed 1.0578930377960205


### LDA feature extraction

In [8]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tfidf_vectorizer_lda = CountVectorizer(min_df=4,max_features=180000,
                     tokenizer=tokenize,ngram_range=(1,2))
t0 = startTime()
full_tfidf_lda = tfidf_vectorizer_lda.fit_transform(df_train['item_description'].apply(str) + df_test['item_description'].apply(str))
train_tfidf_lda = tfidf_vectorizer_lda.transform(df_train['item_description'].apply(str))
test_tfidf_lda = tfidf_vectorizer_lda.transform(df_test['item_description'].apply(str))
endTime(t0)

Extracting tf features for LDA...
Time elapsed 12.171874523162842


* ### NMF - Frobenious Norm,Kullback-Leibler, Divergence, LDA

In [10]:
n_components = 10
n_top_words = 10

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d .."
      % (n_samples))
t0 = startTime()
nmf_frob = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(full_tfidf)
endTime(t0)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d .."
      % (n_samples))
t0 = time()
nmf_kld = NMF(n_components=n_components, random_state=1,
          beta_loss= 'kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(full_tfidf)
endTime(t0)

print("Fitting LDA models with tf features, "
      "n_samples=%d and .."
      % (n_samples))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = startTime()
lda.fit(full_tfidf_lda)
endTime(t0)


Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=10000 ..
Time elapsed 50.55987524986267
Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=10000 ..
Time elapsed 10.947026491165161
Fitting LDA models with tf features, n_samples=10000 and ..
Time elapsed 58.79438519477844


In [36]:
train_nmf_frob_df = pd.DataFrame(nmf_frob.transform(train_tfidf))
test_nmf_frob_df = pd.DataFrame(nmf_frob.transform(test_tfidf))

train_nmf_kld_df = pd.DataFrame(nmf_kld.transform(train_tfidf))
test_nmf_kld_df = pd.DataFrame(nmf_kld.transform(test_tfidf))

train_lda_df = pd.DataFrame(lda.transform(train_tfidf_lda))
test_lda_df = pd.DataFrame(lda.transform(test_tfidf_lda))


train_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
test_nmf_frob_df.columns = ['nmf_frob_'+str(i) for i in range(n_components)]
df_train = pd.concat([df_train, train_nmf_frob_df], axis=1)
df_test = pd.concat([df_test, test_nmf_frob_df], axis=1)

train_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
test_nmf_kld_df.columns = ['nmf_kld_'+str(i) for i in range(n_components)]
df_train = pd.concat([df_train, train_nmf_kld_df], axis=1)
df_test = pd.concat([df_test, test_nmf_kld_df], axis=1)

train_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
test_lda_df.columns = ['lda_'+str(i) for i in range(n_components)]
df_train = pd.concat([df_train, train_lda_df], axis=1)
df_test = pd.concat([df_test, test_lda_df], axis=1)



In [40]:
print(df_train.shape)
print(df_test.shape)

(10000, 73)
(10000, 72)


In [43]:
df_train.head(20)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,svd_item_0,svd_item_1,...,lda_0,lda_1,lda_2,lda_3,lda_4,lda_5,lda_6,lda_7,lda_8,lda_9
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,0.190975,0.9442,...,0.025,0.025,0.025001,0.025,0.025,0.025,0.025,0.774998,0.025,0.025
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,0.090106,-0.013857,...,0.004762,0.004764,0.004762,0.004762,0.004763,0.004762,0.004762,0.309089,0.652812,0.004762
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,0.077609,-0.017788,...,0.005556,0.005556,0.005556,0.005556,0.233878,0.005556,0.005556,0.16805,0.559179,0.005558
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,0.184471,-0.05003,...,0.004545,0.566862,0.004546,0.004547,0.004547,0.004545,0.004545,0.004547,0.337294,0.064021
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,0.008449,-0.001556,...,0.025,0.025006,0.025009,0.025023,0.025015,0.025,0.025,0.025007,0.774939,0.025
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma...",0.029567,-0.00852,...,0.00715,0.007144,0.007143,0.007143,0.007143,0.007143,0.007143,0.007143,0.935707,0.007143
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...,0.156045,-0.027283,...,0.009091,0.009091,0.009091,0.009091,0.009091,0.009093,0.009091,0.009091,0.918178,0.009091
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...,0.144615,-0.033441,...,0.002273,0.092516,0.209413,0.002273,0.002273,0.002273,0.002273,0.002273,0.68216,0.002273
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,Girls Size small Plus green. Three shorts total.,0.11637,-0.022463,...,0.009091,0.009091,0.009091,0.009091,0.009093,0.009091,0.009092,0.069258,0.85801,0.009091
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,I realized his pants are on backwards after th...,0.044541,-0.009439,...,0.003572,0.003573,0.003572,0.179279,0.31452,0.003574,0.003571,0.003573,0.481192,0.003574
