In [1]:
import numpy as np
import pandas as pd

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords       # list of stopwords like articles, preposition

stop = set(stopwords.words('english'))


from string import punctuation
from collections import Counter

# Load the Mercari price-suggestion training set (tab-separated file).
# NOTE(review): this is an absolute, machine-specific path; change TRAIN_PATH
# when running the notebook anywhere else.
TRAIN_PATH = "/home/ec2-user/.kaggle/competitions/mercari-price-suggestion-challenge/train.tsv"
train = pd.read_csv(TRAIN_PATH, sep='\t')

print ("Number of rows are : {}\nNumber of columns are : {} ".format(train.shape[0], train.shape[1]))

# Column names, in file order.
col_list = list(train)

train.head(3)

# Number of missing values in every column.
train.isnull().sum()

# Boolean mask of the rows that have no brand_name; reused below.
missing_brand = train["brand_name"].isnull()

# Share of rows without a brand, as a whole percentage. The scaling to percent
# happens before rounding so the printed value cannot pick up float noise
# (e.g. 0.29 * 100 == 28.999999999999996).
print("We don't have any information regarding the {} % of brands!".format(round(missing_brand.mean() * 100, 0)))

# Median price of the items whose brand_name is missing: select the price of
# exactly those rows with a single .loc lookup, then take the median.
train.loc[missing_brand, "price"].median()

col_list

def lower_cap(row) :
    """Return ``row`` lower-cased if it has a ``lower`` method, else unchanged.

    Applied to text columns such as name and item_description so that later
    lookups are consistent regardless of capitalisation.

    Parameters
    ----------
    row : object
        A single cell value; normally a string, but may be a non-string
        value (for example a float NaN or None).

    Returns
    -------
    object
        ``row.lower()`` when available, otherwise ``row`` itself.
    """
    try :
        return row.lower()
    except AttributeError :
        # No .lower() on this value (NaN, number, None): leave it as it is.
        # Only AttributeError is caught so that KeyboardInterrupt and other
        # unrelated errors are not silently swallowed.
        return row
    
    

# Lower-case every text column in turn (same order as before) so that string
# comparisons on them do not depend on capitalisation.
for text_col in ("name", "item_description", "brand_name", "category_name") :
    train.loc[:, text_col] = train[text_col].apply(lower_cap)

In [13]:
# Build two brand lists from the brand_name column:
#   all_brands : every distinct brand name, lower-cased
#   pop_brands : brands occurring more than 500 times, plus "non_branded"
# (Original author's note: about 209 popular brands out of about 4210 in total.)

# Series mapping each brand name to how many rows carry it.
# value_counts() leaves out NaN, so missing brands are not counted here.
brand_counts = train.brand_name.value_counts()

all_brands = [brand.lower() for brand in brand_counts.index]

# Frequent brands only. "non_branded" is added as well so that items without
# any brand keep a bucket of their own.
pop_brands = [
    brand.lower()
    for brand in brand_counts[brand_counts > 500].index.tolist() + ["non_branded"]
]

# Description, name and price of the rows that have no brand at all.
train.loc[train["brand_name"].isnull(), ["item_description", "name", "price"]]

# At this point brand_name is either
#   a) a normal string, or
#   b) NaN.
# Part I: replace the NaNs with the string "non_branded".
train["brand_name"] = train["brand_name"].fillna("non_branded")

# To look at what a typical non-branded description contains:
# train["item_description"][train["brand_name"]=="non_branded"]

# Overall plan:
#   a) rows with no brand are labelled "non_branded";
#   b) the ~200 frequent brands in pop_brands are kept as they are;
#   brands outside pop_brands (frequency <= 500) are bucketed by brand_mapper.

In [17]:
def brand_mapper(name) :
    """Bucket a brand name.

    Returns ``name`` unchanged when it appears in ``pop_brands``; every other
    value is replaced by the string "low_brand".
    """
    return name if name in pop_brands else "low_brand"

# brand_bucket is a column holding, for each row, either its brand name when
# that name is in pop_brands (which includes "non_branded") or "low_brand".



train["brand_bucket"] = train["brand_name"].apply(brand_mapper)

def brand_mapper(name) :
    """Encode a brand bucket as an integer tier.

    Returns 0 for "non_branded", 1 for "low_brand" and 2 for any other value.

    NOTE(review): this definition re-uses the name of the earlier bucketing
    function, so from this point on ``brand_mapper`` refers to this version.
    """
    if name == "non_branded" :
        return 0
    if name == "low_brand" :
        return 1
    return 2

# Integer brand tier (0 = non_branded, 1 = low_brand, 2 = any other brand),
# derived from the bucketed column train["brand_bucket"]. The frame has no
# "brand_name1" column, so reading that name raised KeyError.
train["brand_name2"] = train["brand_bucket"].apply(brand_mapper)

def cat_split(cat) :
    """Split a "/"-separated category path into exactly three levels.

    Parameters
    ----------
    cat : object
        Category path such as "men/tops/t-shirts"; may also be a non-string
        value (for example NaN) when the category is missing.

    Returns
    -------
    tuple
        ``(primary, secondary, tertiary)``. When the path has more than three
        levels, everything after the second "/" is kept together in the third
        element rather than the whole path being discarded. When ``cat`` is
        not a string, or has fewer than three levels, the result is
        ``(nan, nan, nan)``.
    """
    try :
        # maxsplit=2 caps the result at three parts, so deeper paths still unpack.
        c1, c2, c3 = cat.split("/", 2)
    except (AttributeError, TypeError, ValueError) :
        # AttributeError/TypeError: value is not a text string (e.g. NaN).
        # ValueError: fewer than three levels, so the unpacking failed.
        return np.nan, np.nan, np.nan
    return c1, c2, c3
    
# Expand category_name into three columns. cat_split returns one 3-tuple per
# row; zip(*...) transposes that sequence of row tuples into three column-wise
# tuples, which are assigned to prime_cat, sec_cat and ter_cat respectively.
train["prime_cat"] , train["sec_cat"] , train["ter_cat"] = zip(*train["category_name"].apply(cat_split))

# Mean price per primary category, computed in one grouped pass instead of
# filtering the whole frame once per category. Rows whose prime_cat is NaN are
# not reported: NaN never compares equal to itself, so the per-category filter
# could only ever print "nan nan" for them.
for item, mean_price in train.groupby("prime_cat")["price"].mean().items() :
    print(item, mean_price)

import re

def tokenizer(text):            # text --> sent --> tokens
    """Turn a piece of text into a list of lower-cased word tokens.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    into words with ``word_tokenize``. A token is dropped when

    * its lower-cased form is in ``stop``,
    * it is a substring of the ``punctuation`` string, or
    * it contains no ASCII letter (numbers, symbols).

    The surviving tokens are returned lower-cased, in their original order.
    NOTE(review): pieces of contractions such as "'s" contain a letter and are
    therefore kept.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The filtered tokens. When ``text`` is not text (the tokenizers raise
        TypeError or AttributeError) the error is printed and an empty list is
        returned, so callers always get a list back.
    """
    try:
        filtered_tokens = []
        for sent in sent_tokenize(text):
            for token in word_tokenize(sent):
                lowered = token.lower()
                if lowered in stop:
                    continue
                if token in punctuation:
                    continue
                if not re.search('[a-zA-Z]', token):
                    continue
                filtered_tokens.append(lowered)
        return filtered_tokens
    except (TypeError, AttributeError) as e:
        # The handler used to name an undefined ``Error`` class, which turned
        # every failure into a NameError, and it returned None implicitly.
        print(e)
        return []

# Make every description a string. Missing descriptions are filled with ""
# first; a plain astype(str) would turn NaN into the literal text "nan", which
# would then be tokenized as if it were a real word.
train.loc[:,"item_description"] = train["item_description"].fillna("").astype(str)

# One list of tokens per description.
train["tokens"] = train["item_description"].map(tokenizer)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD      # To reduce the dimensions fater creating tfidf matrix
from sklearn.manifold import TSNE



import time

# TfidfVectorizer settings used below:
#   tokenizer    - the tokenizer function defined in this notebook
#   ngram_range  - (1, 2): single tokens and pairs of adjacent tokens
#   min_df       - minimum number of item descriptions that must contain a term
#   max_features - maximum number of distinct terms kept across all descriptions

start = time.time()

vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    ngram_range=(1, 2),
    min_df=500,
    max_features=10000,
)
vz = vectorizer.fit_transform(train['item_description'].tolist())

end = time.time()

print("It took {} seconds for learning.".format(end-start))

In [95]:
# vz.shape  -- output from the previous version:

(1482535, 10000)

In [107]:
vz.shape  # shape of the TF-IDF matrix produced by the new version

(1482535, 7418)

In [108]:
start = time.time()

# Vocabulary terms, in the same order as vectorizer.idf_.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out") :
    feature_names = vectorizer.get_feature_names_out()
else :
    feature_names = vectorizer.get_feature_names()

# One row per term, indexed by the term. The column keeps the name 'tfidf',
# although the values are the IDF weights taken from vectorizer.idf_.
tfidf = pd.DataFrame({'tfidf': vectorizer.idf_}, index=feature_names)

end = time.time()

print("It took {} seconds to execute this cell".format(end-start))

It took 0.02078986167907715 seconds to execute this cell


In [98]:
# old version : tfidf.sort_values(by=['tfidf'], ascending=True).head(30)


Unnamed: 0,tfidf
new,2.196085
size,2.34652
brand,2.780637
condition,2.811572
brand new,2.9079
free,2.949424
shipping,3.093035
worn,3.115678
used,3.175714
's,3.22758


In [109]:
# The 30 terms with the smallest weight in the 'tfidf' column (ascending sort).
tfidf.sort_values(by=['tfidf'], ascending=True).head(30)


Unnamed: 0,tfidf
new,2.196085
size,2.34652
brand,2.780637
condition,2.811572
brand new,2.9079
free,2.949424
shipping,3.093035
worn,3.115678
used,3.175714
's,3.22758


In [99]:
# The 30 terms with the largest weight in the 'tfidf' column (descending sort).
tfidf.sort_values(by=['tfidf'], ascending=False).head(30)



Unnamed: 0,tfidf
hidrocor,11.338064
pipes glass,10.564874
ultra-matte,10.527133
glass pipes,10.357234
weft,10.29661
temper glass,10.205318
glass hand,10.172312
rm pairs-,10.153019
beard,10.14667
pairs- rm,10.14036
