In [12]:
import nltk
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import wordnet as wn
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer


1. Modelos bag of words
2. Pre-processamento com lowercasing, stemming e remoção de caracteres não alfa-numericos

In [27]:
def correct_text(s):
    s = s.lower()
    s = s.replace(" in.","in.")
    s = s.replace(" inch","in.")
    s = s.replace("inch","in.")
    s = s.replace(" in ","in. ")
    s = s.replace(" ft ","ft. ")
    s = s.replace(" ft.","ft.")
    s = s.replace(" foot","ft.")
    s = s.replace(" feet","ft.")
    s = s.replace("foot","ft.")
    s = s.replace("feet","ft.")
    s = s.replace(" ft ","ft. ")
    s = s.replace(" gallon ","gal. ")
    s = s.replace("gallon","gal.")
    s = s.replace(" oz.","oz.")
    s = s.replace(" ounce","oz.")
    s = s.replace("ounce","oz.")
    s = s.replace(" oz ","oz. ")
    s = s.replace(" cm.","cm.")
    s = s.replace(" cm ","cm. ")
    s = s.replace('H x', 'height')
    s = s.replace('sq.', 'square')
    s = s.replace('cu.', 'cubic')
    s = s.replace('lbs.', 'pounds')
    s = s.replace('W x', 'width')
    s = s.replace('H x', 'height')
    s = s.replace('Ah ', 'amphere')
    
    return s

def clean_text(text):
    text = correct_text(text)
    # Remove '&nbsp;' from the text content before HTML tags strip off.
    text.replace('&nbsp;', ' ')
    # Remove HTML tags
    text = BeautifulSoup(text, "lxml").get_text(separator=" ")
    # Replace all punctuation and special characters by space
    text.replace("[ &<>)(_,.;:!?/-]+", " ")
    # Remove the apostrophe's
    text.replace("'s\\b", "")
    # Remove the apostrophe
    text.replace("[']+", "")
    # Remove the double quotes
    text.replace("[\"]+", "")
    # Convert to lower case, split into individual words
    words = text.lower().split()
    return( " ".join( words ))
 

In [28]:
train_data = pd.read_csv('../data/train.csv', encoding="ISO-8859-1")
test_data = pd.read_csv('../data/test.csv', encoding="ISO-8859-1")
attribute_data = pd.read_csv('../data/attributes.csv')
descriptions = pd.read_csv('../data/product_descriptions.csv')

train_data = pd.merge(train_data, descriptions, on="product_uid", how="left")
test_data = pd.merge(test_data, descriptions, on="product_uid", how="left")

product_count = pd.DataFrame(pd.Series(train_data.groupby(["product_uid"]).size(), name="product_count"))
product_count = pd.DataFrame(pd.Series(test_data.groupby(["product_uid"]).size(), name="product_count"))

In [29]:
english_sw = stopwords.words('english')

In [30]:
def clean(sentence):
    
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    tokens = tokenizer.tokenize(sentence.lower())
    tokens = [token for token in tokens if token not in english_sw]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [31]:
train_data['product_description'] = train_data.apply(lambda x: clean_text(x['product_description']),axis=1)
train_data['product_title'] = train_data.apply(lambda x: clean_text(x['product_title']),axis=1)
train_data['search_term'] = train_data.apply(lambda x: clean_text(x['search_term']),axis=1)

In [32]:
train_data['description_tokens'] = train_data.apply(lambda x: word_tokenize(x['product_description']), axis=1)
train_data['description_tokens_clean'] = train_data.apply(lambda x: clean(x['product_description']), axis=1)

In [33]:
train_data['title_tokens'] = train_data.apply(lambda x: word_tokenize(x['product_title']), axis=1)
train_data['title_tokens_clean'] = train_data.apply(lambda x: clean(x['product_title']), axis=1)

In [34]:
train_data['search_tokens'] = train_data.apply(lambda x: word_tokenize(x['search_term']), axis=1)
train_data['search_tokens_clean'] = train_data.apply(lambda x: clean(x['search_term']), axis=1)

In [35]:
train_data['n_tokens_desc'] = train_data.apply(lambda x: len(x['description_tokens']), axis=1)
train_data['n_tokens_title'] = train_data.apply(lambda x: len(x['title_tokens']), axis=1)
train_data['n_tokens_search'] = train_data.apply(lambda x: len(x['search_tokens']), axis=1)

In [13]:
train_data.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,search_tokens,search_tokens_clean,n_tokens_desc,n_tokens_title,n_tokens_search
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ...","[Not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[Simpson, Strong-Tie, 12-Gauge, Angle]","[simpson, strong, tie, 12, gaug, angl]","[angle, bracket]","[angl, bracket]",148,4,2
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ...","[Not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[Simpson, Strong-Tie, 12-Gauge, Angle]","[simpson, strong, tie, 12, gaug, angl]","[l, bracket]","[l, bracket]",148,4,2
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...,"[BEHR, Premium, Textured, DECKOVER, is, an, in...","[behr, premium, textur, deckov, innov, solid, ...","[BEHR, Premium, Textured, DeckOver, 1-gal, ., ...","[behr, premium, textur, deckov, 1, gal, sc, 14...","[deck, over]",[deck],196,13,2
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...,"[Update, your, bathroom, with, the, Delta, Ver...","[updat, bathroom, delta, vero, singl, handl, s...","[Delta, Vero, 1-Handle, Shower, Only, Faucet, ...","[delta, vero, 1, handl, shower, faucet, trim, ...","[rain, shower, head]","[rain, shower, head]",116,15,3
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...,"[Update, your, bathroom, with, the, Delta, Ver...","[updat, bathroom, delta, vero, singl, handl, s...","[Delta, Vero, 1-Handle, Shower, Only, Faucet, ...","[delta, vero, 1, handl, shower, faucet, trim, ...","[shower, only, faucet]","[shower, faucet]",116,15,3


In [14]:
#train_data['product_description']

### Features

1. feature_1 = is search term a substring in title?
2. feature_2 = is search term a substring in description?
3. feature_3 = proportion of terms of search in title - no stemm, no stopword removal
4. feature_4 = proportion of terms of search in title - stemm and stopword removal
5. feature_5 = proportion of terms of search in description - no stemm, no stopword removal
6. feature_6 = proportion of terms of search in description - stemm and stopword removal
7. feature_7 = length of search
8. feature_8 = length of description
9. feature_9 = length of title
7. features_10 to 43 = if word i in search is in description
7. features_44 to 77 = if word i in search is in title

#### Ideias

1. 1 feature para cada palavra de busca indicando se ela está no titulo ou descrição
2. Usando word2vec, distance entre palavras da busca e do titulo/descrição
3. Usando topic modeling: Probabilidade de busca conter o mesmo topico que titulo/descrição

In [36]:
train_data['feature_1'] = train_data.apply(lambda x: int(x['search_term'] in x['product_title']), axis=1)
train_data['feature_2'] = train_data.apply(lambda x: int(x['search_term'] in x['product_description']), axis=1)
train_data['feature_3'] = train_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['title_tokens']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_4'] = train_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['title_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_5'] = train_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['description_tokens']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_6'] = train_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['description_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_7'] = train_data.apply(lambda x: float(len(x['search_tokens'])), axis=1)
train_data['feature_8'] = train_data.apply(lambda x: float(len(x['description_tokens'])), axis=1)
train_data['feature_9'] = train_data.apply(lambda x: float(len(x['title_tokens'])), axis=1)

In [37]:
train_data.head(3)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,...,n_tokens_search,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,2,100001,simpson strong-tie 12-gauge angle,angle bracket,3.0,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,2,0,0,0.5,0.5,0.0,0.5,2,144,4
1,3,100001,simpson strong-tie 12-gauge angle,l bracket,2.5,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,2,0,0,0.0,0.0,0.0,0.0,2,144,4
2,9,100002,behr premium textured deckover 1-gal. #sc-141 ...,deck over,3.0,behr premium textured deckover is an innovativ...,"[behr, premium, textured, deckover, is, an, in...","[behr, premium, textur, deckov, innov, solid, ...","[behr, premium, textured, deckover, 1-gal, ., ...","[behr, premium, textur, deckov, 1, gal, sc, 14...",...,2,0,0,0.0,0.0,0.5,0.5,2,190,13


In [38]:
max([len(x) for x in train_data['search_tokens']])

14

In [39]:
def word_search(word_list, index, tokens):
    if len(word_list) < index + 1:
        return 0
    text = ''.join(tokens)
    return int(word_list[index] in text)

In [40]:
for index in range(0,14):
    train_data['feature_{}'.format(index+10)] = train_data.apply(lambda x: word_search(x['search_tokens'], index, x['title_tokens']), axis=1)

In [41]:
for index in range(0,14):
    train_data['feature_{}'.format(index+24)] = train_data.apply(lambda x: word_search(x['search_tokens'], index, x['description_tokens']), axis=1)

In [42]:
train_data

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,...,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37
0,2,100001,simpson strong-tie 12-gauge angle,angle bracket,3.00,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,0,0,0,0,0,0,0,0,0,0
1,3,100001,simpson strong-tie 12-gauge angle,l bracket,2.50,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,0,0,0,0,0,0,0,0,0,0
2,9,100002,behr premium textured deckover 1-gal. #sc-141 ...,deck over,3.00,behr premium textured deckover is an innovativ...,"[behr, premium, textured, deckover, is, an, in...","[behr, premium, textur, deckov, innov, solid, ...","[behr, premium, textured, deckover, 1-gal, ., ...","[behr, premium, textur, deckov, 1, gal, sc, 14...",...,0,0,0,0,0,0,0,0,0,0
3,16,100005,delta vero 1-handle shower only faucet trim ki...,rain shower head,2.33,update your bathroom with the delta vero singl...,"[update, your, bathroom, with, the, delta, ver...","[updat, bathroom, delta, vero, singl, handl, s...","[delta, vero, 1-handle, shower, only, faucet, ...","[delta, vero, 1, handl, shower, faucet, trim, ...",...,0,0,0,0,0,0,0,0,0,0
4,17,100005,delta vero 1-handle shower only faucet trim ki...,shower only faucet,2.67,update your bathroom with the delta vero singl...,"[update, your, bathroom, with, the, delta, ver...","[updat, bathroom, delta, vero, singl, handl, s...","[delta, vero, 1-handle, shower, only, faucet, ...","[delta, vero, 1, handl, shower, faucet, trim, ...",...,0,0,0,0,0,0,0,0,0,0
5,18,100006,whirlpool 1.9 cubicft. over the range convecti...,convection otr,3.00,achieving delicious results is almost effortle...,"[achieving, delicious, results, is, almost, ef...","[achiev, delici, result, almost, effortless, w...","[whirlpool, 1.9, cubicft, ., over, the, range,...","[whirlpool, 1, 9, cubicft, rang, convect, micr...",...,0,0,0,0,0,0,0,0,0,0
6,20,100006,whirlpool 1.9 cubicft. over the range convecti...,microwave over stove,2.67,achieving delicious results is almost effortle...,"[achieving, delicious, results, is, almost, ef...","[achiev, delici, result, almost, effortless, w...","[whirlpool, 1.9, cubicft, ., over, the, range,...","[whirlpool, 1, 9, cubicft, rang, convect, micr...",...,0,0,0,0,0,0,0,0,0,0
7,21,100006,whirlpool 1.9 cubicft. over the range convecti...,microwaves,3.00,achieving delicious results is almost effortle...,"[achieving, delicious, results, is, almost, ef...","[achiev, delici, result, almost, effortless, w...","[whirlpool, 1.9, cubicft, ., over, the, range,...","[whirlpool, 1, 9, cubicft, rang, convect, micr...",...,0,0,0,0,0,0,0,0,0,0
8,23,100007,lithonia lighting quantum 2-light black led em...,emergency light,2.67,the quantum adjustable 2-light led black emerg...,"[the, quantum, adjustable, 2-light, led, black...","[quantum, adjust, 2, light, led, black, emerg,...","[lithonia, lighting, quantum, 2-light, black, ...","[lithonia, light, quantum, 2, light, black, le...",...,0,0,0,0,0,0,0,0,0,0
9,27,100009,house of fara 3/4in. x 3in. x 8ft. mdf fluted ...,mdf 3/4,3.00,get the house of fara 3/4in. x 3in. x 8ft. mdf...,"[get, the, house, of, fara, 3/4in, ., x, 3in, ...","[get, hous, fara, 3, 4in, x, 3in, x, 8ft, mdf,...","[house, of, fara, 3/4in, ., x, 3in, ., x, 8ft,...","[hous, fara, 3, 4in, x, 3in, x, 8ft, mdf, flut...",...,0,0,0,0,0,0,0,0,0,0


In [44]:
train_data['feature_11']

0        0
1        0
2        1
3        1
4        1
5        0
6        1
7        0
8        1
9        1
10       1
11       0
12       1
13       1
14       1
15       0
16       1
17       0
18       0
19       0
20       0
21       0
22       0
23       1
24       0
25       0
26       0
27       0
28       0
29       0
        ..
74037    1
74038    1
74039    1
74040    1
74041    1
74042    0
74043    1
74044    0
74045    1
74046    0
74047    1
74048    1
74049    0
74050    1
74051    1
74052    1
74053    1
74054    1
74055    1
74056    0
74057    1
74058    1
74059    1
74060    0
74061    1
74062    0
74063    1
74064    0
74065    1
74066    1
Name: feature_11, dtype: int64