In [1]:
import nltk
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

1. Modelos bag of words
2. Pré-processamento com lowercasing, stemming e remoção de caracteres não alfanuméricos

In [2]:
# Load the raw competition files. train/test are read as ISO-8859-1; the other
# two files use the pandas default encoding.
train_data = pd.read_csv('../data/train.csv', encoding="ISO-8859-1")
test_data = pd.read_csv('../data/test.csv', encoding="ISO-8859-1")
attribute_data = pd.read_csv('../data/attributes.csv')
descriptions = pd.read_csv('../data/product_descriptions.csv')

# Attach the product description to every (search term, product) row.
# how="left" keeps every train/test row even if a product has no description.
train_data = pd.merge(train_data, descriptions, on="product_uid", how="left")
test_data = pd.merge(test_data, descriptions, on="product_uid", how="left")

# Number of rows per product, kept separately for each split. The original cell
# assigned both results to the same name, so the train counts were discarded
# as soon as they were computed.
train_product_count = train_data.groupby(["product_uid"]).size().to_frame("product_count")
test_product_count = test_data.groupby(["product_uid"]).size().to_frame("product_count")

# Backward compatibility: `product_count` previously ended up holding the test
# counts (the second assignment won), so keep that binding unchanged.
product_count = test_product_count

In [3]:
# Preview the first rows of the training frame after the description merge.
train_data.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...


In [4]:
# English stopwords used by `clean`. Stored as a set because membership is
# tested once per token; a list would be scanned linearly on every lookup.
english_sw = set(stopwords.words('english'))

In [5]:
# Built once and reused: `clean` is applied to every row of several columns,
# so constructing these objects inside the function repeated the same work
# on every call.
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')
_STEMMER = PorterStemmer()


def clean(sentence):
    """Normalize a piece of text into a list of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. split it into runs of word characters (regex ``\\w+``), which drops
         punctuation and splits hyphenated words such as "12-gauge";
      3. drop tokens found in ``english_sw``;
      4. stem each remaining token with the Porter stemmer.

    Parameters
    ----------
    sentence : str
        Raw text (product title, description or search term).

    Returns
    -------
    list
        Stemmed tokens in their original order; may be empty if every token
        is a stopword.
    """
    tokens = _WORD_TOKENIZER.tokenize(sentence.lower())
    # Stopwords are filtered before stemming, so the comparison is made
    # against the unstemmed lowercase token.
    return [_STEMMER.stem(token) for token in tokens if token not in english_sw]

In [6]:
# Tokenize each product description two ways: raw NLTK word tokens, and the
# normalized (lowercased, stopword-free, stemmed) tokens produced by `clean`.
# Series.apply works on the single column directly instead of building a full
# row object for every record as DataFrame.apply(axis=1) does.
train_data['description_tokens'] = train_data['product_description'].apply(word_tokenize)
train_data['description_tokens_clean'] = train_data['product_description'].apply(clean)

In [7]:
# Tokenize each product title two ways: raw NLTK word tokens, and the
# normalized tokens produced by `clean`. Series.apply avoids the per-row
# overhead of DataFrame.apply(axis=1) when only one column is needed.
train_data['title_tokens'] = train_data['product_title'].apply(word_tokenize)
train_data['title_tokens_clean'] = train_data['product_title'].apply(clean)

In [8]:
# Tokenize each search term two ways: raw NLTK word tokens, and the
# normalized tokens produced by `clean`. Series.apply avoids the per-row
# overhead of DataFrame.apply(axis=1) when only one column is needed.
train_data['search_tokens'] = train_data['search_term'].apply(word_tokenize)
train_data['search_tokens_clean'] = train_data['search_term'].apply(clean)

In [9]:
# Number of raw (unnormalized) tokens in each text field. `len` is applied to
# each token list directly rather than through a row-wise DataFrame.apply.
train_data['n_tokens_desc'] = train_data['description_tokens'].apply(len)
train_data['n_tokens_title'] = train_data['title_tokens'].apply(len)
train_data['n_tokens_search'] = train_data['search_tokens'].apply(len)

In [16]:
#train_data.apply(lambda x: tokenizer.tokenize(x['product_description'].lower()),axis=1)

In [30]:
# Preview the training frame again to check the token and token-count columns.
train_data.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,title_tokens,search_tokens,n_tokens_desc,n_tokens_title,n_tokens_search,description_tokens_clean,title_tokens_clean,search_tokens_clean
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ...","[Not, only, do, angles, make, joints, stronger...","[Simpson, Strong-Tie, 12-Gauge, Angle]","[angle, bracket]",93,6,2,"[angl, make, joint, stronger, also, provid, co...","[simpson, strong, tie, 12, gaug, angl]","[angl, bracket]"
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ...","[Not, only, do, angles, make, joints, stronger...","[Simpson, Strong-Tie, 12-Gauge, Angle]","[l, bracket]",93,6,2,"[angl, make, joint, stronger, also, provid, co...","[simpson, strong, tie, 12, gaug, angl]","[l, bracket]"
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...,"[BEHR, Premium, Textured, DECKOVER, is, an, in...","[BEHR, Premium, Textured, DeckOver, 1-gal, ., ...","[deck, over]",121,12,1,"[behr, premium, textur, deckov, innov, solid, ...","[behr, premium, textur, deckov, 1, gal, sc, 14...",[deck]
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...,"[Update, your, bathroom, with, the, Delta, Ver...","[Delta, Vero, 1-Handle, Shower, Only, Faucet, ...","[rain, shower, head]",70,11,3,"[updat, bathroom, delta, vero, singl, handl, s...","[delta, vero, 1, handl, shower, faucet, trim, ...","[rain, shower, head]"
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...,"[Update, your, bathroom, with, the, Delta, Ver...","[Delta, Vero, 1-Handle, Shower, Only, Faucet, ...","[shower, only, faucet]",70,11,2,"[updat, bathroom, delta, vero, singl, handl, s...","[delta, vero, 1, handl, shower, faucet, trim, ...","[shower, faucet]"


In [29]:
#train_data['product_description']

### Features

1. feature_1 = is search term a substring in title?
2. feature_2 = is search term a substring in description?
3. feature_3 = proportion of search terms found in the title (no stemming, no stopword removal)
4. feature_4 = proportion of search terms found in the title (with stemming and stopword removal)
5. feature_5 = proportion of search terms found in the description (no stemming, no stopword removal)
6. feature_6 = proportion of search terms found in the description (with stemming and stopword removal)
7. feature_7 = length of search
8. feature_8 = length of description
9. feature_9 = length of title
10. feature_10 to feature_43 = whether the i-th word of the search is in the description
11. feature_44 to feature_77 = whether the i-th word of the search is in the title

#### Ideias

1. Uma feature para cada palavra de busca, indicando se ela está no título ou na descrição
2. Usando word2vec: distância entre as palavras da busca e as do título/descrição
3. Usando topic modeling: probabilidade de a busca conter o mesmo tópico que o título/descrição

In [27]:
def _overlap_ratio(query_tokens, target_tokens):
    """Return the fraction of distinct query tokens that also occur in target_tokens.

    Returns 0.0 when the query has no tokens (e.g. a search made only of
    stopwords after cleaning), so such rows do not raise ZeroDivisionError.
    """
    query = set(query_tokens)
    if not query:
        return 0.0
    # float() keeps true division on Python 2 as well.
    return float(len(query.intersection(target_tokens))) / len(query)


# feature_1 / feature_2: exact, case-sensitive substring match of the whole
# search term in the title / description.
train_data['feature_1'] = train_data.apply(lambda row: int(row['search_term'] in row['product_title']), axis=1)
train_data['feature_2'] = train_data.apply(lambda row: int(row['search_term'] in row['product_description']), axis=1)

# feature_3 / feature_5: share of raw search tokens present in the raw
# title / description tokens.
train_data['feature_3'] = train_data.apply(lambda row: _overlap_ratio(row['search_tokens'], row['title_tokens']), axis=1)
train_data['feature_5'] = train_data.apply(lambda row: _overlap_ratio(row['search_tokens'], row['description_tokens']), axis=1)

# feature_4 / feature_6: same ratio on the cleaned tokens. The denominator is
# the number of distinct *cleaned* search tokens; the earlier version divided
# by the raw token count, so a search like "deck over" (cleaned to ["deck"])
# could never score above 0.5 even on a perfect match.
train_data['feature_4'] = train_data.apply(lambda row: _overlap_ratio(row['search_tokens_clean'], row['title_tokens_clean']), axis=1)
train_data['feature_6'] = train_data.apply(lambda row: _overlap_ratio(row['search_tokens_clean'], row['description_tokens_clean']), axis=1)

# feature_7..9: raw token counts of search, description and title, as floats.
train_data['feature_7'] = train_data['search_tokens'].apply(len).astype(float)
train_data['feature_8'] = train_data['description_tokens'].apply(len).astype(float)
train_data['feature_9'] = train_data['title_tokens'].apply(len).astype(float)

In [30]:
def word_search(word_list, i, text):
    """Tell whether the word at position ``i`` of ``word_list`` occurs in ``text``.

    Returns 1 when ``word_list[i]`` is contained in ``text`` (using the ``in``
    operator, so a substring test when ``text`` is a string), 0 when it is not,
    and 0 when ``word_list`` has no element at position ``i``.
    """
    position_exists = i < len(word_list)
    return int(word_list[i] in text) if position_exists else 0

In [28]:
# feature_10 .. feature_43: one flag per search-token position (0..33), set
# to 1 when that token occurs in the raw product description.
# `range` replaces `xrange`, which does not exist on Python 3; `range` works
# on both major versions.
for i in range(34):
    train_data['feature_{0}'.format(10 + i)] = train_data.apply(lambda x: word_search(x['search_tokens'], i, x['product_description']), axis=1)
    

In [29]:
# feature_44 .. feature_77: one flag per search-token position (0..33), set
# to 1 when that token occurs in the raw product title.
# `range` replaces `xrange`, which does not exist on Python 3; `range` works
# on both major versions.
for i in range(34):
    train_data['feature_{0}'.format(44 + i)] = train_data.apply(lambda x: word_search(x['search_tokens'], i, x['product_title']), axis=1)
    

In [31]:
# Show the last 10 rows to spot-check the per-word feature columns.
train_data.tail(10)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,...,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_41,feature_76,feature_77
74057,221434,206621,Crown Bolt M6-32 x 90 mm. Internal Hex Socket ...,m6 screw 90mm,2.67,Crown Bolt's metric socket cap screws are idea...,"[Crown, Bolt, 's, metric, socket, cap, screws,...","[crown, bolt, metric, socket, cap, screw, idea...","[Crown, Bolt, M6-32, x, 90, mm, ., Internal, H...","[crown, bolt, m6, 32, x, 90, mm, intern, hex, ...",...,0,0,0,0,0,0,0,0,0,0
74058,221443,206627,25 in. Stainless Tip-Out Sink Front Tray,sink tip-out tray,3.0,Add a little luxury to otherwise wasted space ...,"[Add, a, little, luxury, to, otherwise, wasted...","[add, littl, luxuri, otherwis, wast, space, st...","[25, in, ., Stainless, Tip-Out, Sink, Front, T...","[25, stainless, tip, sink, front, tray]",...,0,0,0,0,0,0,0,0,0,0
74059,221449,206631,Masonite New Haven Three Quarter Oval Lite Pri...,fiberglass front doors by masonite,3.0,"Create an inviting, stylish entryway with the ...","[Create, an, inviting, ,, stylish, entryway, w...","[creat, invit, stylish, entryway, masonit, new...","[Masonite, New, Haven, Three, Quarter, Oval, L...","[masonit, new, haven, three, quarter, oval, li...",...,0,0,0,0,0,0,0,0,0,0
74060,221450,206632,Lilly Miller UltraGreen 1 Gal. Vitamin B-1 Pla...,starter fertillzer,2.0,Lilly Miller UltraGreen Vitamin B-1 Plant Star...,"[Lilly, Miller, UltraGreen, Vitamin, B-1, Plan...","[lilli, miller, ultragreen, vitamin, b, 1, pla...","[Lilly, Miller, UltraGreen, 1, Gal, ., Vitamin...","[lilli, miller, ultragreen, 1, gal, vitamin, b...",...,0,0,0,0,0,0,0,0,0,0
74061,221455,206637,Schluter Rondec Stainless Steel 3/8 in. x 1 in...,rondec stainless steel 3/8 edge protection,3.0,The EV/RO100E is a prefabricated corner for Ro...,"[The, EV/RO100E, is, a, prefabricated, corner,...","[ev, ro100, prefabr, corner, rondec, elimin, n...","[Schluter, Rondec, Stainless, Steel, 3/8, in, ...","[schluter, rondec, stainless, steel, 3, 8, x, ...",...,0,0,0,0,0,0,0,0,0,0
74062,221457,206638,Atlantic Windowpane 576 CD or 192 DVD Blu-Ray ...,tv riser glass,1.0,"Atlantic, Inc. 94835722 Uniquely designed for ...","[Atlantic, ,, Inc., 94835722, Uniquely, design...","[atlant, inc, 94835722, uniqu, design, maximum...","[Atlantic, Windowpane, 576, CD, or, 192, DVD, ...","[atlant, windowpan, 576, cd, 192, dvd, blu, ra...",...,0,0,0,0,0,0,0,0,0,0
74063,221458,206639,Philips 40-Watt Halogen R20 Flood Light Bulb (...,r20 halogen light,3.0,Philips Energy Advantage lamps use less energy...,"[Philips, Energy, Advantage, lamps, use, less,...","[philip, energi, advantag, lamp, use, less, en...","[Philips, 40-Watt, Halogen, R20, Flood, Light,...","[philip, 40, watt, halogen, r20, flood, light,...",...,0,0,0,0,0,0,0,0,0,0
74064,221463,206641,Schlage Camelot In-Active Aged Bronze Handlese...,schlage lock siena half dummy knob with,2.33,The Schlage Camelot In-Active Aged Bronze Hand...,"[The, Schlage, Camelot, In-Active, Aged, Bronz...","[schlage, camelot, activ, age, bronz, handlese...","[Schlage, Camelot, In-Active, Aged, Bronze, Ha...","[schlage, camelot, activ, age, bronz, handlese...",...,0,0,0,0,0,0,0,0,0,0
74065,221471,206648,Plastec 11 in. x 24 in. Rose Garden Wall Decor...,zen garden decor,3.0,The Rose Garden is inspired by the popular ear...,"[The, Rose, Garden, is, inspired, by, the, pop...","[rose, garden, inspir, popular, earli, 20th, c...","[Plastec, 11, in, ., x, 24, in, ., Rose, Garde...","[plastec, 11, x, 24, rose, garden, wall, decor...",...,0,0,0,0,0,0,0,0,0,0
74066,221473,206650,LICHTENBERG Pool Blue No. 918 Millennial Ryan ...,fine sheer curtain 63 inches,2.33,No. 918 Millennial Ryan heathered texture semi...,"[No, ., 918, Millennial, Ryan, heathered, text...","[918, millenni, ryan, heather, textur, semi, s...","[LICHTENBERG, Pool, Blue, No, ., 918, Millenni...","[lichtenberg, pool, blue, 918, millenni, ryan,...",...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Inspect the rows whose tokenized title has exactly 34 tokens.
# NOTE(review): 34 is also the bound of the per-word feature loops; presumably
# it was chosen as a maximum token count — confirm which column it came from.
train_data[train_data['n_tokens_title'] == 34]

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,title_tokens,search_tokens,n_tokens_desc,...,n_tokens_search,description_tokens_clean,title_tokens_clean,search_tokens_clean,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
26273,80600,123547,Lucky Dog 4 ft. H x 5 ft. W x 10 ft. L or 4 ft...,4 ftawning frame,2,Our Lucky Dog Box chain link dog kennel is con...,"[Our, Lucky, Dog, Box, chain, link, dog, kenne...","[Lucky, Dog, 4, ft., H, x, 5, ft., W, x, 10, f...","[4, ftawning, frame]",102,...,3,"[lucki, dog, box, chain, link, dog, kennel, co...","[lucki, dog, 4, ft, h, x, 5, ft, w, x, 10, ft,...","[4, ftawn, frame]",False,False,0.333333,0.666667,0.333333,0.333333


In [32]:
# Keep the identifier columns plus every engineered feature, feature_1 .. feature_77.
# The upper bound of range is exclusive, so it must be 78 to include
# feature_77; the previous bound of 77 silently dropped that column.
# `range` replaces the Python-2-only `xrange`.
features_df = train_data[['id', 'product_uid'] + ['feature_{0}'.format(i) for i in range(1, 78)]]

In [34]:
# Write the feature table to the current working directory. index=False is not
# passed, so the DataFrame index is written as the first, unnamed CSV column.
features_df.to_csv('bag_of_word_features.csv')