In [1]:
import nltk
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import wordnet as wn
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
from gensim.models import word2vec

1. Modelos bag of words
2. Pre-processamento com lowercasing, stemming e remoção de caracteres não alfa-numericos

In [2]:
def correct_text(s):
    s = s.lower()
    s = s.replace(" in.","in.")
    s = s.replace(" inch","in.")
    s = s.replace("inch","in.")
    s = s.replace(" in ","in. ")
    s = s.replace(" ft ","ft. ")
    s = s.replace(" ft.","ft.")
    s = s.replace(" foot","ft.")
    s = s.replace(" feet","ft.")
    s = s.replace("foot","ft.")
    s = s.replace("feet","ft.")
    s = s.replace(" ft ","ft. ")
    s = s.replace(" gallon ","gal. ")
    s = s.replace("gallon","gal.")
    s = s.replace(" oz.","oz.")
    s = s.replace(" ounce","oz.")
    s = s.replace("ounce","oz.")
    s = s.replace(" oz ","oz. ")
    s = s.replace(" cm.","cm.")
    s = s.replace(" cm ","cm. ")
    s = s.replace('H x', 'height')
    s = s.replace('sq.', 'square')
    s = s.replace('cu.', 'cubic')
    s = s.replace('lbs.', 'pounds')
    s = s.replace('W x', 'width')
    s = s.replace('H x', 'height')
    s = s.replace('Ah ', 'amphere')
    
    return s

def clean_text(text):
    text = correct_text(text)
    # Remove '&nbsp;' from the text content before HTML tags strip off.
    text.replace('&nbsp;', ' ')
    # Remove HTML tags
    text = BeautifulSoup(text, "lxml").get_text(separator=" ")
    # Replace all punctuation and special characters by space
    text.replace("[ &<>)(_,.;:!?/-]+", " ")
    # Remove the apostrophe's
    text.replace("'s\\b", "")
    # Remove the apostrophe
    text.replace("[']+", "")
    # Remove the double quotes
    text.replace("[\"]+", "")
    # Convert to lower case, split into individual words
    words = text.lower().split()
    return( " ".join( words ))
 

In [3]:
train_data = pd.read_csv('../data/train.csv', encoding="ISO-8859-1")
test_data = pd.read_csv('../data/test.csv', encoding="ISO-8859-1")
attribute_data = pd.read_csv('../data/attributes.csv')
descriptions = pd.read_csv('../data/product_descriptions.csv')

train_data = pd.merge(train_data, descriptions, on="product_uid", how="left")
test_data = pd.merge(test_data, descriptions, on="product_uid", how="left")

product_count = pd.DataFrame(pd.Series(train_data.groupby(["product_uid"]).size(), name="product_count"))
product_count = pd.DataFrame(pd.Series(test_data.groupby(["product_uid"]).size(), name="product_count"))

In [4]:
english_sw = stopwords.words('english')

In [5]:
def clean(sentence):
    
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = PorterStemmer()
    tokens = tokenizer.tokenize(sentence.lower())
    tokens = [token for token in tokens if token not in english_sw]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [6]:
train_data['product_description'] = train_data.apply(lambda x: clean_text(x['product_description']),axis=1)
train_data['product_title'] = train_data.apply(lambda x: clean_text(x['product_title']),axis=1)
train_data['search_term'] = train_data.apply(lambda x: clean_text(x['search_term']),axis=1)

In [7]:
train_data['description_tokens'] = train_data.apply(lambda x: word_tokenize(x['product_description']), axis=1)
train_data['description_tokens_clean'] = train_data.apply(lambda x: clean(x['product_description']), axis=1)

In [8]:
train_data['title_tokens'] = train_data.apply(lambda x: word_tokenize(x['product_title']), axis=1)
train_data['title_tokens_clean'] = train_data.apply(lambda x: clean(x['product_title']), axis=1)

In [9]:
train_data['search_tokens'] = train_data.apply(lambda x: word_tokenize(x['search_term']), axis=1)
train_data['search_tokens_clean'] = train_data.apply(lambda x: clean(x['search_term']), axis=1)

In [10]:
train_data['n_tokens_desc'] = train_data.apply(lambda x: len(x['description_tokens']), axis=1)
train_data['n_tokens_title'] = train_data.apply(lambda x: len(x['title_tokens']), axis=1)
train_data['n_tokens_search'] = train_data.apply(lambda x: len(x['search_tokens']), axis=1)

In [11]:
train_data.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,search_tokens,search_tokens_clean,n_tokens_desc,n_tokens_title,n_tokens_search
0,2,100001,simpson strong-tie 12-gauge angle,angle bracket,3.0,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]","[angle, bracket]","[angl, bracket]",144,4,2
1,3,100001,simpson strong-tie 12-gauge angle,l bracket,2.5,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]","[l, bracket]","[l, bracket]",144,4,2
2,9,100002,behr premium textured deckover 1-gal. #sc-141 ...,deck over,3.0,behr premium textured deckover is an innovativ...,"[behr, premium, textured, deckover, is, an, in...","[behr, premium, textur, deckov, innov, solid, ...","[behr, premium, textured, deckover, 1-gal, ., ...","[behr, premium, textur, deckov, 1, gal, sc, 14...","[deck, over]",[deck],190,13,2
3,16,100005,delta vero 1-handle shower only faucet trim ki...,rain shower head,2.33,update your bathroom with the delta vero singl...,"[update, your, bathroom, with, the, delta, ver...","[updat, bathroom, delta, vero, singl, handl, s...","[delta, vero, 1-handle, shower, only, faucet, ...","[delta, vero, 1, handl, shower, faucet, trim, ...","[rain, shower, head]","[rain, shower, head]",113,15,3
4,17,100005,delta vero 1-handle shower only faucet trim ki...,shower only faucet,2.67,update your bathroom with the delta vero singl...,"[update, your, bathroom, with, the, delta, ver...","[updat, bathroom, delta, vero, singl, handl, s...","[delta, vero, 1-handle, shower, only, faucet, ...","[delta, vero, 1, handl, shower, faucet, trim, ...","[shower, only, faucet]","[shower, faucet]",113,15,3


In [12]:
#train_data['product_description']

### Features

1. feature_1 = is search term a substring in title?
2. feature_2 = is search term a substring in description?
3. feature_3 = proportion of terms of search in title - no stemm, no stopword removal
4. feature_4 = proportion of terms of search in title - stemm and stopword removal
5. feature_5 = proportion of terms of search in description - no stemm, no stopword removal
6. feature_6 = proportion of terms of search in description - stemm and stopword removal
7. feature_7 = length of search
8. feature_8 = length of description
9. feature_9 = length of title
7. features_10 to 43 = if word i in search is in description
7. features_44 to 77 = if word i in search is in title

#### Ideias

1. 1 feature para cada palavra de busca indicando se ela está no titulo ou descrição
2. Usando word2vec, distance entre palavras da busca e do titulo/descrição
3. Usando topic modeling: Probabilidade de busca conter o mesmo topico que titulo/descrição

In [13]:
train_data['feature_1'] = train_data.apply(lambda x: int(x['search_term'] in x['product_title']), axis=1)
train_data['feature_2'] = train_data.apply(lambda x: int(x['search_term'] in x['product_description']), axis=1)
train_data['feature_3'] = train_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['title_tokens']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_4'] = train_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['title_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_5'] = train_data.apply(lambda x: float(len( set(x['search_tokens']).intersection(set(x['description_tokens']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_6'] = train_data.apply(lambda x: float(len( set(x['search_tokens_clean']).intersection(set(x['description_tokens_clean']))))/len( set(x['search_tokens'])) , axis=1)
train_data['feature_7'] = train_data.apply(lambda x: float(len(x['search_tokens'])), axis=1)
train_data['feature_8'] = train_data.apply(lambda x: float(len(x['description_tokens'])), axis=1)
train_data['feature_9'] = train_data.apply(lambda x: float(len(x['title_tokens'])), axis=1)

In [14]:
train_data.head(3)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description,description_tokens,description_tokens_clean,title_tokens,title_tokens_clean,...,n_tokens_search,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,2,100001,simpson strong-tie 12-gauge angle,angle bracket,3.0,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,2,0,0,0.5,0.5,0.0,0.5,2,144,4
1,3,100001,simpson strong-tie 12-gauge angle,l bracket,2.5,"not only do angles make joints stronger, they ...","[not, only, do, angles, make, joints, stronger...","[angl, make, joint, stronger, also, provid, co...","[simpson, strong-tie, 12-gauge, angle]","[simpson, strong, tie, 12, gaug, angl]",...,2,0,0,0.0,0.0,0.0,0.0,2,144,4
2,9,100002,behr premium textured deckover 1-gal. #sc-141 ...,deck over,3.0,behr premium textured deckover is an innovativ...,"[behr, premium, textured, deckover, is, an, in...","[behr, premium, textur, deckov, innov, solid, ...","[behr, premium, textured, deckover, 1-gal, ., ...","[behr, premium, textur, deckov, 1, gal, sc, 14...",...,2,0,0,0.0,0.0,0.5,0.5,2,190,13


In [15]:
max([len(x) for x in train_data['search_tokens']])

14

In [16]:
def word_search(word_list, index, tokens):
    if len(word_list) < index + 1:
        return 0
    text = ''.join(tokens)
    return int(word_list[index] in text)

In [17]:
for index in range(0,14):
    train_data['feature_{}'.format(index+10)] = train_data.apply(lambda x: word_search(x['search_tokens'], index, x['title_tokens']), axis=1)

In [18]:
for index in range(0,14):
    train_data['feature_{}'.format(index+24)] = train_data.apply(lambda x: word_search(x['search_tokens'], index, x['description_tokens']), axis=1)

In [19]:
train_data['feature_10'].head()

0    1
1    1
2    1
3    0
4    1
Name: feature_10, dtype: int64

In [21]:
descriptions = list(train_data.product_description)
title = list(train_data.product_title)


In [32]:
sentences = descriptions + title

In [33]:
sentences = [[word for word in sentence.lower().split()] for sentence in sentences]

In [34]:
from collections import defaultdict

frequency = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        frequency[token] += 1
        
sentences = [[token for token in sentence if frequency[token] > 1] for sentence in sentences]


In [35]:
model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

In [36]:
model.vocab

{u'fawn': <gensim.models.word2vec.Vocab at 0x7fcaa97638d0>,
 u'wooda': <gensim.models.word2vec.Vocab at 0x7fc9be6ac8d0>,
 u'double-pole,': <gensim.models.word2vec.Vocab at 0x7fc9be6ac510>,
 u'nano-polymer': <gensim.models.word2vec.Vocab at 0x7fc9be6ac4d0>,
 u'raining': <gensim.models.word2vec.Vocab at 0x7fc9bd29b7d0>,
 u'1,800': <gensim.models.word2vec.Vocab at 0x7fc9be6ac490>,
 u'wrought-iron': <gensim.models.word2vec.Vocab at 0x7fc9bd29b810>,
 u'polypropylenelatex': <gensim.models.word2vec.Vocab at 0x7fc9bd29b790>,
 u'woods': <gensim.models.word2vec.Vocab at 0x7fc9be6ac5d0>,
 u'spiders': <gensim.models.word2vec.Vocab at 0x7fc9be6ac610>,
 u'hanging': <gensim.models.word2vec.Vocab at 0x7fc9be6ac650>,
 u'outagesoxygen': <gensim.models.word2vec.Vocab at 0x7fc9be6ac710>,
 u'cellulose/cellulene': <gensim.models.word2vec.Vocab at 0x7fc9bd29b850>,
 u'localized': <gensim.models.word2vec.Vocab at 0x7fc9be6ac750>,
 u'11in.suitable': <gensim.models.word2vec.Vocab at 0x7fc9bccb5f50>,
 u'canes': <